windows-nt-4.0/private/windows/win4help/ftsrch/txdbase.h

// TxDBase.h -- CTextDatabase class definition

#ifndef __TXDBASE_H__

#define __TXDBASE_H__

#include "SegHash.h"
#include "VMBuffer.h"
#include "Indicate.h"
#include "Classify.h"
#include  "Defines.h"
#include  "TextMat.h"
#include "UnbuffIO.h"
#include   "IOList.h"
#include "IOStream.h"
#include "FTSIFace.h"
#include "Compress.h"
#include     "Util.h"
#include  "Sorting.h"
#include     "dict.h"
#include   "vector.h"

// !!! BugBug !!! Parts of the long comment below are now incorrect and need revision.

//     Term Tags Strategies
//
// The term tag data structures go with term entries in the segmented
// hash tables. We use two hash tables - the Global table and the Galactic
// table. The term tags for the two tables are designed for the actions we
// take with the hash table. 
//
// Our index work proceeds in four phases --
//
// 1. Constructing Local Dictionaries
//    During this phase we build a local term dictionary with very restricted
//    context. The dictionary strictly covers a range of up to 65536 tokens.
//    The local dictionary is an unsegmented hash table biased for speed and very
//    low collision rates.
//
// 2. Linking Local Dictionaries with the Global Dictionary
//    When a local dictionary reaches a capacity limit or when we must force
//    our text database to a searchable state, we link local dictionary entries
//    with corresponding global hash table entries. This has two effects --
//
//    A. We merge the local terms with the global terms, adding new unique terms
//       to the global list.
//
//    B. We now have a global linked list which traverses all the references for
//       each term. This was the original searchable format for the database. It
//       works well so long as the database fits entirely within RAM and degrades
//       when our working set significantly exceeds RAM space. The current code 
//       doesn't construct these global links, but relies instead on the flattening
//       phase below.
//
// 3. Flattening Linked Lists
//    When the collection of reference links in the local dictionaries reaches
//    a memory size threshold, we traverse the linked lists and construct a
//    collection of flattened vectors of reference indices. At the same time we
//    compress the streams of reference indices. The compression algorithm relies
//    on three fields maintained for each term in the term tag --
//
//    A. iNewRefFirst -- the index of the first instance in the linked stream
//    B. iNewRefLast  -- the index of the last  instance in the linked stream
//    C. cRefsNew     -- the number of instances in the linked stream
//
//    To merge new flattened vectors with previously accumulated vectors we 
//    maintain four additional term tag fields --
//
//    B. iRefListBase -- index to the stream of flattened references for this term.
//    C. cdwRefs      -- size of the flattened reference lists in DWords.
//    D. iRefSequence -- ranking order of this term relative to the galactic table
//                       A -1 value indicates no ranking.
//
//    Note that we simply catenate the compressed reference lists from sequential
//    flattening passes. Thus the stream denoted by iRefListBase has the format --
//
//        {cdw, cRefsStream, iRefFirst, <<basis>, <compressed ref>...>} ...
//
//    where --
//  
//      cdw              is the size of the reference list segment in DWords
//      cRefsStream      is the number of references in the list.
//      iRefFirst        is the first reference in the list
//      <basis>          is a five-bit value which drives the compression
//                       algorithm.
//      <compressed ref> values are variable length bit strings which
//                       represent the delta between successive reference
//                       indices.
//
//    Note that cdw could be derived from cRefsStream and <basis>. We include
//    it in the reference stream to allow a fast traversal of the stream when
//    we're looking for a particular indexing range.
//  
//    The iRefSequence field is an ordering value maintained in the galactic hash 
//    table for each unique term. It's used to speed up the process of merging new
//    reference vector segments with previously accumulated segments. The strategy
//    is to keep the reference segments in an incrementally committed memory address
//    range and to insert segments by copying reference streams upward in memory, 
//    inserting the new segments as we go. The iRefSequence field gives us the memory 
//    order for the reference streams.
//   
//    Note that for terms with less than four references we keep the reference
//    information entirely in the term tag. That situation is denoted by negative
//    values in cRefsTotal, iRefListBase, and cdwRefs. Zero values are used to
//    mark the one and two cases. The actual index values are the logical
//    negation (~) of those fields in the order mentioned above. When a new vector
//    segment would push the global total beyond three, we merge the old vector 
//    with the new one and create an external list. 
//    
//    Note that the iRefSequence field may be undefined  (-1) or defined >= 0 for
//    a term with less than four references. This is because the galactic table
//    may have other references that push us over the limit.
//   
//    Why do we bother with this complicated scheme? Its value lies in reducing
//    the number of items in the reference stream. During the merge work this
//    reduces the number of items that must be slid upward in memory and it
//    reduces the number of iRefListBase fields that must be adjusted during
//    the merge operation. Approximately 45% of all unique terms are used only
//    once. By keeping small lists in the tag, we reduce the number of external 
//    lists by 75%.
//   
// 4. Galactic Merges
//    When the global table reaches a memory size threshold, we merge its reference
//    information with the galactic hash table and restart our indexing work with
//    an empty global table. The issue here is keeping the global table small enough
//    so that it fits completely within RAM during phase 2 work.
// 
//    The galactic term tags contain only the accumulation fields --
//
//    B. iRefListBase -- index to the stream of flattened references for this term.
//    C. cdwRefs      -- size of the flattened reference lists in DWords.
//    D. iRefSequence -- ranking order of this term relative to the galactic table.
//                       A -1 value indicates no ranking.

// Term tag carried by each entry of the Global hash table.
typedef struct _TermTagGlobal
{
    UINT iGlobalDesc;     // Global   sequence # for term.
    UINT iGalacticDesc;   // Galactic sequence # for term.

    // Fields retired from the tag (kept here for reference only):
    //     UINT iNewRefFirst;   // First linked global ref.
    //     UINT iNewRefLast;    // Last  linked global ref.

    UINT cRefsNew;        // # of linked global refs.
    UINT cRefsGlobal;     // NOTE(review): presumably the running global
                          // reference count for the term -- confirm.

} TermTagGlobal, *PTermTagGlobal;

// Term tag carried by each entry of the Galactic hash table.
typedef struct _TermTagGalactic
{
    UINT iGalacticDesc;   // Galactic sequence # for term.

} TermTagGalactic, *PTermTagGalactic;

// Per-token descriptor. Records where the token's display image and sort-key
// image begin, one context-dependent count/index, and a few small attributes.
// Image lengths are not stored: they are the distance to the same field in
// the following descriptor (see CbImage and CwDisplay).
typedef struct _DESCRIPTOR
{
    PWCHAR pwDisplay;        // Display image (pbImage below is the sort key).

    union
    {
        PWCHAR pbImage;      // Sort key. Length given by delta with the
                             // following descriptor's pbImage.
        UINT   iGalactic;
    };

    union
    {
        UINT cReferences;    // Used while building a CTextDatabase
        UINT iTokenInfo;     // Used in CTokenCollection
        UINT iTextSet;       // Used in CTitleCollection
    };

    WORD cwDisplay;
    BYTE bCharset;
    BYTE fImageFlags;

} DESCRIPTOR, *PDESCRIPTOR;

// Returns the extent of a descriptor's sort-key image, measured as the
// distance from pd->pbImage to the pbImage of the descriptor that follows it.
// The descriptor after pd is dereferenced, so pd must have a valid successor.
// Since pbImage is a PWCHAR, the distance is counted in WCHAR elements.
inline UINT CbImage(PDESCRIPTOR pd)
{
    PDESCRIPTOR pdFollowing= pd + 1;

#ifdef MESSAGEBOXES

    // Diagnostic build: announce unusually long tokens in message boxes.

    if ((pdFollowing->pbImage - pd->pbImage) > 256)
    {
        char ac[256], acToken[101];

        wsprintf(ac, "Token length: %d", (pdFollowing->pbImage - pd->pbImage));

        ::MessageBox(NULL, ac, "Very Large Token!", MB_OK);

        // Show the leading 50 bytes of the image, zero-terminated.

        CopyMemory(acToken, pd->pbImage, 50);

        acToken[50]= 0;

        wsprintf(ac, "Token Image: \"%s...\"", acToken);

        ::MessageBox(NULL, ac, "Part of the token image!", MB_OK);
    }

#else // MESSAGEBOXES

    ASSERT((pdFollowing->pbImage - pd->pbImage) < 1024);

#endif // MESSAGEBOXES

    return pdFollowing->pbImage - pd->pbImage;
}

// Returns the length, in WCHAR elements, of a descriptor's display image:
// the distance from pd->pwDisplay to the pwDisplay of the next descriptor.
// The descriptor after pd is dereferenced, so pd must have a valid successor.
inline UINT CwDisplay(PDESCRIPTOR pd)
{
    PDESCRIPTOR pdFollowing= pd + 1;

    ASSERT((pdFollowing->pwDisplay - pd->pwDisplay) < 1024);

    return pdFollowing->pwDisplay - pd->pwDisplay;
}

// Flag definitions for DESCRIPTOR.fImageFlags:

// #define LETTER_CHAR       0x0001
// #define CONTAINS_A_TAB    0x0002 
// #define TOKEN_FLAGS_MASK  0x0003
// #define REF_TYPE_MASK     0x000C
// #define BASIS_MASK        0xF800
// #define REFS_LINKED       0x0010

// #define BASIS_SHIFT       11

// Reference types for REF_TYPE_MASK:

// #define SingleRef
// #define PairRef
// #define TripleRef

// Token helper routines; their definitions live outside this header.

UINT CBitsToRepresent(UINT ui);

UINT FormatAToken(PDESCRIPTOR pd,
                  int         cbOffset,
                  int         iColStart,
                  int         iColLimit,
                  PWCHAR      pbLine
                 );

void SortTokenImages(PDESCRIPTOR   pdBase,
                     PDESCRIPTOR **pppdSorted,
                     PDESCRIPTOR **pppdTailSorted,
                     PUINT         pcdSorted,
                     UINT          cd
                    );


// #define  BUILD_LOCAL_HASH(hv,c)  hv= ((hv << 5) | (hv >> 27)) - c
// #define BUILD_GLOBAL_HASH(hv,c)  hv= ((hv >> 5) | (hv << 27)) - c

// One token reference while references are still held in linked form.
typedef struct _LocalToken
{
    unsigned short iLocalDescriptorEntry;  // Entry in the local dictionary
                                           // this token refers to.
    unsigned short iLocalReferenceNext;    // Link to the next reference to
                                           // the same term (zero initially).

} LocalToken, *PLocalToken;

// Descriptor reference tokens are processed in three phases. Tokens are 
// initially created with iLocalDescriptorEntry set and iLocalReferenceNext 
// zeroed.
//
// Later when we bind a local dictionary to the global dictionary, the
// iLocalReferenceNext field is used to link together every instance of
// each unique term in the local dictionary.
//
// Finally when we reach a specific memory limit, we flatten the linked lists
// for all local dictionaries to create a vector of reference indices for
// each unique term in the global dictionary. At this point we also map the
// LocalToken structure shown above into GlobalToken values (See below). 
// 
// A GlobalToken is a 16-bit value which refers uniquely to a particular
// global DESCRIPTOR. Since we can easily have more than 64K unique global
// terms, we provide an indirection mechanism which maps some 16-bit values
// into 32-bit values. 
// 
// Here's how it works. We divide GlobalToken values into two ranges.
// Values between 0..58,983 (MAX_GLOBAL_TOKENS - 1) are absolute indices into
// the global vector of unique DESCRIPTORs. Values between 58,984 and 65,535
// are mapped to 32-bit via a local indirection vector of 32-bit indices.

// 16-bit reference to a global DESCRIPTOR, plus the matching pointer type.
typedef USHORT GlobalToken, *PGlobalToken;

// Sizing constants for the local dictionaries.

#define LOCAL_HASH_CLASSES     0x8000                    // hash classes per local table
#define LOCAL_HASH_MASK        (LOCAL_HASH_CLASSES - 1)  // 0x7FFF
#define ENTRIES_PER_LOCAL_DICT 6552                      // descriptor slots per local dict
#define MAX_REFS_PER_LDICT     0x10000                   // token refs per local dict

// Number of 16-bit token values treated as direct global indices; the
// remaining values are left for entries of a local dictionary.

#define MAX_GLOBAL_TOKENS      (0x10000 - ENTRIES_PER_LOCAL_DICT)

// Note: The constant ENTRIES_PER_LOCAL_DICT is chosen to make the
//       LocalDictionary structure exactly 64K bytes.
//       NOTE(review): with the member list currently declared below the
//       structure appears to be smaller than 64K -- confirm whether this
//       sizing claim is still intended.
//
//       MAX_GLOBAL_TOKENS is a constant which allows streams of token
//       references to fit in 2-byte granules. The first MAX_GLOBAL_TOKENS
//       unique tokens we encounter are considered global. References to
//       those tokens are encoded in the value range [0..MAX_GLOBAL_TOKENS-1]
//       while references to tokens outside that set are denoted by values
//       in the range [MAX_GLOBAL_TOKENS .. 0xFFFF]. The latter values can
//       be trivially mapped into indices into the local dictionary which
//       corresponds to the token reference. One effect of this coding is
//       that most local dictionaries will collapse to empty when we convert
//       to the vector representation from the linked token representation.

// A local dictionary: the set of descriptors referenced by one bounded run
// of the token stream, together with per-descriptor reference list heads.
typedef struct _LocalDictionary
{
    PLocalToken  pltFirst;   // address of first token for this local dictionary
    UINT         clt;        // count of local tokens which refer to this Local dict
    PDESCRIPTOR *ppdNext;    // next unused slot in apdLocal.

    // The same slots hold either descriptor pointers or galactic indices.

    union
    {
        PDESCRIPTOR apdLocal  [ENTRIES_PER_LOCAL_DICT];  // Refs to descriptors used locally
        UINT        aiGalactic[ENTRIES_PER_LOCAL_DICT];  // Galactic indices for local terms
    };

    USHORT aiTokenInstFirst[ENTRIES_PER_LOCAL_DICT];     // List heads for each local
                                                         // descriptor.

} LocalDictionary, *PLocalDictionary;

// Slot numbers within the m_avb array of virtual buffers.

#define IVB_TOKEN_STREAM         0
#define IVB_TOKEN_IMAGES         1
#define IVB_IMAGE_DESCRIPTORS    2
#define IVB_DISPLAY_IMAGES       3

#define COUNT_OF_VIRTUAL_BUFFERS (IVB_DISPLAY_IMAGES + 1)   // 4 slots in all

// Shorthand names for the individual buffers. These expand to m_avb[...]
// and so are only meaningful where a member named m_avb is in scope.

#define vbTokenStream            m_avb[IVB_TOKEN_STREAM]
#define vbTokenImages            m_avb[IVB_TOKEN_IMAGES]
#define vbImageDescriptors       m_avb[IVB_IMAGE_DESCRIPTORS]
#define vbDisplayImages          m_avb[IVB_DISPLAY_IMAGES]

// Commit and Reservation constants for the virtual buffers
// in the TextDatabaseControl object. These reservations are
// based on an upper limit of 100,000,000 bytes scanned.


// Initial commit size and total address-space reservation for each buffer.
// The hex values in trailing comments on the commit lines are carried over
// from the original source (presumably earlier settings).

// Token reference stream
#define INIT_TOKEN_REF_COMMIT              0x00010000 // 0x00430000
#define INIT_TOKEN_REF_RESERVATION         0x08000000

// Token (sort key) images
#define INIT_TOKEN_IMAGE_COMMIT            0x00010000 // 0x000A0000
#define INIT_TOKEN_IMAGE_RESERVATION       0x03700000

// Image descriptors
#define INIT_IMAGE_DESCRIPTOR_COMMIT       0x00010000 // 0x00160000
#define INIT_IMAGE_DESCRIPTOR_RESERVATION  0x02A00000

// Display images
#define INIT_DISPLAY_IMAGE_COMMIT          0x00010000
#define INIT_DISPLAY_IMAGE_RESERVATION     0x03700000

#define BUFFER_INCREMENT                   0x2FFFF

#define CB_TEMP_BLOCKS                     0x10000 // Approximate block size for unbuffered I/O
#define CB_TRANSACTION_LIMIT               0x40000 // Approximate limit for unbuffered I/O transactions.

const double MEMORY_FACTOR = 0.4; // Fraction of total memory which we're
                                  // allowed to use.

// Width in bits of the <basis> value that drives reference compression.
#define CBITS_BASIS_MASK    5

// Mask selecting the low CBITS_BASIS_MASK bits (0x1F for a width of 5).
// It is built from a positive shift. The earlier form (~((~0) << n))
// left-shifted the negative int ~0, which is undefined behaviour before
// C++20 and is not accepted in constant expressions. The type (int) and
// the value are unchanged.
#define BASIS_MASK          ((1 << CBITS_BASIS_MASK) - 1)

// Describes one term's reference list.
// NOTE(review): the field meanings below are read from the names only --
// confirm against the code that fills this structure.
typedef struct _ReferenceDescriptor
{
    UINT iSerialGalactic;   // presumably the term's galactic serial number
    UINT idwRefList;        // presumably the DWord index of the reference list
    UINT cdwRefs;           // presumably the list size in DWords
    UINT iLastRef;          // presumably the last reference recorded

} ReferenceDescriptor, *PReferenceDescriptor;

// Describes one cluster of reference data.
// NOTE(review): the field meanings below are read from the names only --
// confirm against the code that fills this structure.
typedef struct _RefClusterDescriptor
{
    UINT iFilePosLow;    // file position, low  part
    UINT iFilePosHigh;   // file position, high part
    UINT cdw;            // presumably the cluster size in DWords
    UINT cTerms;         // presumably the number of terms in the cluster

} RefClusterDescriptor, *PRefClusterDescriptor;

// Fixed capacity limits used while indexing.
enum
{
    MAX_LOCAL_DICTS   = 0x1000,    //   4096
    MAX_REF_SETS      = 0x100,     //    256
    MAX_REF_CLUSTERS  = 0x200,     //    512
    CB_MERGE_BUFFER   = 0x40000,   // 262144
    SPARE_FILE_BLOCKS = 6
};

// Note: alde and aiTokenRefFirst logically go together. They've been
//       split apart to maintain DWord alignment for the alte items.

// Working state needed only while text is being indexed. CTextDatabase
// holds it through a pointer (m_pulstate).
typedef struct _UnlinkedState
{
    // --- State for constructing the current local dictionary ---

    PDESCRIPTOR     *appdLocalClasses   [LOCAL_HASH_CLASSES];
    PDESCRIPTOR     *appdCollisionChains[ENTRIES_PER_LOCAL_DICT];
    UINT             cReferences        [ENTRIES_PER_LOCAL_DICT];

    // Retired member:
    //     USHORT aiTokenInstLast[ENTRIES_PER_LOCAL_DICT];  // List tails for each local
    //                                                      // descriptor.

    PLocalDictionary pld;

#ifdef _DEBUG
    UINT             cCollisions;
#endif // _DEBUG

    PWCHAR           pbBuffer;
    PWCHAR           pbCurrentLine;
    int              cbLineAdjustment;

    // --- Items not used to construct local dictionaries ---
    //
    // They are placed here so that they will be allocated only when
    // the current text database is indexing text rather than processing
    // queries.

    RefClusterDescriptor m_rcd[MAX_REF_CLUSTERS];

    PLocalDictionary m_apLocalDict      [MAX_LOCAL_DICTS  ];  // (original note, truncated:
                                                              //  "Need a different upper")
#ifdef _DEBUG
    UINT             m_acLocalCollisions[MAX_LOCAL_DICTS  ];
#endif // _DEBUG

    UINT             m_aiBaseToken      [MAX_LOCAL_DICTS+1];
    UINT             m_aiBaseCByte      [MAX_LOCAL_DICTS+1];

} UnlinkedState;

// Context record #1.
// NOTE(review): presumably handed to a hash-table enumeration callback
// (such as MergeLocalEntries / AddLocalEntries) through its pvEnvironment
// argument -- confirm against the implementation.
//
// CTextDatabase is named with an elaborated type specifier because this
// structure comes before the class's forward declaration further down in
// this header. Without it the header compiles only if an earlier include
// has already declared the class.
typedef struct _LOCAL_CONTEXT_1    
        {
            class CTextDatabase *ptdb;
            DESCRIPTOR      **ppde;
            UINT              iDescLimit;
            UINT              iLTBase;
            UINT              cAdded;
            USHORT            ild;

        } LOCAL_CONTEXT_1;

// Context record #2: a running serial number and a vector of serial values.
// NOTE(review): intended use inferred from the names only -- confirm.
typedef struct _LOCAL_CONTEXT_2
{
    UINT  iSerialNext;
    PUINT paiSerial;

} LOCAL_CONTEXT_2;
        
// Per-term compression bookkeeping. Only the reference count remains active.
typedef struct _CompressionState
{
    UINT cRefs;

    // Retired members:
    //     UINT iRef;
    //     UINT cbitsBasis;
    //     union
    //     {
    //         UINT ibitNext;
    //         UINT cbits;
    //     };

} CompressionState;

// Context record #3: a map vector, an array of CompressionState records,
// and running totals.
// NOTE(review): intended use inferred from the names only -- confirm.
typedef struct _LOCAL_CONTEXT_3
{
    PUINT             puiMap;
    CompressionState *paCS;
    UINT              idBase;
    UINT              cdw;
    UINT              cNewRefLists;

} LOCAL_CONTEXT_3;

// Context record #4: a cursor into a vector of descriptor pointers, plus
// the base of the descriptor array.
// NOTE(review): intended use inferred from the names only -- confirm.
typedef struct _LOCAL_CONTEXT_4
{
    PDESCRIPTOR *ppd;
    PDESCRIPTOR  pdBase;

} LOCAL_CONTEXT_4;

// Forward declarations for the class defined below and for its token list.

class CTextDatabase;
class CTokenList;

// Free callbacks which CTextDatabase declares as friends. Both share the
// (value, tag, environment) signature.

void MergeLocalEntries(UINT  iValue,
                       PVOID pvTag,
                       PVOID pvEnvironment
                      );

void AddLocalEntries  (UINT  iValue,
                       PVOID pvTag,
                       PVOID pvEnvironment
                      );

// CTextDatabase -- a text database layered on CTextMatrix.
//
// The public section declares the entry points for adding text (AppendText),
// bringing the database to a searchable state (SyncForQueries), saving it
// (StoreImage), and a family of query helpers which report results through
// CIndicatorSet objects. The private section holds the state used while an
// index is being built or has been loaded.
//
// The class is abstract: GetPartitionInfo and ArticleCount are pure virtual
// and must be supplied by a derived class.
class CTextDatabase : public CTextMatrix
{
    // These classes and callbacks reach directly into the private state.

    friend class CTokenList;
    friend class CTokenCollection;
    friend class CHiliterTokenList;
    
    friend void MergeLocalEntries(UINT iValue, PVOID pvTag, PVOID pvEnvironment);
    friend void AddLocalEntries  (UINT iValue, PVOID pvTag, PVOID pvEnvironment);

    public:

    //    static CTextDatabase *NewTextDatabase();

        virtual ~CTextDatabase();

        // The base version asserts and returns NULL; derived classes are
        // expected to override it.
        virtual const BYTE *GetSourceName() {ASSERT(0);return NULL;} // Provide this function

        DECLARE_REF_COUNTERS(CTextDatabase)

// Save/Load Interface --

        void StoreImage(CPersist *pDiskImage);

        // Indexing interface. iCharset defaults to ANSI_CHARSET and lcid to 0x409.

        int  AppendText(PWCHAR pbText, int  cbText, BOOL fArticleEnd, UINT iCharset= ANSI_CHARSET, UINT lcid= 0x409);
        void SyncForQueries();

        // Size queries (several are defined inline after the class).

        UINT CharacterCount ();
        UINT TokenCount     ();
		UINT DescriptorCount();
        UINT MaxTokenWidth  ();

        VOID GetTextMatrix(int iRowStart, int iColStart, 
                           int cRows,     int cCols,     PWCHAR pbDest);

        UINT TextLength(PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit);
        UINT CopyText  (PDESCRIPTOR *ppdSorted, PUINT puiTokenMap, UINT iTokenStart, UINT iTokenLimit, PWCHAR pbBuffer, UINT cbBuffer);

        // Reference queries; results are delivered through the CIndicatorSet arguments.

        void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, UINT iPartition,          const UINT *piMap);
        void IndicateVocabularyRefs(CIndicatorSet *pisVocabulary, CIndicatorSet *pisTokens, const UINT *piMap);
        void IndicateArticleRefs   (CIndicatorSet *pisArticles,   UINT iDescriptor,  const UINT *piMap);
        void IndicateTokenRefs     (CIndicatorSet *pisTokens  ,   UINT iDescriptor);
                                      
        CIndicatorSet *TopicInstancesFor    (CTokenList *ptl);
        CIndicatorSet *TokenInstancesFor    (CTokenList *ptl);
        UINT           TokenInstanceCountFor(CTokenList *ptl);
        CIndicatorSet *SymbolLocations();

        CIndicatorSet *VocabularyFor(CIndicatorSet *pisArticles, BOOL fRemovePervasiveTerms= FALSE);
        
		CIndicatorSet *ValidTokens(CTokenList *ptl);

        // Option tests. Each returns the masked bits of m_fdwOptions, so a
        // set option yields a nonzero value rather than exactly TRUE.

        inline BOOL FPhrases       () { return m_fdwOptions & PHRASE_SEARCH;   }
        inline BOOL FPhraseFeedback() { return m_fdwOptions & PHRASE_FEEDBACK; }
        inline BOOL FVectorSearch  () { return m_fdwOptions & VECTOR_SEARCH;   }
        inline UINT IndexOptions   () { return m_fdwOptions;                   }

        // Both accessors assert that FVectorSearch() is nonzero (see the inline definitions).

		CDictionary	*PDict();
		CCollection	*PColl();

        LCID SortingLCID();

    protected:

        // Construction is protected. Debug builds take a type name which
        // defaults to "TextDatabase".

#ifdef _DEBUG

             CTextDatabase(PSZ pszTypeName= "TextDatabase");

#else // _DEBUG

             CTextDatabase();

#endif // _DEBUG

        void InitTextDatabase(BOOL fFromFile= FALSE);
        
        void ConnectImage(CPersist *pDiskImage, BOOL fUnpackDisplayForm= TRUE);
        
        // The text is presented as a single row whose width is the number
        // of characters scanned.

        inline int Data_cRows() { return 1; }
        inline int Data_cCols() { return m_cbScanned; }

        // Forwards to GetTextMatrix. The charsets argument is not used here.

        inline void Data_GetTextMatrix(int  rowTop, int  colLeft,
                                       int  rows, int  cols, PWCHAR lpb, PUINT charsets
                                      )
        {
             GetTextMatrix(rowTop, colLeft, rows, cols, lpb);
        }

        const UINT * TermRanks();
        PUINT TokenBase();

        UINT m_fdwOptions;  // Option bits tested by FPhrases, FPhraseFeedback and FVectorSearch.

    private:

#ifdef _DEBUG

        BOOL   m_fInitialized;  // Present in debug builds only.

#endif // _DEBUG
        
        UINT   m_fFromFileImage;
        UINT   m_cbScanned;       // Returned by CharacterCount() and Data_cCols().
        UINT   m_cTokensIndexed;
        USHORT m_cLocalDicts;
        USHORT m_iLocalDictBase;

        // Virtual buffers, addressed through the vbTokenStream, vbTokenImages,
        // vbImageDescriptors and vbDisplayImages macros.

		MY_VIRTUAL_BUFFER m_avb[COUNT_OF_VIRTUAL_BUFFERS];

        CSegHashTable *m_pshtGalactic;
        CSegHashTable *m_pshtGlobal;

        PUINT   m_pwHash;  // Working storage for the AppendSlave routine...
        PBYTE   m_pbType;
        PWCHAR *m_paStart;
        PWCHAR *m_paEnd;
        
        CIndicatorSet *m_pisSymbols;  // Returned by SymbolLocations().

        PLocalToken m_pltNext;        // Used by TokenCount() when no local dictionary is current.
        PUINT       m_puiTokenNext;
        
        PDESCRIPTOR m_pdNext, m_pdNextGlobal, m_pdNextGalactic, m_pdNextBound;
        PWCHAR      m_pbNext, m_pbNextGlobal, m_pbNextGalactic, m_pbLastGalactic;
        PWCHAR      m_pwDispNext, m_pwDispNextGlobal, m_pwDispNextGalactic, m_pwDispLastGalactic;

        UINT        m_iSerialNumberNext;  // Returned by DescriptorCount().

        PUINT       m_paiGlobalToRefList;
        
        // Token reference lists (read by IndicateTokenRefs).

        CUnbufferedIO *m_puioRefTemp;
        CUnbufferedIO *m_puioCompressedRefs;
        PRefListDescriptor m_prldTokenRefs;
        UINT           m_cdwCompressedRefs;
        PUINT          m_pdwCompressedRefs;

        // Article reference lists (read by IndicateArticleRefs).

        CUnbufferedIO *m_puioCompressedArticleRefs;
        PRefListDescriptor m_prldArticleRefs;
        UINT           m_cdwArticleRefs;
        PUINT          m_pdwArticleRefs;

        // Vocabulary reference lists.

        CUnbufferedIO *m_puioCompressedVocabularyRefs;
        PRefListDescriptor m_prldVocabularyRefs;
        UINT           m_cdwVocabularyRefs;
        PUINT          m_pdwVocabularyRefs;

        UINT           m_cbBlockSize;
        UINT           m_cbTransactionLimit; 

        UINT                 m_iNextRefSet;
        UINT                 m_ibNextFileBlockLow;
        UINT                 m_ibNextFileBlockHigh;

        PFileBlockLink m_pFirstFreeFileBlock;
        PFileBlockLink m_papFileBlockLinks;

        CIOList *m_piolLeft;
        CIOList *m_piolRight;
        CIOList *m_piolResult;

        LCID         m_lcidSorting;     // Returned by SortingLCID().
        PDESCRIPTOR *m_ppdSorted;       // left-to-right sorting vector
        PDESCRIPTOR *m_ppdTailSorted;   // right-to-left sorting vector
        UINT         m_cdSorted;        // number of sorted terms
        UINT         m_cwDisplayMax;    // Returned by MaxTokenWidth().

        UINT         m_cTermRanks;
        PUINT        m_pTermRanks;

        CClassifier m_clsfTokens;
        PUINT       m_pafClassifications;
                                                       
		CDictionary		*m_pDict;   // Returned by PDict().
		CCollection		*m_pColl;   // Returned by PColl().

// BugBug! The private members below are used only during index creation.
//         Convert them to external allocations so we don't pay the price
//         when we're loading an index.

        UnlinkedState *m_pulstate;  // TokenCount() tests this for NULL before using it.

        // Pure virtuals: a derived class must supply partition and article information.

        virtual UINT GetPartitionInfo(const UINT **ppaiPartitions, const UINT **ppaiRanks= NULL, const UINT **ppaiMap= NULL) = 0;
        virtual UINT ArticleCount() = 0;

        // Base addresses of three of the virtual buffers (defined inline after the class).

        PDESCRIPTOR DescriptorBase   ();
        PWCHAR      ImageBase        ();
        PWCHAR      DisplayBase      ();

        int AppendSlave(PWCHAR pbText, int  cbText, BOOL fArticleEnd, UINT iCharset, UINT lcid);

        int ExceptionFilter(IN DWORD ExceptionCode, IN PEXCEPTION_POINTERS ExceptionInfo);

        USHORT SearchLocalTable(PWCHAR pbToken, UINT cbToken, UINT  hv, BYTE bType, UINT iCharset, UINT lcid);

        CAValRef *DescriptorList(PDESCRIPTOR pd, UINT cd);

        void ExtendClassifications(PDESCRIPTOR pdSuffix);

        void IndicateMappedRefs(PRefListDescriptor prld, PUINT pdwRefBase, CIndicatorSet *pisArticles, const UINT *piMap);
        
        int IndicateRefs(PRefListDescriptor prld, PUINT pdwRefLists, CIndicatorSet *pis, BOOL fCountOnly, PUINT paiCountArray= NULL);

        void WriteLargeBuff(PVOID pvBuffer, UINT iPosLow, UINT iPosHigh, UINT cbBuffer);

        PLocalDictionary AllocateLocalDictionary();
        PLocalDictionary MoveToNextLocalDict    (PWCHAR pbScanLimit);

        PDESCRIPTOR *FindTokens(CTokenList *ptl, PUINT pcd= NULL);

        void BindToGlobalDict(PWCHAR pbScanLimit);

        // Index construction passes (see the phase descriptions at the top of this file).

        void FlattenAndMergeLinks  ();
        void GalacticMerge         ();
        void CoalesceReferenceLists();

        void MergeRefLists(PRefStream prsResult, PRefStream pars, UINT cRefStreams);
        void ConstructVocabularyLists();
        void CompressVocabularyLists(CIOList *piolSource, UINT cdw);
        void CompressArticleRefLists(CIOList *piolSource, UINT cdw);
        void CompressRefLists       (CIOList *piorSource, UINT cdw);
        void CopyRefStreamSegment(CIOList *piolSource, CIOList *piolDestination, UINT cdw);
 };

// Base address of the image descriptor buffer, viewed as DESCRIPTORs.
inline PDESCRIPTOR CTextDatabase::DescriptorBase()
{
    return (PDESCRIPTOR) (vbImageDescriptors.Base);
}

// Base address of the token image buffer, viewed as wide characters.
inline PWCHAR CTextDatabase::ImageBase()
{
    return (PWCHAR) (vbTokenImages.Base);
}

// Base address of the display image buffer, viewed as wide characters.
inline PWCHAR CTextDatabase::DisplayBase()
{
    return (PWCHAR) (vbDisplayImages.Base);
}

// Returns m_cbScanned.
inline UINT CTextDatabase::CharacterCount()
{
    return m_cbScanned;
}

// Returns m_iSerialNumberNext.
inline UINT CTextDatabase::DescriptorCount()
{
    return m_iSerialNumberNext;
}

// Returns m_cwDisplayMax.
inline UINT CTextDatabase::MaxTokenWidth()
{
    return m_cwDisplayMax;
}

// Base address of the token stream buffer, viewed as an array of UINTs.
inline PUINT CTextDatabase::TokenBase()
{
    PUINT puiBase= (PUINT) (vbTokenStream.Base);

    return puiBase;
}

// Number of tokens in the token stream, counted in LocalToken units from
// TokenBase(). When an UnlinkedState with a current local dictionary exists,
// the stream is taken to end after that dictionary's tokens
// (pltFirst + clt); otherwise it ends at m_pltNext.
inline UINT  CTextDatabase::TokenCount()
{
    PLocalToken      pltBase   = (PLocalToken) TokenBase();
    PLocalDictionary pldCurrent= 0;

    if (m_pulstate) pldCurrent= m_pulstate->pld;

    if (!pldCurrent) return m_pltNext - pltBase;

    return (pldCurrent->pltFirst + pldCurrent->clt) - pltBase;
}

// Returns the m_pisSymbols pointer as stored.
inline CIndicatorSet *CTextDatabase::SymbolLocations()
{
    return m_pisSymbols;
}

// Passes the token reference list of descriptor iDescriptor to IndicateRefs,
// reading from m_pdwCompressedRefs, with pisTokens as the target set and
// fCountOnly FALSE. The value returned by IndicateRefs is discarded.
inline void CTextDatabase::IndicateTokenRefs(CIndicatorSet *pisTokens, UINT iDescriptor)
{
    PRefListDescriptor prldTerm= m_prldTokenRefs + iDescriptor;

    IndicateRefs(prldTerm, m_pdwCompressedRefs, pisTokens, FALSE);
}

// Passes the article reference list of descriptor iDescriptor to
// IndicateMappedRefs, reading from m_pdwArticleRefs, with pisArticles as the
// target set and piMap as the map argument.
inline void CTextDatabase::IndicateArticleRefs(CIndicatorSet *pisArticles, UINT iDescriptor, const UINT *piMap)
{
    PRefListDescriptor prldTerm= m_prldArticleRefs + iDescriptor;

    IndicateMappedRefs(prldTerm, m_pdwArticleRefs, pisArticles, piMap);
}

// Returns m_pDict. Asserts that the vector search option is set.
inline CDictionary *CTextDatabase::PDict()
{
    ASSERT(FVectorSearch());

    return m_pDict;
}

// Returns m_pColl. Asserts that the vector search option is set.
inline CCollection *CTextDatabase::PColl()
{
    ASSERT(FVectorSearch());

    return m_pColl;
}

// Returns m_lcidSorting.
inline LCID CTextDatabase::SortingLCID()
{
    return m_lcidSorting;
}

#endif // __TXDBASE_H__