|
|
/*************************************************************************
* * * FTCOMMON.C * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent * * Routine common for both index & search * * * ************************************************************************** * * * Current Owner: BinhN * * * **************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <mvsearch.h>
#include <groups.h>
#include <orkin.h>
#include "common.h"
static BYTE NEAR s_aszModule[] = __FILE__; // Used by error return functions.
/*************************************************************************
* EXTERNAL VARIABLES * All those variables must be read only *************************************************************************/
FDECODE DecodeTable[] = { GetBitStreamDWord, GetFixedDWord, GetBellDWord, };
/*
* The mask table. Its values correspond to the maximum number * we can get depending on how many bits are left in the byte. * For ease of usage, the table should be used starting at 1 * intead of 0 */ static BYTE NEAR BitLeftMask [] = { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff, };
/*
* The bit mask table. This is used to speed up FGetBool() */ static BYTE BitMask [] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, };
/*************************************************************************
* * INTERNAL GLOBAL FUNCTIONS * All of them should be declared far, unless they are known to be called * in the same segment *************************************************************************/ PUBLIC LST PASCAL FAR ExtractWord(LST, LST, LPW); PUBLIC HRESULT PASCAL FAR ReadStemNode (PNODEINFO, int); PUBLIC HRESULT PASCAL FAR ReadLeafNode (PNODEINFO, int); PUBLIC int PASCAL FAR StrCmpPascal2(LPB lpStr1, LPB lpStr2); PUBLIC HRESULT PASCAL FAR GetBellDWord (PNODEINFO, CKEY, LPDW); PUBLIC HRESULT PASCAL FAR GetBitStreamDWord (PNODEINFO, CKEY, LPDW); PUBLIC HRESULT PASCAL FAR GetFixedDWord (PNODEINFO, CKEY, LPDW); PUBLIC HRESULT PASCAL FAR ReadNewData(PNODEINFO pNodeInfo); PUBLIC int PASCAL FAR ReadFileOffset (FILEOFFSET FAR *, LPB); PUBLIC HRESULT PASCAL FAR FGetBits(PNODEINFO, LPDW, CBIT); PUBLIC HRESULT PASCAL FAR ReadIndexHeader(HFPB, PIH20); PUBLIC HRESULT PASCAL FAR CopyFileOffset (LPB pDest, FILEOFFSET fo); PUBLIC CB PASCAL FAR CbBytePack (LPB lpbOut, DWORD dwIn);
/*************************************************************************
* * INTERNAL PRIVATE FUNCTIONS * All of them should be declared near *************************************************************************/ PRIVATE HRESULT PASCAL NEAR FGetBool(PNODEINFO pNodeInfo); PRIVATE HRESULT PASCAL NEAR FGetScheme(PNODEINFO, LPCKEY);
/*************************************************************************
* @doc INTERNAL * * @func LST PASCAL FAR | ExtractWord | * Extract a compacted word from the input buffer and store * it as a Pascal string. * * @parm LST | lstWord | * Buffer in which to put the Pascal word. * * @parm LST | lstCurPtr | * Current pointer to the source. The word here is compacted * * @parm LPW | pWlen | * pointer to word's length buffer to be udpated * * @rdesc * The function will return the new position of the source pointer * * @comm * There is a trick in the compression. This function works on * the assumption that the SAME buffer is used for lstWord on * successive call to it. The first word is always whole. Subsequent * words may be part of the 1st one * Ex: "solid" and "solidarity" will be encoded as: * [0:5]solid and [5:5]arity * The first call will be made for "solid" which will fill lstWord * with: * 5solid * The next call will update the postfix and the word length * 10solidarity * Note that "solid" is the remain of last call buffer * * Note: Tehre is no error checking for speed *************************************************************************/ PUBLIC LST PASCAL FAR ExtractWord(LST lstWord, LST lstCurPtr, LPW pwRealLength) { SHORT cbPrefix; SHORT cbPostfix; DWORD dwPrefixLength; BYTE fHasWordLength;
// Get the prefix length
lstCurPtr += CbByteUnpack (&dwPrefixLength, lstCurPtr); cbPostfix = (USHORT)dwPrefixLength - 1; cbPrefix = (*lstCurPtr) & 0x7f; fHasWordLength = (*lstCurPtr) & 0x80; lstCurPtr++;
// Update word length
*pwRealLength = *(PUSHORT)lstWord = (USHORT)(cbPrefix + cbPostfix); lstWord += sizeof(SHORT);
// Copy postfix
MEMCPY (lstWord + cbPrefix, lstCurPtr, cbPostfix); lstCurPtr += cbPostfix;
if (fHasWordLength) { lstCurPtr += CbByteUnpack(&dwPrefixLength, lstCurPtr); *pwRealLength = (WORD)dwPrefixLength; } CbBytePack (lstWord + cbPrefix + cbPostfix, (DWORD)(*pwRealLength)); return lstCurPtr; }
PUBLIC CB PASCAL NEAR CbBytePack (LPB lpbOut, DWORD dwIn) { LPB lpbOldOut;
/* Save the old offset */ lpbOldOut = lpbOut;
do { *lpbOut = (BYTE)(dwIn & 0x7F); /* Get 7 bits. */ dwIn >>= 7; if (dwIn) *lpbOut |= 0x80; /* To be continued... */ lpbOut++; } while (dwIn); return (CB)(lpbOut - lpbOldOut); /* Return compressed width */ }
int FAR PASCAL StrCmpPascal2(LPB lpStr1, LPB lpStr2) { int fRet; int register len;
// Get the minimum length
if ((fRet = *(LPUW)lpStr1 - *(LPUW)lpStr2 ) > 0) len = *(LPUW)lpStr2; else len = *(LPUW)lpStr1;
// Skip the lengths */
lpStr1 += sizeof (SHORT); lpStr2 += sizeof (SHORT);
// Start compare byte per byte */
for (; len > 0; len--, lpStr1++, lpStr2++) { if (*lpStr1 != *lpStr2) break; }
if (len == 0) return fRet; return (*lpStr1 - *lpStr2); }
int PASCAL FAR ReadFileOffset (FILEOFFSET FAR *pFo, LPB pSrc) { pFo->dwOffset = GETLONG ((LPUL)pSrc); pSrc += sizeof (DWORD); pFo->dwHigh = GETWORD ((LPUL)pSrc);
return FOFFSET_SIZE; }
PUBLIC HRESULT PASCAL FAR CopyFileOffset (LPB pDest, FILEOFFSET fo) {
*(LPUL)pDest = fo.dwOffset; pDest += sizeof (DWORD); *(LPUW)pDest = (WORD)fo.dwHigh; pDest += sizeof (WORD); return FOFFSET_SIZE; }
/*************************************************************************
* @doc INTERNAL * * @func HRESULT FAR PASCAL | ReadIndexHeader | * Read the index header * * @parm HFPB | hfpb | * Handle of index file * * @parm PIH20 | pHeader | * Pointer to header info structure * * @rdesc S_OK if succeeded, errors otherwise *************************************************************************/ PUBLIC HRESULT FAR PASCAL ReadIndexHeader(HFPB hfpbSubFile, PIH20 pHeader) { ERRB errb;
// foNil should, of course, be nil
ITASSERT(0 == foNil.dwOffset && 0 == foNil.dwHigh);
if (FileSeekRead(hfpbSubFile, pHeader, foNil, sizeof(IH20), &errb) != sizeof(IH20)) { return errb; }
/* Mac code. Those following lines will be optimized out */ pHeader->FileStamp = SWAPWORD(pHeader->FileStamp); pHeader->version = SWAPWORD(pHeader->version); pHeader->lcTopics = SWAPLONG(pHeader->lcTopics); pHeader->foIdxRoot.dwOffset = SWAPLONG(pHeader->foIdxRoot.dwOffset); pHeader->foIdxRoot.dwHigh = SWAPLONG(pHeader->foIdxRoot.dwHigh); pHeader->nidLast = SWAPLONG(pHeader->nidLast); pHeader->nidIdxRoot = SWAPLONG(pHeader->nidIdxRoot); pHeader->cIdxLevels = SWAPWORD(pHeader->cIdxLevels); pHeader->occf = SWAPWORD(pHeader->occf); pHeader->idxf = SWAPWORD(pHeader->idxf); /* Index statistics */ pHeader->dwMaxFieldId = SWAPLONG(pHeader->dwMaxFieldId); pHeader->dwMaxWCount = SWAPLONG(pHeader->dwMaxWCount); pHeader->dwMaxOffset = SWAPLONG(pHeader->dwMaxOffset); pHeader->dwMaxWLen = SWAPLONG(pHeader->dwMaxWLen); pHeader->dwBlockSize = SWAPLONG(pHeader->dwBlockSize); pHeader->dwMinTopicId = SWAPLONG(pHeader->dwMinTopicId); pHeader->dwMaxTopicId = SWAPLONG(pHeader->dwMaxTopicId);
/* New members for index file version 4.0 */ pHeader->dwCodePageID = SWAPLONG(pHeader->dwCodePageID); pHeader->lcid = SWAPLONG(pHeader->lcid); pHeader->dwBreakerInstID = SWAPLONG(pHeader->dwBreakerInstID);
return S_OK; }
/*************************************************************************
* @doc INTERNAL * * @func HRESULT PASCAL FAR | ReadNewData| * Read in mode data * * @parm PNODEINFO | pNodeInfo | * Pointer to data node info * @rdesc S_OK if succesful, otherwise other errors *************************************************************************/ PUBLIC HRESULT PASCAL FAR ReadNewData(PNODEINFO pNodeInfo) { ERRB errb; DWORD cbDataRead; LONG cbRead;
if ((cbDataRead = pNodeInfo->dwDataSizeLeft) == 0) return(S_OK); // Nothing else to read
if (cbDataRead > FILE_BUFFER) cbDataRead = FILE_BUFFER;
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF2("Data read @ %lu, size = %lu\n", pNodeInfo->nodeOffset.dwOffset, \ cbDataRead); #endif
// Read the data block
if ((cbRead = FileSeekRead (pNodeInfo->hfpbIdx, pNodeInfo->pBuffer, pNodeInfo->nodeOffset, cbDataRead, &errb)) != (LONG)cbDataRead) { SetErrCode (&errb, E_ASSERT); } pNodeInfo->pMaxAddress = pNodeInfo->pBuffer + cbRead; pNodeInfo->dwDataSizeLeft -= cbRead; pNodeInfo->nodeOffset = FoAddDw(pNodeInfo->nodeOffset, cbRead); pNodeInfo->pCurPtr = pNodeInfo->pBuffer; pNodeInfo->ibit = cbitBYTE - 1;
return S_OK; }
/*************************************************************************
* @doc INTERNAL * * @func HRESULT PASCAL FAR | ReadLeafNode | * Leaf node structure: * Next block Ptr | CbLeft |* Word | PointerToNode *| Slack * 6b 2b | Var | 6b * * @parm PNODEINFO | pNodeInfo | * Pointer to leaf info * * @parm HRESULT | cLevel | * Level of the node. 0 is the top node * * @rdesc S_OK if succesful, otherwise other errors *************************************************************************/ PUBLIC HRESULT PASCAL FAR ReadLeafNode (PNODEINFO pNodeInfo, int cLevel) { ERRB errb;
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF2("Leaf read @ %lu, size = %lu\n", pNodeInfo->nodeOffset.dwOffset, \ pNodeInfo->dwBlockSize); #endif
pNodeInfo->pBuffer = pNodeInfo->pLeafNode; if (FileSeekRead (pNodeInfo->hfpbIdx, pNodeInfo->pBuffer, pNodeInfo->nodeOffset, pNodeInfo->dwBlockSize, &errb) != (long)pNodeInfo->dwBlockSize) { return (errb); } // Remember to subtract cbLeft from the node size
pNodeInfo->cbLeft = GETWORD((LPUW)(pNodeInfo->pBuffer + FOFFSET_SIZE)); pNodeInfo->pCurPtr = pNodeInfo->pBuffer + FOFFSET_SIZE + sizeof(WORD); pNodeInfo->pMaxAddress = pNodeInfo->pBuffer + pNodeInfo->dwBlockSize - pNodeInfo->cbLeft; return S_OK; }
/*************************************************************************
* @doc INTERNAL * * @func HRESULT PASCAL FAR | ReadStemNode | * Read in a new node from the disk if it is not the top node. * For the top node, just reset various pointers * Stem node structure: * CbLeft |* Word | PointerToNode *| Slack * 2b | Var | 6b * * @parm PNODEINFO | pNodeInfo | * Pointer to leaf info * * @parm int | cLevel | * Level of the node. 0 is the top node * * @rdesc S_OK if succesful, otherwise other errors *************************************************************************/ PUBLIC HRESULT PASCAL FAR ReadStemNode (PNODEINFO pNodeInfo, int cLevel) { ERRB errb; #if 0
DWORD cbLeft; #endif
if (cLevel == 0) { // The top node is buffered.
pNodeInfo->pBuffer = pNodeInfo->pTopNode;
} else { // The rest isn't.
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF2("Stem read @ %lu, size = %lu\n", pNodeInfo->nodeOffset.dwOffset, \ pNodeInfo->dwBlockSize); #endif
pNodeInfo->pBuffer = pNodeInfo->pStemNode; if (FileSeekRead (pNodeInfo->hfpbIdx, pNodeInfo->pBuffer, pNodeInfo->nodeOffset, pNodeInfo->dwBlockSize, &errb) != (long)pNodeInfo->dwBlockSize) { return (errb); } } // Remember to subtract cbLeft from the node size
#if 0
pNodeInfo->pCurPtr = pNodeInfo->pBuffer + CbByteUnpack (&cbLeft, pNodeInfo->pBuffer); #else
pNodeInfo->cbLeft = GETWORD(pNodeInfo->pBuffer); pNodeInfo->pCurPtr = pNodeInfo->pBuffer + sizeof(WORD); #endif
pNodeInfo->pMaxAddress = pNodeInfo->pBuffer + pNodeInfo->dwBlockSize - pNodeInfo->cbLeft; return S_OK; }
PUBLIC HRESULT PASCAL FAR GetFixedDWord (PNODEINFO pNodeInfo, CKEY ckey, LPDW lpdw) { return FGetBits(pNodeInfo, lpdw, (CBIT)(ckey.ucCenter + 1)); }
/*************************************************************************
* @doc INTERNAL * * @func PUBLIC HRESULT PASCAL FAR | GetBellDWord | * This function decode a dword encoded with the Bell scheme * * @parm PNODEINFO | pNodeInfo | * Pointer to NODEINFO structure * * @parm CKEY | ckey | * Decoding key * * @parm LPDW | lpdw | * Place to store the result * * @rdesc S_OK if everything is OK, errors otherwise * * @comm * The Bell compression scheme works as followed. If we have 1000 * numbers, 90% of them require 6 bits, and 10% require more than * 6 bits. In this case, ckey.ucCenter will be 6, ie. the minimum * number of bits. The extra number of bits needed to store the * extra numbers is stored as a series of 1's * - First, we check the next bit. If it is 1, then extra bits * are needed. We then keep reading all the 1's bits to know * the number of extra bits needed * - We then read the number using the number of bits needed *************************************************************************/ PUBLIC HRESULT PASCAL FAR GetBellDWord (PNODEINFO pNodeInfo, CKEY ckey, LPDW lpdw) { register BYTE bData; register int cBitLeft; register int cCount; HRESULT fRet; LPB lpbCurPtr; DWORD dwVal; LPB lpMaxAddress = pNodeInfo->pMaxAddress; int tmp; DWORD dwBlockSize = pNodeInfo->dwBlockSize;
cCount = ckey.ucCenter; dwVal = 0L;
if ((lpbCurPtr = pNodeInfo->pCurPtr) >= lpMaxAddress) { if ((fRet = ReadNewData (pNodeInfo)) != S_OK) return fRet; lpbCurPtr = pNodeInfo->pCurPtr; lpMaxAddress = pNodeInfo->pMaxAddress; }
/* This is duplicate of FGetBool() to speed things up */
/* Make copy of pNodeInfo->lrgbCurPtr and pNodeInfo->ibit to avoid indexed
* references */ bData = *lpbCurPtr;
/* Check to make suere that we do have enough data */
if ((cBitLeft = pNodeInfo->ibit) < 0) { if (++lpbCurPtr >= lpMaxAddress) { if ((fRet = ReadNewData (pNodeInfo)) != S_OK) return fRet; lpbCurPtr = pNodeInfo->pCurPtr; lpMaxAddress = pNodeInfo->pMaxAddress; } bData = *lpbCurPtr; cBitLeft = cbitBYTE - 1; }
if ((bData & BitMask[cBitLeft--]) > 0) { /* Get the number of all extra bits */
for (;;) { /* This is duplicate of FGetBool() to speed things up */
if (cBitLeft < 0) { if (++lpbCurPtr >= lpMaxAddress) { if ((fRet = ReadNewData (pNodeInfo)) != S_OK) return fRet; lpbCurPtr = pNodeInfo->pCurPtr; lpMaxAddress = pNodeInfo->pMaxAddress; } bData = *lpbCurPtr; cBitLeft = cbitBYTE - 1; }
if ((bData & BitMask[cBitLeft--]) > 0) cCount++; else break; }
dwVal = (1L << cCount);
}
cBitLeft ++;
if (cCount) {
/* Duplicate of FGetBits() to speed things up */ do { if (cBitLeft <= 0) { if (++lpbCurPtr >= lpMaxAddress) { if ((fRet = ReadNewData (pNodeInfo)) != S_OK) return fRet; lpbCurPtr = pNodeInfo->pCurPtr; lpMaxAddress = pNodeInfo->pMaxAddress; } cBitLeft = cbitBYTE; bData = *lpbCurPtr; }
if (cCount >= (tmp = cBitLeft)) { dwVal |= ((DWORD)(bData & BitLeftMask[cBitLeft])) << (cCount -= cBitLeft); cBitLeft = 0; } else { dwVal |= (((DWORD)(bData& BitLeftMask[tmp])) >> (cBitLeft -= cCount)); break; } } while (cCount); }
/* Update values */ *lpdw = dwVal; pNodeInfo->ibit = cBitLeft - 1;
pNodeInfo->pCurPtr = lpbCurPtr; return S_OK; }
PUBLIC HRESULT PASCAL FAR GetBitStreamDWord (PNODEINFO pNodeInfo, CKEY ckey, LPDW lpdw) { int fBit; *lpdw = 0;
for (;;) { if ((fBit = FGetBool(pNodeInfo)) > 0) (*lpdw)++; else if (!fBit) return S_OK; else return E_FAIL; } }
/*************************************************************************
* * INTERNAL PRIVATE FUNCTIONS * All of them should be declared near *************************************************************************/ // Get a single bit from the index.
PRIVATE HRESULT PASCAL FAR FGetBool ( PNODEINFO pNodeInfo) // Current leaf info.
{ if ((short)pNodeInfo->ibit < 0) { pNodeInfo->pCurPtr++; pNodeInfo->ibit = cbitBYTE - 1; } return *pNodeInfo->pCurPtr & (1 << pNodeInfo->ibit--); }
PRIVATE HRESULT PASCAL FAR FGetScheme( PNODEINFO pNodeInfo, // Current leaf info.
LPCKEY lpckey) // Output buffer.
{ int fSchemeBit; // Scratch boolean.
HRESULT fRet;
lpckey->cschScheme = CSCH_NONE; if ((fSchemeBit = FGetBool(pNodeInfo)) == (int)-1) { return E_FAIL; } if (fSchemeBit) lpckey->cschScheme = CSCH_FIXED; else { if ((fSchemeBit = FGetBool(pNodeInfo)) == (int)-1) { return E_FAIL; } if (fSchemeBit) lpckey->cschScheme = CSCH_BELL; } if ((lpckey->cschScheme == CSCH_BELL) || (lpckey->cschScheme == CSCH_FIXED)) { DWORD dwTmp;
if ((fRet = FGetBits(pNodeInfo, &dwTmp, 5)) != S_OK) { return fRet; } lpckey->ucCenter = (BYTE)dwTmp; } return S_OK; }
// - - - - - - - - -
// Get some number of bits from the index. If this function were
// faster life would be better. It's called incredibly frequently.
PUBLIC HRESULT PASCAL FAR FGetBits( PNODEINFO pNodeInfo, // Current leaf info.
LPDW lpdw, // Output buffer.
register CBIT cbit) // How many bits to get.
{ register BYTE cBitLeft; register BYTE tmp; LPB lpbCurPtr; DWORD dwVal; // HRESULT fRet;
if (cbit == 0) { *lpdw = 0; return S_OK; } dwVal = 0; cBitLeft = pNodeInfo->ibit + 1; lpbCurPtr = pNodeInfo->pCurPtr;
do { HRESULT fRet;
if (cBitLeft <= 0) { lpbCurPtr++; if (lpbCurPtr >= pNodeInfo->pMaxAddress) { if ((fRet = ReadNewData (pNodeInfo)) != S_OK) return fRet; lpbCurPtr = pNodeInfo->pCurPtr; }
cBitLeft = cbitBYTE; } if (cbit >= (CBIT)(tmp = cBitLeft)) { dwVal |= ((DWORD)(*lpbCurPtr & BitLeftMask[cBitLeft])) << (cbit -= cBitLeft); cBitLeft = 0; } else { dwVal |= (((DWORD)(*lpbCurPtr & BitLeftMask[tmp])) >> (cBitLeft -= cbit)); break; } } while (cbit);
/* Update values */ *lpdw = dwVal; pNodeInfo->ibit = cBitLeft - 1; pNodeInfo->pCurPtr = lpbCurPtr; return S_OK; }
|