|
|
//+-------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1992 - 2000.
//
// File: cdoc.cxx
//
// Contents: a radically stripped down version of the document class
// that gets rid of the notion of paragragph and maintains only
// information relative to the stream
//
//--------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <cidebug.hxx>
#include <dynstack.hxx>
#include <cimbmgr.hxx>
#include <propspec.hxx>
#include <vquery.hxx>
#include <pageman.hxx>
#include <dblink.hxx>
#include <imprsnat.hxx>
#include <queryexp.hxx>
#include "whmsg.h"
#include "webdbg.hxx"
#include "cdoc.hxx"
//+-------------------------------------------------------------------------
//
// Function: ComparePositions
//
// Arguments: const void* pPos1 - pointer to first position
// const void* pPos2 - pointer to second position
//
// Synopsis: Comparison function used by qsort to sort positions array
//
//--------------------------------------------------------------------------
int _cdecl ComparePositions( const void* pPos1, const void* pPos2 ) { Position* pp1= (Position*) pPos1; Position* pp2= (Position*) pPos2;
Win4Assert(0 != pp1 && 0 !=pp2);
if (pp1->GetBegOffset() == pp2->GetBegOffset()) return 0; else if (pp1->GetBegOffset() < pp2->GetBegOffset()) return -1; else return 1; }
void Hit::Sort() { qsort( _aPos, _cPos, sizeof(Position), &ComparePositions ); }
//+-------------------------------------------------------------------------
//
// Member: Hit::Hit, public
//
// Arguments: [aPos] - array of positions
// [cPos] - number of Positions in [aPos]
//
// Synopsis: Create hit from an array of positions
//
//--------------------------------------------------------------------------
Hit::Hit( const Position * aPos, unsigned cPos ) : _cPos(cPos) { _aPos = new Position[cPos];
memcpy( _aPos, aPos, sizeof(Position) * cPos ); }
Hit::~Hit() { delete[] _aPos; }
//+-------------------------------------------------------------------------
//
// Member: HitIter::GetPositionCount, public
//
// Synopsis: return number of positions or zero
//
//--------------------------------------------------------------------------
int HitIter::GetPositionCount() const { if (_iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit]) return _pDoc->_aHit[_iHit]->GetPositionCount();
return 0; }
//+-------------------------------------------------------------------------
//
// Member: HitIter::GetPosition, public
//
// Synopsis: return position by value
//
//--------------------------------------------------------------------------
Position HitIter::GetPosition ( int i ) const { if ( _iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit] ) return _pDoc->_aHit[_iHit]->GetPos(i); else { Position pos; return( pos ); } }
//+-------------------------------------------------------------------------
//
// Member: CDocument::CDocument, public constructor
//
// Arguments: [filename] - the name of the file to hit highlight
// [rank] - the rank of document in the hierarchy - NOT USED
// [rSearch] - ISearch object
// [cmsReadTimeout] - timeout for the initial file read
// [lockSingleThreadedFilter] - lock used for all single
// threaded filters
// [propertyList] - properties to be emitted
// [ulDisplayScript] - setting for displaying scripts
//
// Synopsis: Stream the file in chunk by chunk, scan it for hits,
// and record those positions in the stream matching the restricition.
//
//--------------------------------------------------------------------------
CDocument::CDocument( WCHAR * filename, ULONG rank, ISearchQueryHits & rSearch, DWORD cmsReadTimeout, CReleasableLock & lockSingleThreadedFilter, CEmptyPropertyList & propertyList, ULONG ulDisplayScript ) : _filename( filename ), _rank( rank ), _bufEnd( 0 ), _iChunkHint( 0 ), _cHit( 0 ), _rSearch( rSearch ), _cmsReadTimeout( cmsReadTimeout ), _lockSingleThreadedFilter( lockSingleThreadedFilter ) { BOOL noHits = FALSE;
//
// cut away anything after the non-drive colon
// like in c:\wzmail\foo.fld:12.wzm
//
WCHAR* pChar = _filename; if ( _filename[1] == L':') pChar += 2; while (*pChar != 0 && *pChar != L':') pChar++; if(*pChar == L':') *pChar = 0;
//
// allocate a buffer to hold the file
//
AllocBuffer();
//
// attach to IFilter
//
BOOL fKnownFilter = BindToFilter();
// Check if this file's extension has a script mapping (if necessary)
BOOL fHasScriptMap = FALSE;
if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) || ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) && ( !fKnownFilter ) ) ) { WCHAR *pwcExt = wcsrchr( _filename, L'.' ); webDebugOut(( DEB_ITRACE, "extension: '%ws'\n", pwcExt ));
if ( 0 != pwcExt ) { //
// .asp files include .inc files. .inc files don't have a script
// map but they contain script. I'm not aware of a good way to
// enumerate all possible include file extensions for asp.
//
if ( !_wcsicmp( pwcExt, L".inc" ) ) fHasScriptMap = TRUE; else { //
// Must be system to read the metabase
//
CImpersonateSystem system; CMetaDataMgr mdMgr( TRUE, W3VRoot ); fHasScriptMap = mdMgr.ExtensionHasScriptMap( pwcExt ); } } }
webDebugOut(( DEB_ITRACE, "fHasScriptMap %d, fKnownFilter %d, ulDisplayScript %d\n", fHasScriptMap, fKnownFilter, ulDisplayScript ));
if ( fHasScriptMap ) { if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) || ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) && ( !fKnownFilter ) ) ) { THROW( CException( MSG_WEBHITS_PATH_INVALID ) ); } }
//
// Initialize IFilter. Pass the list of properties to be emitted, since
// some other properties may have sensitive information (eg passwords in
// vbscript code in .asp files).
//
// First count how many properties exist.
ULONG cProps = propertyList.GetCount(); // Copy the properties
CDbColumns aSpecs( cProps ); CDbColId prop; for ( unsigned iProp = 0; iProp < cProps; iProp++ ) aSpecs.Add( prop, iProp );
typedef CPropEntry * PCPropEntry; XArray<PCPropEntry> xapPropEntries(cProps);
SCODE sc = propertyList.GetAllEntries(xapPropEntries.GetPointer(), cProps); Win4Assert(S_OK == sc);
if (FAILED (sc)) THROW (CException(sc));
PCPropEntry *apPropEntries = xapPropEntries.GetPointer(); for (ULONG i = 0; i < cProps; i++) { CDbColId * pcol = (CDbColId *) &aSpecs.Get( i );
*pcol = apPropEntries[i]->PropSpec(); if ( !pcol->IsValid()) THROW (CException(E_OUTOFMEMORY)); }
webDebugOut(( DEB_ITRACE, "%d properties being processed\n", cProps ));
ULONG ulFlags; sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS | IFILTER_INIT_CANON_HYPHENS | IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, cProps, (FULLPROPSPEC *) aSpecs.GetColumnsArray(), &ulFlags );
if (FAILED (sc)) THROW (CException(sc));
//
// pull the contents of the file into the buffer
//
ReadFile();
// Some broken filters don't work right if you Init() them twice, so
// throw away the IFilter, and get it again.
_xFilter.Free(); BindToFilter();
sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS | IFILTER_INIT_CANON_HYPHENS | IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, cProps, (FULLPROPSPEC *) aSpecs.GetColumnsArray(), &ulFlags ); if (FAILED (sc)) THROW (CException(sc));
//
// attach to ISearchQueryHits, which will find the hits
//
sc = _rSearch.Init( _xFilter.GetPointer(), ulFlags );
if (FAILED (sc)) { if ( QUERY_E_INVALIDRESTRICTION != sc ) THROW (CException(sc));
// we can still show the file
noHits = TRUE; }
//
// pull up all the hits
//
TRY { if (!noHits) { ULONG count; FILTERREGION* aRegion; SCODE sc = _rSearch.NextHitOffset( &count, &aRegion ); while ( S_OK == sc ) { XCoMem<FILTERREGION> xRegion( aRegion );
webDebugOut(( DEB_ITRACE, "CDOCUMENT: next hit: count %d, chunk %d offset %d, ext %d\n", count, aRegion[0].idChunk, aRegion[0].cwcStart, aRegion[0].cwcExtent )); CDynArrayInPlace<Position> aPos( count ); //
// get the positions in the hit
//
for (unsigned i = 0; i < count; i++) { aPos[i] = RegionToPos( aRegion [i] ); webDebugOut(( DEB_ITRACE, " region %d, start %d, length %d\n", i, aPos[i].GetBegOffset(), aPos[i].GetLength() )); } xRegion.Free();
XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );
_aHit[_cHit] = xHit.GetPointer(); _cHit++;
xHit.Acquire(); sc = _rSearch.NextHitOffset( &count, &aRegion ); }
if ( FAILED( sc ) ) THROW( CException( sc ) ); } } CATCH( CException, e ) { FreeHits(); RETHROW(); } END_CATCH;
// done with the filter
_xFilter.Free();
if ( _lockSingleThreadedFilter.IsHeld() ) _lockSingleThreadedFilter.Release(); } //CDocument
//+-------------------------------------------------------------------------
//
// Member: CDocument::~CDocument, public
//
// Synopsis: Free CDocument
//
//--------------------------------------------------------------------------
CDocument::~CDocument() { FreeHits(); } //~CDocument
//+-------------------------------------------------------------------------
//
// Member: CDocument::Free, public
//
// Synopsis: Free CDocument storage
//
//--------------------------------------------------------------------------
void CDocument::FreeHits() { //
// walk through _aHit, deleting each Positions array that the
// cells are pointing to
//
for ( unsigned i = 0; i < _cHit; i++ ) { delete _aHit[i]; _aHit[i] = 0; } _cHit = 0; } //Free
//+-------------------------------------------------------------------------
//
// Member: CDocument::RegionToPos, public
//
// Synopsis: Convert a FILTERREGION to a position
//
//--------------------------------------------------------------------------
Position CDocument::RegionToPos( FILTERREGION& region ) { //
// Use a linear search here. In profile runs this has never shown
// up as a problem. Fix if this changes.
//
ULONG offset = ULONG (-1);
//
// check whether we're not trying to access an illegal chunk
//
if (_iChunkHint >= _chunkCount || _chunk[_iChunkHint].ChunkId() != region.idChunk ) { _iChunkHint = 0;
while ( _iChunkHint < _chunkCount && _chunk[_iChunkHint].ChunkId() < region.idChunk ) { _iChunkHint++; }
if (_iChunkHint >= _chunkCount || _chunk[_iChunkHint].ChunkId() != region.idChunk) { return Position(); } }
//
// _iChunkHint now contains the index of the appropriate chunk in the
// chunk array
//
Win4Assert ( _iChunkHint < _chunkCount ); Win4Assert ( _chunk[_iChunkHint].ChunkId() == region.idChunk );
//
// offset now stores the linear offset of the position from the
// beginning of the stream/buffer
//
offset = _chunk[_iChunkHint].Offset() + region.cwcStart;
return Position (offset,region.cwcExtent ); } //RegionToPos
//+-------------------------------------------------------------------------
//
// Member: CDocument::AllocBuffer, public
//
// Synopsis: Allocate buffer for file text
//
//--------------------------------------------------------------------------
void CDocument::AllocBuffer() { HANDLE hFile = CreateFile( _filename, GENERIC_READ, FILE_SHARE_READ, 0, // security
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0 ); // template
if ( INVALID_HANDLE_VALUE == hFile ) THROW( CException() );
ULONG cbBuf = GetFileSize( hFile, 0 ); CloseHandle( hFile );
// Allow extra room for custom properties to be emitted from the
// filter, plus the conversion to unicode
_xBuffer.Init( cbBuf + cbBuf / 2 ); } //AllocBuffer
//+-------------------------------------------------------------------------
//
// Member: CDocument::BindToFilter, public
//
// Synopsis: Bind to appropriate filter for the CDocument
//
// Returns: TRUE if an appropriate filter was found
// FALSE if defaulted to the text filter
//
//--------------------------------------------------------------------------
BOOL CDocument::BindToFilter() { //
// Bind to the filter interface -- try free threaded first. If the
// filter isn't thread-safe, grab the lock and get the filter.
//
SCODE sc = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), FALSE );
// Is the filter not thread safe? If so, get the lock to protect
// the filter. No checking is done to see that this particular
// filter is in use -- just that some non-thread-safe filter is in use.
if ( S_FALSE == sc ) { // If the lock isn't held yet, get it (BindToFilter is called
// twice by CDocument's constructor, so check IsHeld())
if ( !_lockSingleThreadedFilter.IsHeld() ) _lockSingleThreadedFilter.Request();
// retry to load the filter as single-threaded
sc = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), TRUE ); }
BOOL fFoundFilter = TRUE;
if ( FAILED(sc) ) { sc = LoadTextFilter( _filename, _xFilter.GetPPointer() ); if (FAILED(sc)) THROW (CException(sc));
fFoundFilter = FALSE; }
return fFoundFilter; } //BindToFilter
//+-------------------------------------------------------------------------
//
// Function: GetThreadTime
//
// Synopsis: Gets the current total cpu usage for the thread
//
//--------------------------------------------------------------------------
LONGLONG GetThreadTime() { FILETIME ftDummy1, ftDummy2; LONGLONG llUser, llKernel; Win4Assert( sizeof(LONGLONG) == sizeof(FILETIME) );
GetThreadTimes( GetCurrentThread(), &ftDummy1, // Creation time
&ftDummy2, // Exit time
(FILETIME *) &llUser, // user mode time
(FILETIME *) &llKernel ); // kernel mode tiem
return llKernel + llUser; } //GetThreadTime
//+-------------------------------------------------------------------------
//
// Member: CDocument::ReadFile, public
//
// Synopsis: Read file into buffer using the filter
//
//--------------------------------------------------------------------------
void CDocument::ReadFile() { // get the maximum cpu time in 100s of nano seconds.
LONGLONG llLimitCpuTime = _cmsReadTimeout * 1000 * 10000; llLimitCpuTime += GetThreadTime();
ULONG cwcSoFar = 0; int cChunk = 0; BOOL fSeenProp = FALSE; STAT_CHUNK statChunk; SCODE sc = _xFilter->GetChunk ( &statChunk );
//
// Take them into account at some point
// to test more complicated chunking
//
//
// keep getting chunks of the file, placing them in the buffer,
// and setting the chunk offset markers that will be used to
// interpolate the buffer
//
while ( SUCCEEDED(sc) || FILTER_E_LINK_UNAVAILABLE == sc || FILTER_E_EMBEDDING_UNAVAILABLE == sc || FILTER_E_NO_TEXT == sc ) {
//
// Eliminate all chunks with idChunkSource 0 right here - these
// cannot be hit highlighted.
// Also eliminate all CHUNK_VALUE chunks.
//
if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) && (0 != statChunk.idChunkSource) ) { //
// set markers
//
Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId() );
//
// If there was an end of sentence or paragraph or chapter, we
// should introduce an appropriate spacing character.
//
if ( statChunk.breakType != CHUNK_NO_BREAK && cwcSoFar < _xBuffer.Count() ) { switch (statChunk.breakType) { case CHUNK_EOW: case CHUNK_EOS: _xBuffer[cwcSoFar++] = L' '; // introduce a space character
break;
case CHUNK_EOP: case CHUNK_EOC: _xBuffer[cwcSoFar++] = UNICODE_PARAGRAPH_SEPARATOR; break; } }
//
// The Offset into the stream depends on whether this is an
// 'original' chunk or not
//
CCiPropSpec* pProp = (CCiPropSpec*) &statChunk.attribute;
webDebugOut(( DEB_ITRACE, "Chunk %d, Source %d, Contents %d, start %d, cwc %d\n", statChunk.idChunk, statChunk.idChunkSource, pProp->IsContents(), statChunk.cwcStartSource, statChunk.cwcLenSource ));
if ( (statChunk.idChunk == statChunk.idChunkSource) && pProp->IsContents() ) { _chunk[cChunk].SetChunkId( statChunk.idChunk ); _chunk[cChunk].SetOffset( cwcSoFar ); cChunk++; #if 0
} else if ( statChunk.idChunk != statChunk.idChunkSource ) { _chunk [cChunk].SetChunkId (statChunk.idChunk);
//
// we have to first find the offset of the source chunk
//
for (int i=cChunk-1;i>=0;i--) { if (_chunk[i].ChunkId() == statChunk.idChunkSource) { _chunk[cChunk].SetOffset(_chunk[i].Offset()+statChunk.cwcStartSource); break; } } cChunk++;
}
//
// if the chunk is a contents chunk and idChunkSrc = idChunk,
// then pull it in
//
if ( (statChunk.idChunk == statChunk.idChunkSource) && pProp->IsContents() ) { #endif
webDebugOut(( DEB_ITRACE, "CDOC: markers: chunk %d offset %d\n", _chunk[cChunk-1].ChunkId(), _chunk[cChunk-1].Offset() ));
//
// push the text into memory
//
do { ULONG cwcThis = _xBuffer.Count() - cwcSoFar; if ( 0 == cwcThis ) break;
sc = _xFilter->GetText( &cwcThis, _xBuffer.GetPointer() + cwcSoFar );
if (SUCCEEDED(sc)) { cwcSoFar += cwcThis; } } while (SUCCEEDED(sc)); } } // If SUCCEEDED( sc )
if ( GetThreadTime() > llLimitCpuTime ) { webDebugOut(( DEB_ERROR, "Webhits took too long. Timeout\n" )); THROW( CException( MSG_WEBHITS_TIMEOUT ) ); }
//
// next chunk, please
//
sc = _xFilter->GetChunk ( &statChunk ); }
_bufEnd = _xBuffer.GetPointer() + cwcSoFar; _chunkCount = cChunk; } //ReadFile
WCHAR* CDocument::GetWritablePointerToOffset( long offset ) { if (offset >= 0) { if (_xBuffer.GetPointer() + offset < _bufEnd) return _xBuffer.GetPointer() + offset; else return _bufEnd; } else { return _xBuffer.GetPointer(); } } //GetWritablePointerToOffset
//+-------------------------------------------------------------------------
//
// Member: CDocument::GetPointerToOffset, public
//
// Arguments: [offset] - the offset in the stream that we want a pointer to
//
// Synopsis: Return a constant pointer to a specific offset in the buffer
//
//--------------------------------------------------------------------------
const WCHAR* CDocument::GetPointerToOffset(long offset) { return (const WCHAR *) GetWritablePointerToOffset(offset); } //GetPointerToOffset
|