|
|
//+-------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1992 - 2000.
//
// File: document.cxx
//
// Contents: The Document part of the browser
//
//--------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
// TheSearch expands to the identifier pSearch, i.e. it names whatever
// variable called pSearch is in scope where it is used (the
// ISearchQueryHits parameter of Document::Init).
#define TheSearch pSearch
// Code point U+2029. ReadFile writes it into the text buffer at
// paragraph/chapter chunk breaks and EatPara treats it as a paragraph
// terminator.
const int UNICODE_PARAGRAPH_SEPARATOR=0x2029;
// Property set id that ReadFile compares against each chunk's attribute
// when selecting the PID_STG_CONTENTS chunks.
const GUID guidStorage = PSGUID_STORAGE;
//+-------------------------------------------------------------------------
//
//  Member:     Position::Compare, public
//
//  Synopsis:   Order two positions by paragraph, then by beginning offset
//
//  Arguments:  [pos] -- position to compare against
//
//  Returns:    The paragraph difference if the paragraphs differ,
//              otherwise the difference of the beginning offsets
//              (negative / zero / positive ordering value).
//
//--------------------------------------------------------------------------

int Position::Compare( const Position& pos ) const
{
    const int paraDelta = _para - pos.Para();

    // The offset only breaks ties inside the same paragraph.
    if ( paraDelta != 0 )
        return paraDelta;

    return _begOff - pos.BegOff();
}
//+-------------------------------------------------------------------------
//
//  Member:     Hit::Hit, public
//
//  Synopsis:   Build a hit that holds its own copy of a position array
//
//  Arguments:  [aPos] -- positions to copy
//              [cPos] -- number of entries in [aPos]
//
//--------------------------------------------------------------------------

Hit::Hit( const Position * aPos, unsigned cPos )
    : _cPos( cPos )
{
    // Allocate private storage, then copy the caller's positions bitwise.
    Position * const aCopy = new Position[ cPos ];
    memcpy( aCopy, aPos, cPos * sizeof( Position ) );
    _aPos = aCopy;
}
//+-------------------------------------------------------------------------
//
//  Member:     Hit::~Hit, public
//
//  Synopsis:   Release the position array owned by the hit
//
//--------------------------------------------------------------------------

Hit::~Hit()
{
    // _aPos is allocated with new Position[] in the constructor, so it
    // has to be released with the array form of delete.
    delete [] _aPos;
}
//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPositionCount, public
//
//  Synopsis:   Number of positions in the current hit
//
//  Returns:    Count() of the current hit, or zero when the iterator is
//              past the last hit or the hit slot is empty
//
//--------------------------------------------------------------------------

int HitIter::GetPositionCount() const
{
    // Guard first: nothing to count without a current hit.
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
        return 0;

    return _pDoc->_aHit[_iHit]->Count();
}
//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPosition, public
//
//  Synopsis:   Fetch one position of the current hit, by value
//
//  Arguments:  [i] -- index of the position within the current hit
//
//  Returns:    The requested position, or a default-constructed Position
//              when the iterator is past the last hit or the hit slot is
//              empty
//
//--------------------------------------------------------------------------

Position HitIter::GetPosition ( int i ) const
{
    // No current hit: hand back a default position.
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
    {
        Position posDefault;
        return posDefault;
    }

    return _pDoc->_aHit[_iHit]->GetPos( i );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Create an uninitialized document bound to a file name
//
//  Arguments:  [filename] -- path of the file; copied into the document
//              [rank]     -- rank stored with the document
//              [fDelete]  -- if TRUE, Free deletes the file from disk
//
//--------------------------------------------------------------------------

Document::Document(WCHAR const* filename, LONG rank, BOOL fDelete)
    : _filename(0),
      _rank (rank),
      _buffer(0),
      _bufLen(0),
      _bufEnd(0),
      _pFilter(0),
      _aParaOffset(0),
      _isInit(FALSE),
      _cHit(0),
      _aParaLine(0),
      _maxParaLen(0),
      _cPara(0),
      _chunkCount(0),
      _fDelete( fDelete )
{
    // Keep a private copy of the path, terminator included.
    const size_t cwcWithNull = wcslen( filename ) + 1;

    _filename = new WCHAR[ cwcWithNull ];
    memcpy( _filename, filename, cwcWithNull * sizeof(WCHAR) );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Create an empty, uninitialized document with no file name
//
//  Notes:      _rank is set to zero here so that every member has a
//              defined value; the other constructor takes it from its
//              caller.
//
//--------------------------------------------------------------------------

Document::Document()
    : _filename(0),
      _rank(0),
      _buffer(0),
      _bufLen(0),
      _bufEnd(0),
      _pFilter(0),
      _aParaOffset(0),
      _isInit(FALSE),
      _cHit(0),
      _aParaLine(0),
      _maxParaLen(0),
      _cPara(0),
      _chunkCount(0),
      _fDelete( FALSE )
{
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::~Document, public
//
//  Synopsis:   Destroy the document; all cleanup is delegated to Free
//
//--------------------------------------------------------------------------

Document::~Document()
{
    this->Free();
}
//+-------------------------------------------------------------------------
//
// Member: Document::Free, public
//
// Synopsis: Free document storage
//
//--------------------------------------------------------------------------
void Document::Free() { if ( 0 != _filename ) { if ( _fDelete ) DeleteFile( _filename );
delete [] _filename; }
if (!_isInit) return;
for ( unsigned i = 0; i < _cHit; i++ ) { delete _aHit[i]; _aHit[i] = 0; }
// _aHit is embedded
delete []_aParaOffset; _aParaOffset = 0;
if (_aParaLine) { for (int i = 0; i < _cPara; i++) { while (_aParaLine[i].next != 0) { ParaLine* p = _aParaLine[i].next; _aParaLine[i].next = _aParaLine[i].next->next; delete p; } } delete _aParaLine; }
delete _buffer;
_buffer = 0;
_bufEnd = 0; _cHit = 0;
_isInit = FALSE; } //Free
//+-------------------------------------------------------------------------
//
//  Member:     Document::Init, public
//
//  Synopsis:   Read-in file, fill array of hits
//
//  Arguments:  [pSearch] -- query-hits object; initialized here with the
//                           document's filter and then asked for every
//                           hit region (referenced through TheSearch)
//
//  Returns:    S_OK if the document ended up initialized, otherwise the
//              last status code recorded while initializing.
//
//  Notes:      Hits are only pulled when the read/parse phase succeeded.
//              If that phase threw, neither the search object nor the
//              paragraph table is set up, so asking for hits and mapping
//              them to positions would operate on invalid state.
//
//--------------------------------------------------------------------------

SCODE Document::Init(ISearchQueryHits *pSearch)
{
    BOOL noHits = FALSE;
    SCODE sc = S_OK;

    TRY
    {
        AllocBuffer( _filename );
        BindToFilter( _filename );

        ULONG ulFlags;
        sc = _pFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                             IFILTER_INIT_CANON_HYPHENS |
                             IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                             0, 0, &ulFlags );
        if (FAILED (sc))
            THROW (CException(sc));

        ReadFile();
        BreakParas();

        if (Paras() != 0)
        {
            BreakLines();

#if 0
            // some filters don't behave correctly if you just re-init them,
            // so release the filter and re-open it.
            _pFilter->Release();
            _pFilter = 0;
            BindToFilter();
#endif

            // Re-init the filter before handing it to the search object.
            // NOTE(review): the result of this second Init has never been
            // checked (it is overwritten by the next call); left as is.
            sc = _pFilter->Init ( IFILTER_INIT_CANON_PARAGRAPHS |
                                  IFILTER_INIT_CANON_HYPHENS |
                                  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                                  0, 0, &ulFlags );
            sc = TheSearch->Init( _pFilter, ulFlags );

            if (FAILED (sc))
            {
                if ( QUERY_E_ALLNOISE != sc )
                    THROW (CException(sc));

                // we can still show the file
                sc = S_OK;
                noHits = TRUE;
            }

            // SUCCESS
            _isInit = TRUE;
        }
    }
    CATCH ( CException, e )
    {
        _isInit = FALSE;
        sc = e.GetErrorCode();
    }
    END_CATCH;

    if ( _isInit )
    {
        if ( !noHits )
        {
            //
            // pull up all the hits
            //
            ULONG count;
            FILTERREGION* aRegion;

            // separate status so the value returned below is not disturbed
            SCODE scHit = TheSearch->NextHitOffset ( &count, &aRegion );

            while ( S_OK == scHit )
            {
                XCoMem<FILTERREGION> xRegion( aRegion );

                CDynArrayInPlace<Position> aPos( count );

                for (unsigned i = 0; i < count; i++)
                    aPos [i] = RegionToPos ( aRegion [i] );

                xRegion.Free();

                XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );

                // store first, then give up ownership in the smart pointer
                _aHit[_cHit] = xHit.Get();
                _cHit++;
                xHit.Acquire();

                scHit = TheSearch->NextHitOffset ( &count, &aRegion );
            }
        }
        else
        {
            // nothing to highlight; usable only if some text was read
            _cHit = 0;
            _isInit = (_bufEnd - _buffer) != 0;
        }
    }

    if ( _pFilter )
    {
        _pFilter->Release();
        _pFilter = 0;
    }

    return _isInit ? S_OK : sc;
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::RegionToPos
//
//  Synopsis:   Translate a filter region (chunk id + start + extent)
//              into a paragraph / offset / length position
//
//  Arguments:  [region] -- region to translate
//
//  Returns:    The position, or a default-constructed Position when the
//              chunk id is not in the chunk table or no paragraph
//              boundary lies beyond the computed offset.
//
//  Notes:      The chunk and paragraph hints are function-local statics:
//              they persist between calls so a lookup can resume where
//              the previous one stopped.
//
//--------------------------------------------------------------------------

Position Document::RegionToPos ( FILTERREGION& region )
{
    static int s_iPara = 0;
    static int s_iChunk = 0;
    static Position s_posNone;

    // translate region to offset into buffer

    if ( s_iChunk >= _chunkCount ||
         _chunk[s_iChunk].ChunkId() != region.idChunk )
    {
        // Hint is stale: rescan the chunk table from the start.
        for ( s_iChunk = 0;
              s_iChunk < _chunkCount &&
                  _chunk[s_iChunk].ChunkId() < region.idChunk;
              s_iChunk++ )
        {
        }

        if ( s_iChunk >= _chunkCount ||
             _chunk[s_iChunk].ChunkId() != region.idChunk )
        {
            return s_posNone;
        }
    }

    Win4Assert ( s_iChunk < _chunkCount );
    Win4Assert ( _chunk[s_iChunk].ChunkId() == region.idChunk );

    const ULONG offset = _chunk[s_iChunk].Offset() + region.cwcStart;

    // Restart the paragraph scan if the hint is unusable for this offset.
    if ( s_iPara >= _cPara || _aParaOffset[s_iPara] > offset )
        s_iPara = 0;

    Win4Assert ( _aParaOffset[s_iPara] <= offset );

    // _aParaOffset[_cPara] is valid!
    while ( s_iPara <= _cPara )
    {
        if ( _aParaOffset[s_iPara] > offset )
        {
            // First paragraph starting beyond the offset: the one
            // before it contains the offset.
            Win4Assert (s_iPara > 0);
            s_iPara--;
            return Position ( s_iPara,
                              offset - _aParaOffset[s_iPara],
                              region.cwcExtent );
        }

        s_iPara++;
    }

    return s_posNone;
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::AllocBuffer, public
//
//  Synopsis:   Allocate buffer for file text
//
//  Arguments:  [pwcPath] -- path of the file whose size determines the
//                           buffer size
//
//  Notes:      Throws CException if the file cannot be opened, if its
//              size cannot be queried, or if the size is so large that
//              the buffer size computation would overflow.
//
//--------------------------------------------------------------------------

void Document::AllocBuffer ( WCHAR const * pwcPath )
{
    //
    // We should keep allocating buffers on demand,
    // but for this simple demo we'll just get the
    // file size up front and do a single buffer
    // allocation of 2.25 the size (to accommodate
    // Unicode expansion). THIS IS JUST A DEMO!
    //

    HANDLE hFile = CreateFile ( pwcPath,
                                GENERIC_READ,
                                FILE_SHARE_READ,
                                0, // security
                                OPEN_EXISTING,
                                FILE_ATTRIBUTE_NORMAL,
                                0 ); // template

    if ( INVALID_HANDLE_VALUE == hFile )
        THROW( CException() );

    const ULONG cbFile = GetFileSize( hFile, 0 );

    // Pick up the error code before CloseHandle has a chance to change it.
    const ULONG ulErr = ( INVALID_FILE_SIZE == cbFile ) ? GetLastError() : 0;

    CloseHandle ( hFile );

    if ( INVALID_FILE_SIZE == cbFile && 0 != ulErr )
        THROW( CException( HRESULT_FROM_WIN32( ulErr ) ) );

    //
    // The buffer holds 2*cb + cb/4 + 2 characters. Refuse sizes for which
    // that count, or its size in bytes, would not fit in a ULONG; a
    // wrapped value would make the terminator store below land outside
    // the allocation.
    //
    const ULONG cbMax =
        (ULONG)( ( ULONG(-1) / sizeof(WCHAR) - 2 ) / 9 * 4 );

    if ( cbFile > cbMax )
        THROW( CException( E_OUTOFMEMORY ) );

    // Unicode from ASCII, twice and then some
    _bufLen = 2 * cbFile + cbFile / 4 + 1;

    // one extra slot keeps a terminator just past the usable space
    _buffer = new WCHAR [_bufLen + 1];
    _buffer[ _bufLen ] = 0;
}
// Signature of the "LoadTextFilter" entry point looked up in query.dll.
typedef HRESULT (__stdcall * PFnLoadTextFilter)( WCHAR const * pwcPath,
                                                 IFilter ** ppIFilter );

// Cached entry point; zero until the first successful lookup.
PFnLoadTextFilter g_pLoadTextFilter = 0;

//+-------------------------------------------------------------------------
//
//  Function:   MyLoadTextFilter
//
//  Synopsis:   Call query.dll's LoadTextFilter, resolving the entry
//              point on first use
//
//  Arguments:  [pwc]      -- path passed through to LoadTextFilter
//              [ppFilter] -- receives the filter
//
//  Returns:    The result of LoadTextFilter, or the last Win32 error as
//              an HRESULT if the entry point could not be found.
//
//--------------------------------------------------------------------------

SCODE MyLoadTextFilter( WCHAR const *pwc, IFilter **ppFilter )
{
    // Fast path: the entry point was resolved by an earlier call.
    if ( 0 != g_pLoadTextFilter )
        return g_pLoadTextFilter( pwc, ppFilter );

    g_pLoadTextFilter = (PFnLoadTextFilter)
        GetProcAddress( GetModuleHandle( L"query.dll" ), "LoadTextFilter" );

    if ( 0 == g_pLoadTextFilter )
        return HRESULT_FROM_WIN32( GetLastError() );

    return g_pLoadTextFilter( pwc, ppFilter );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::BindToFilter, public
//
//  Synopsis:   Bind _pFilter to a filter for the document
//
//  Arguments:  [pwcPath] -- path of the document
//
//  Notes:      Tries LoadIFilter first and falls back to
//              MyLoadTextFilter; throws CException with the fallback's
//              status if that fails as well.
//
//--------------------------------------------------------------------------

void Document::BindToFilter( WCHAR const * pwcPath )
{
    SCODE sc = LoadIFilter( pwcPath, 0, (void **)&_pFilter );

    if ( SUCCEEDED(sc) )
        return;

    // No filter bound through LoadIFilter: fall back to the text filter.
    sc = MyLoadTextFilter( pwcPath, &_pFilter );

    if ( FAILED(sc) )
        THROW( CException(sc) );
}
//+-------------------------------------------------------------------------
//
// Member: Document::ReadFile, public
//
// Synopsis: Read file into buffer using the filter
//
//--------------------------------------------------------------------------
void Document::ReadFile () { SCODE sc; ULONG lenSoFar = 0; int cChunk = 0; BOOL fSeenProp = FALSE;
STAT_CHUNK statChunk; sc = _pFilter->GetChunk ( &statChunk );
// what about all these glueing flags?
// Take them into account at some point
// to test more complicated chunking
while (SUCCEEDED(sc) || FILTER_E_LINK_UNAVAILABLE == sc || FILTER_E_EMBEDDING_UNAVAILABLE == sc ) {
if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) ) { // read the contents only
if ( statChunk.attribute.guidPropSet == guidStorage && statChunk.attribute.psProperty.ulKind == PRSPEC_PROPID && statChunk.attribute.psProperty.propid == PID_STG_CONTENTS ) { if ( statChunk.breakType != CHUNK_NO_BREAK ) { switch( statChunk.breakType ) { case CHUNK_EOW: case CHUNK_EOS: _buffer[lenSoFar++] = L' '; break; case CHUNK_EOP: case CHUNK_EOC: _buffer[lenSoFar++] = UNICODE_PARAGRAPH_SEPARATOR; break; } }
_chunk [cChunk].SetChunkId (statChunk.idChunk); Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId () ); _chunk [cChunk].SetOffset (lenSoFar); cChunk++;
do { ULONG lenThis = _bufLen - lenSoFar; if (lenThis == 0) break;
sc = _pFilter->GetText( &lenThis, _buffer+lenSoFar );
// The buffer may be filled with zeroes. Nice filter.
if ( SUCCEEDED(sc) && 0 != lenThis ) { lenThis = __min( lenThis, wcslen( _buffer + lenSoFar ) ); lenSoFar += lenThis; } } while (SUCCEEDED(sc)); } } // if SUCCEEDED( sc )
// next chunk, please
sc = _pFilter->GetChunk ( &statChunk ); }
_bufEnd = _buffer + lenSoFar;
Win4Assert( lenSoFar <= _bufLen );
_chunkCount = cChunk; }
//+-------------------------------------------------------------------------
//
// Member: Document::BreakParas, public
//
// Synopsis: Break document into paragraphs separated by line feeds
//
//--------------------------------------------------------------------------
#define PARAS 25
void Document::BreakParas() { int maxParas = PARAS; _aParaOffset = new unsigned [ maxParas ]; WCHAR * pCur = _buffer; _cPara = 0; _maxParaLen = 0;
do { if ( _cPara == maxParas ) { // grow array
unsigned * tmp = new unsigned [maxParas * 2]; for ( int n = 0; n < maxParas; n++ ) tmp[n] = _aParaOffset[n]; delete []_aParaOffset; _aParaOffset = tmp; maxParas *= 2; } _aParaOffset [_cPara] = (UINT)(pCur - _buffer);
pCur = EatPara(pCur);
_cPara++;
} while ( pCur < _bufEnd );
// store end of buffer offset as _aParaOffset[_cPara]
if ( _cPara == maxParas ) { // grow array
unsigned * tmp = new unsigned [maxParas + 1]; for ( int n = 0; n < maxParas; n++ ) tmp[n] = _aParaOffset[n]; delete []_aParaOffset; _aParaOffset = tmp; maxParas += 1; }
_aParaOffset [_cPara] = (UINT)(pCur - _buffer - 1); }
//+-------------------------------------------------------------------------
//
//  Member:     Document::EatPara, private
//
//  Synopsis:   Skip one paragraph and its terminator
//
//  Arguments:  [pCur] -- start of the paragraph
//
//  Returns:    Pointer just past the terminator. A carriage return
//              followed by a line feed is consumed as one terminator.
//              The pointer is advanced by one even when the scan stopped
//              at the end of the buffer.
//
//  Notes:      Updates _maxParaLen with the length of the paragraph
//              text (terminator excluded) if it is the longest so far.
//
//--------------------------------------------------------------------------

WCHAR * Document::EatPara( WCHAR * pCur )
{
    WCHAR * const pStart = pCur;

    // search for newline or null
    while ( pCur < _bufEnd )
    {
        const int ch = *pCur;

        if ( ch == L'\n' || ch == L'\r' || ch == L'\0' ||
             ch == UNICODE_PARAGRAPH_SEPARATOR )
        {
            break;
        }

        pCur++;
    }

    const int cwcText = (int)(pCur - pStart);

    // eat newline and/or carriage return
    pCur++;

    if ( pCur < _bufEnd && pCur[-1] == L'\r' && pCur[0] == L'\n' )
        pCur++;

    if ( cwcText > _maxParaLen )
        _maxParaLen = cwcText;

    return pCur;
}
//+-------------------------------------------------------------------------
//
//  Function:   BreakLine
//
//  Synopsis:   Decide how many characters of a text run go on one line
//
//  Arguments:  [buf]    -- text to break
//              [cwcBuf] -- number of characters in [buf]
//              [cwcMax] -- preferred maximum line length (must be > 0)
//
//  Returns:    [cwcBuf] if the text fits. Otherwise the length up to and
//              including the last blank or tab within the first [cwcMax]
//              characters; if there is none, the length up to (not
//              including) the next blank or tab at or after [cwcMax], or
//              [cwcBuf] if no such character is found.
//
//--------------------------------------------------------------------------

int BreakLine ( WCHAR* buf, int cwcBuf, int cwcMax )
{
    if (cwcBuf <= cwcMax)
        return cwcBuf;

    Win4Assert (cwcMax > 0);

    // look backwards for whitespace
    int cwcLine = cwcMax;

    for ( int ch = buf[cwcLine-1];
          ch != L' ' && ch != L'\t';
          ch = buf[cwcLine-1] )
    {
        if ( --cwcLine < 1 )
            break;
    }

    if (cwcLine == 0)
    {
        // a single word larger than screen width
        // try scanning forward
        cwcLine = cwcMax;

        for ( int ch = buf[cwcLine];
              ch != L' ' && ch != L'\t';
              ch = buf[cwcLine] )
        {
            if ( ++cwcLine == cwcBuf )
                break;
        }
    }

    return cwcLine;
}
const int MAX_LINE_LEN = 110;
void Document::BreakLines() { _aParaLine = new ParaLine [_cPara]; for (int i = 0; i < _cPara; i++) { int cwcLeft = _aParaOffset[i+1] - _aParaOffset[i];
if (cwcLeft < MAX_LINE_LEN) _aParaLine[i].offEnd = cwcLeft; else { ParaLine* pParaLine = &_aParaLine[i]; WCHAR* buf = _buffer + _aParaOffset[i]; int cwcOffset = 0;
for (;;) { int cwcLine = BreakLine ( buf + cwcOffset, cwcLeft, MAX_LINE_LEN ); cwcOffset += cwcLine; pParaLine->offEnd = cwcOffset; cwcLeft -= cwcLine; if (cwcLeft == 0) break; pParaLine->next = new ParaLine; pParaLine = pParaLine->next; }; } } }
//+-------------------------------------------------------------------------
//
//  Member:     Document::GetLine, public
//
//  Synopsis:   Copy text from paragraph to buffer
//
//  Arguments:  [nPara] -- paragraph number
//              [off]   -- offset within paragraph
//              [cwc]   -- in: max chars to copy / out: chars copied
//              [buf]   -- target buffer
//
//  Returns:    FALSE if [nPara] is not a valid paragraph number,
//              TRUE otherwise.
//
//  Notes:      The copy never runs past the end of the paragraph. If
//              [off] lies outside the paragraph, or [cwc] is negative on
//              entry, nothing is copied and [cwc] is set to zero.
//              The copied text is not null terminated.
//
//--------------------------------------------------------------------------

BOOL Document::GetLine(int nPara, int off, int& cwc, WCHAR* buf)
{
    Win4Assert (_buffer != 0);

    if (nPara < 0 || nPara >= _cPara)
        return FALSE;

    // _aParaOffset [_cPara] is the offset of the end of buffer
    const int cwcPara = (int)( _aParaOffset[nPara+1] - _aParaOffset[nPara] );

    // A negative count must never reach memcpy, which would treat it
    // as a huge unsigned size.
    if ( off < 0 || off > cwcPara || cwc < 0 )
    {
        cwc = 0;
        return TRUE;
    }

    const WCHAR * pText = _buffer + _aParaOffset[nPara] + off;

    cwc = __min ( cwc, cwcPara - off );
    memcpy ( buf, pText, cwc * sizeof(WCHAR));
    return TRUE;
}
//+-------------------------------------------------------------------------
//
// Member: Document::GetWord, public
//
// Synopsis:
// Copy the string into buffer
//
//--------------------------------------------------------------------------
void Document::GetWord(int nPara, int offSrc, int cwcSrc, WCHAR* buf) { Win4Assert (_buffer != 0); Win4Assert ( nPara < _cPara );
WCHAR * p = _buffer + _aParaOffset[nPara];
Win4Assert ( p + offSrc + cwcSrc <= _bufEnd );
memcpy ( buf, p + offSrc, cwcSrc * sizeof(WCHAR)); }
|