windows-server-2003/inetsrv/query/apps/srch/brdoc.cxx

//+-------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1992 - 2000.
//
//  File:       brdoc.cxx
//
//  Contents:   The Document part of the browser
//
//--------------------------------------------------------------------------

#include <pch.cxx>
#pragma hdrstop

// Any use of TheSearch expands to pSearch; Document::Init uses it to reach
// its pSearch parameter.
#define TheSearch pSearch

// U+2029 (Unicode PARAGRAPH SEPARATOR).  ReadFile stores it in the text
// buffer at CHUNK_EOP / CHUNK_EOC chunk breaks and EatPara treats it as a
// paragraph terminator.
const int UNICODE_PARAGRAPH_SEPARATOR=0x2029;

// Copy of PSGUID_STORAGE; ReadFile compares each chunk's
// attribute.guidPropSet against it to pick out content chunks.
const GUID guidStorage = PSGUID_STORAGE;

//+-------------------------------------------------------------------------
//
//  Member:     Position::Compare, public
//
//  Synopsis:   Compare two positions
//
//--------------------------------------------------------------------------

int Position::Compare( const Position& pos ) const
{
   int diff = _para - pos.Para();
   if ( diff == 0 )
      diff = _begOff - pos.BegOff();
   return diff;
}

//+-------------------------------------------------------------------------
//
//  Member:     Hit::Hit, public
//
//  Synopsis:   Build a hit that owns a private copy of a position array
//
//  Arguments:  [aPos] -- positions to copy
//              [cPos] -- number of entries in aPos
//
//  Notes:      The positions are duplicated with a raw memcpy into an
//              array allocated with new [].
//
//--------------------------------------------------------------------------

Hit::Hit( const Position * aPos, unsigned cPos )
    : _cPos( cPos )
{
    Position * aCopy = new Position[cPos];

    memcpy( aCopy, aPos, cPos * sizeof(Position) );

    _aPos = aCopy;
}

//+-------------------------------------------------------------------------
//
//  Member:     Hit::~Hit, public
//
//  Synopsis:   Release the position array owned by the hit
//
//  Notes:      _aPos is allocated with new [] in the constructor, so it
//              has to be released with the array form of delete.
//
//--------------------------------------------------------------------------

Hit::~Hit()
{
    delete [] _aPos;
}

//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPositionCount, public
//
//  Synopsis:   Number of positions in the hit the iterator points at
//
//  Returns:    Count() of the current hit, or zero when the iterator is
//              past the last hit or the hit slot is empty.
//
//--------------------------------------------------------------------------

int HitIter::GetPositionCount() const
{
    // Nothing to report without a valid, populated hit slot.
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
        return 0;

    return _pDoc->_aHit[_iHit]->Count();
}

//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPosition, public
//
//  Synopsis:   Fetch one position of the current hit, by value
//
//  Arguments:  [i] -- index of the position inside the hit
//
//  Returns:    GetPos(i) of the current hit, or a default-constructed
//              Position when the iterator is past the last hit or the
//              hit slot is empty.
//
//--------------------------------------------------------------------------

Position HitIter::GetPosition ( int i ) const
{
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
    {
        // No hit to look into: hand back a default position.
        Position posDefault;
        return posDefault;
    }

    return _pDoc->_aHit[_iHit]->GetPos(i);
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Create an uninitialized document bound to a file name
//
//  Arguments:  [filename] -- path of the file; a private copy is kept
//              [rank]     -- rank stored with the document
//              [fDelete]  -- when TRUE, Free() deletes the file
//
//  Notes:      No file access happens here; Init() does the real work.
//
//--------------------------------------------------------------------------

Document::Document(WCHAR const* filename, LONG rank, BOOL fDelete)
    : _filename( 0 ),
      _rank( rank ),
      _buffer( 0 ),
      _bufLen( 0 ),
      _bufEnd( 0 ),
      _pFilter( 0 ),
      _aParaOffset( 0 ),
      _isInit( FALSE ),
      _cHit( 0 ),
      _aParaLine( 0 ),
      _maxParaLen( 0 ),
      _cPara( 0 ),
      _chunkCount( 0 ),
      _fDelete( fDelete )
{
    // Room for the characters plus the terminating null.
    const size_t cwcName = wcslen( filename ) + 1;

    _filename = new WCHAR[ cwcName ];
    memcpy( _filename, filename, cwcName * sizeof(WCHAR) );
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Create an empty document that is not bound to any file
//
//  Notes:      Every member is given a defined value, including _rank,
//              so that an unbound document never exposes an
//              indeterminate rank.
//
//--------------------------------------------------------------------------

Document::Document()
: _filename(0),
  _rank(0),
  _buffer(0),
  _bufLen(0),
  _bufEnd(0),
  _pFilter(0),
  _aParaOffset(0),
  _isInit(FALSE),
  _cHit(0),
  _aParaLine(0),
  _maxParaLen(0),
  _cPara(0),
  _chunkCount(0),
  _fDelete( FALSE )
{}

//+-------------------------------------------------------------------------
//
//  Member:     Document::~Document, public
//
//  Synopsis:   Destroy the document
//
//  Notes:      All cleanup is delegated to Free().
//
//--------------------------------------------------------------------------

Document::~Document()
{
    this->Free();
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::Free, public
//
//  Synopsis:   Free document storage
//
//--------------------------------------------------------------------------

void Document::Free()
{
    if ( 0 != _filename )
    {
        if ( _fDelete )
            DeleteFile( _filename );

        delete [] _filename;
    }

    if (!_isInit)
        return;

    for ( unsigned i = 0; i < _cHit; i++ )
    {
        delete _aHit[i];
        _aHit[i] = 0;
    }

    // _aHit is embedded

    delete []_aParaOffset;
    _aParaOffset = 0;

    if (_aParaLine)
    {
        for (int i = 0; i < _cPara; i++)
        {
            while (_aParaLine[i].next != 0)
            {
                ParaLine* p = _aParaLine[i].next;
                _aParaLine[i].next = _aParaLine[i].next->next;
                delete p;
            }
        }
        delete _aParaLine;
    }

    delete _buffer;

    _buffer = 0;

    _bufEnd = 0;
    _cHit = 0;

    _isInit = FALSE;
} //Free

//+-------------------------------------------------------------------------
//
//  Member:     Document::Init, public
//
//  Synopsis:   Read-in file, fill array of hits
//
//  Arguments:  [pSearch] -- hit finder; it is initialized with the
//                           document's filter and then asked for every
//                           hit region
//
//  Returns:    S_OK when the document ended up initialized, otherwise
//              the status code of the step that failed.
//
//  Notes:      Exceptions thrown while reading the file are caught and
//              turned into the returned status code.
//
//              Hits are only collected when the document was read AND
//              the search object was initialized successfully.  After a
//              failure neither the search object nor the paragraph table
//              is in a usable state.
//
//              QUERY_E_ALLNOISE from the search object is not an error:
//              the text can still be shown, just without hits.
//
//--------------------------------------------------------------------------

SCODE Document::Init(ISearchQueryHits *pSearch)
{
    BOOL noHits = FALSE;

    SCODE sc = S_OK;

    TRY
    {
        AllocBuffer( _filename );
        BindToFilter( _filename );

        ULONG ulFlags;
        sc = _pFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                             IFILTER_INIT_CANON_HYPHENS |
                             IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                             0, 0, &ulFlags );

        if (FAILED (sc))
            THROW (CException(sc));

        ReadFile();

        BreakParas();

        if (Paras() != 0)
        {
            BreakLines();

#if 0
            // some filters don't behave correctly if you just re-init them,
            // so release the filter and re-open it.

            _pFilter->Release();
            _pFilter = 0;
            BindToFilter();
#endif

            // Rewind the filter so the search object sees the text from
            // the start.  A failure here must not be hidden by the
            // result of the next call.

            sc = _pFilter->Init ( IFILTER_INIT_CANON_PARAGRAPHS |
                                  IFILTER_INIT_CANON_HYPHENS |
                                  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                                  0, 0, &ulFlags );

            if (FAILED (sc))
                THROW (CException(sc));

            sc = TheSearch->Init( _pFilter, ulFlags );

            if (FAILED (sc))
            {
                if ( QUERY_E_ALLNOISE != sc )
                    THROW (CException(sc));
                // we can still show the file

                sc = S_OK;
                noHits = TRUE;
            }

            // SUCCESS
            _isInit = TRUE;
        }
    }
    CATCH ( CException, e )
    {
        _isInit = FALSE;
        sc = e.GetErrorCode();
    }
    END_CATCH;

    if ( noHits )
    {
        // Nothing to highlight; the document is usable only when some
        // text was read.

        _cHit = 0;
        _isInit = (_bufEnd - _buffer) != 0;
    }
    else if ( _isInit )
    {
        //
        // pull up all the hits
        //
        // The loop status has its own variable so that the end-of-hits
        // code never replaces the status returned to the caller.
        //
        // NOTE(review): _aHit is indexed without a capacity check here;
        // confirm that its type grows on demand.
        //

        ULONG count;
        FILTERREGION* aRegion;
        SCODE scHit = TheSearch->NextHitOffset ( &count, &aRegion );

        while ( S_OK == scHit )
        {
            XCoMem<FILTERREGION> xRegion( aRegion );

            CDynArrayInPlace<Position> aPos( count );

            for (unsigned i = 0; i < count; i++)
                aPos [i] = RegionToPos ( aRegion [i] );

            xRegion.Free();

            XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );

            _aHit[_cHit] = xHit.Get();
            _cHit++;
            xHit.Acquire();

            scHit = TheSearch->NextHitOffset ( &count, &aRegion );
        }
    }

    // The filter is only needed while loading.

    if ( _pFilter )
    {
        _pFilter->Release();
        _pFilter = 0;
    }

    return _isInit ? S_OK : sc;
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::RegionToPos
//
//  Synopsis:   Translate a filter region into a paragraph position
//
//  Arguments:  [region] -- chunk id, start and extent reported for a hit
//
//  Returns:    Position (paragraph, offset in paragraph, length), or a
//              default-constructed Position when the chunk is unknown or
//              no paragraph boundary lies beyond the offset.
//
//  Notes:      The chunk and paragraph found by the previous call are
//              remembered in function-local statics and tried first.
//              Both hints are range-checked before use and the search
//              restarts from zero when they do not fit.
//
//--------------------------------------------------------------------------

Position Document::RegionToPos ( FILTERREGION& region )
{
    static int s_iPara = 0;
    static int s_iChunk = 0;
    static Position s_posNone;

    //
    // Step 1: find the chunk table entry for region.idChunk.
    //

    if ( s_iChunk >= _chunkCount ||
         _chunk[s_iChunk].ChunkId() != region.idChunk )
    {
        // The hint is stale: scan the table, which is ordered by chunk id.

        for ( s_iChunk = 0;
              s_iChunk < _chunkCount &&
                  _chunk[s_iChunk].ChunkId() < region.idChunk;
              s_iChunk++ )
        {
        }

        if ( s_iChunk >= _chunkCount ||
             _chunk[s_iChunk].ChunkId() != region.idChunk )
        {
            return s_posNone;
        }
    }

    Win4Assert ( s_iChunk < _chunkCount );
    Win4Assert ( _chunk[s_iChunk].ChunkId() == region.idChunk );

    //
    // Step 2: offset of the region inside the text buffer.
    //

    const ULONG offset = _chunk[s_iChunk].Offset() + region.cwcStart;

    //
    // Step 3: find the paragraph that contains the offset.
    //

    if ( s_iPara >= _cPara || _aParaOffset[s_iPara] > offset )
        s_iPara = 0;

    Win4Assert ( _aParaOffset[s_iPara] <= offset );

    // _aParaOffset[_cPara] is valid, hence the inclusive upper bound.

    while ( s_iPara <= _cPara )
    {
        if ( _aParaOffset[s_iPara] > offset )
        {
            // First paragraph starting beyond the offset: the one just
            // before it holds the region.

            Win4Assert ( s_iPara > 0 );
            s_iPara--;

            return Position ( s_iPara,
                              offset - _aParaOffset[s_iPara],
                              region.cwcExtent );
        }

        s_iPara++;
    }

    return s_posNone;
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::AllocBuffer, public
//
//  Synopsis:   Allocate buffer for file text
//
//  Arguments:  [pwcPath] -- path of the file whose size determines the
//                           size of the buffer
//
//  Notes:      Throws CException when the file cannot be opened, when
//              its size cannot be read, or when the file is too large
//              for the buffer size to be computed without overflow.
//
//              On return _bufLen holds the capacity in characters and
//              _buffer[_bufLen] holds a terminating null.
//
//--------------------------------------------------------------------------

void Document::AllocBuffer ( WCHAR const * pwcPath )
{
    //
    //  We should keep allocating buffers on demand,
    //  but for this simple demo we'll just get the
    //  file size up front and do a single buffer
    //  allocation of 2.25 the size (to accommodate
    //  Unicode expansion). THIS IS JUST A DEMO!
    //

    HANDLE hFile = CreateFile ( pwcPath,
                               GENERIC_READ,
                               FILE_SHARE_READ,
                               0, // security
                               OPEN_EXISTING,
                               FILE_ATTRIBUTE_NORMAL,
                               0 ); // template

    if ( INVALID_HANDLE_VALUE == hFile )
        THROW( CException() );

    // INVALID_FILE_SIZE is also a legal low dword, so the last error has
    // to be consulted as well.  Pick it up before CloseHandle can
    // replace it.

    DWORD cbHigh = 0;
    SetLastError( NO_ERROR );
    DWORD cbLow = GetFileSize( hFile, &cbHigh );
    DWORD dwErr = GetLastError();

    CloseHandle ( hFile );

    if ( INVALID_FILE_SIZE == cbLow && NO_ERROR != dwErr )
        THROW( CException( HRESULT_FROM_WIN32( dwErr ) ) );

    // Largest size for which neither the character count below nor the
    // byte count of the allocation can wrap around in 32 bits.

    const DWORD cbMaxFile = 0x7FFFFFFF / 3;

    if ( 0 != cbHigh || cbLow > cbMaxFile )
        THROW( CException( E_OUTOFMEMORY ) );

    // Unicode from ASCII, twice and then some

    _bufLen = 2 * cbLow + cbLow / 4 + 1;

    _buffer = new WCHAR [_bufLen + 1];
    _buffer[ _bufLen ] = 0;
}

// Signature of the LoadTextFilter entry point looked up in query.dll.
typedef HRESULT (__stdcall * PFnLoadTextFilter)( WCHAR const * pwcPath,
                                                 IFilter ** ppIFilter );

// Cached address of LoadTextFilter; 0 until the first successful lookup.
PFnLoadTextFilter g_pLoadTextFilter = 0;

//+-------------------------------------------------------------------------
//
//  Function:   MyLoadTextFilter
//
//  Synopsis:   Call LoadTextFilter from query.dll, resolving the entry
//              point on first use
//
//  Arguments:  [pwc]      -- path of the file to filter
//              [ppFilter] -- receives the filter
//
//  Returns:    The result of LoadTextFilter, or a failure code when
//              query.dll is not loaded in the process or does not export
//              the entry point.
//
//  Notes:      The lookup failure path never returns a success code,
//              even if the last error happens to be NO_ERROR.
//
//--------------------------------------------------------------------------

SCODE MyLoadTextFilter( WCHAR const *pwc, IFilter **ppFilter )
{
    if ( 0 == g_pLoadTextFilter )
    {
        PFnLoadTextFilter pfnLoad = 0;

        HMODULE hQuery = GetModuleHandle( L"query.dll" );

        if ( 0 != hQuery )
            pfnLoad = (PFnLoadTextFilter) GetProcAddress( hQuery, "LoadTextFilter" );

        if ( 0 == pfnLoad )
        {
            DWORD dwErr = GetLastError();

            return ( NO_ERROR == dwErr ) ? E_FAIL : HRESULT_FROM_WIN32( dwErr );
        }

        g_pLoadTextFilter = pfnLoad;
    }

    return g_pLoadTextFilter( pwc, ppFilter );
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::BindToFilter, public
//
//  Synopsis:   Obtain an IFilter for the document and store it in
//              _pFilter
//
//  Arguments:  [pwcPath] -- path of the file to filter
//
//  Notes:      LoadIFilter is tried first; when it fails the text filter
//              is used as a fallback.  Throws CException with the
//              fallback's status code when both attempts fail.
//
//--------------------------------------------------------------------------

void Document::BindToFilter( WCHAR const * pwcPath )
{
    // First choice: the filter LoadIFilter finds for this file.

    SCODE sc = LoadIFilter( pwcPath, 0, (void **)&_pFilter );

    if ( SUCCEEDED(sc) )
        return;

    // Fallback: the text filter.

    sc = MyLoadTextFilter( pwcPath, &_pFilter );

    if ( FAILED(sc) )
        THROW( CException(sc) );
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::ReadFile, public
//
//  Synopsis:   Read file into buffer using the filter
//
//--------------------------------------------------------------------------

void Document::ReadFile ()
{
    SCODE sc;
    ULONG lenSoFar = 0;
    int   cChunk = 0;
    BOOL  fSeenProp = FALSE;

    STAT_CHUNK statChunk;
    sc = _pFilter->GetChunk ( &statChunk );

    // what about all these glueing flags?
    // Take them into account at some point
    // to test more complicated chunking

    while (SUCCEEDED(sc)
          || FILTER_E_LINK_UNAVAILABLE == sc
          || FILTER_E_EMBEDDING_UNAVAILABLE == sc )
    {

        if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) )
        {
            // read the contents only

            if ( statChunk.attribute.guidPropSet == guidStorage &&
                 statChunk.attribute.psProperty.ulKind == PRSPEC_PROPID &&
                 statChunk.attribute.psProperty.propid == PID_STG_CONTENTS )
            {
                if ( statChunk.breakType != CHUNK_NO_BREAK )
                {
                    switch( statChunk.breakType )
                    {
                        case CHUNK_EOW:
                        case CHUNK_EOS:
                            _buffer[lenSoFar++] = L' ';
                            break;
                        case CHUNK_EOP:
                        case CHUNK_EOC:
                            _buffer[lenSoFar++] = UNICODE_PARAGRAPH_SEPARATOR;
                            break;
                    }
                }

                _chunk [cChunk].SetChunkId (statChunk.idChunk);
                Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId () );
                _chunk [cChunk].SetOffset (lenSoFar);
                cChunk++;

                do
                {
                    ULONG lenThis = _bufLen - lenSoFar;
                    if (lenThis == 0)
                        break;

                    sc = _pFilter->GetText( &lenThis, _buffer+lenSoFar );

                    // The buffer may be filled with zeroes.  Nice filter.

                    if ( SUCCEEDED(sc) && 0 != lenThis )
                    {
                        lenThis = __min( lenThis,
                                         wcslen( _buffer + lenSoFar ) );
                        lenSoFar += lenThis;
                    }
                }
                while (SUCCEEDED(sc));
            }
        } // if SUCCEEDED( sc )

        // next chunk, please
        sc = _pFilter->GetChunk ( &statChunk );
    }

    _bufEnd = _buffer + lenSoFar;

    Win4Assert( lenSoFar <= _bufLen );

    _chunkCount = cChunk;
}


//+-------------------------------------------------------------------------
//
//  Member:     Document::BreakParas, public
//
//  Synopsis:   Break document into paragraphs separated by line feeds
//
//--------------------------------------------------------------------------

#define PARAS 25

void Document::BreakParas()
{
    int maxParas = PARAS;
    _aParaOffset = new unsigned [ maxParas ];
    WCHAR * pCur = _buffer;
    _cPara = 0;
    _maxParaLen = 0;

    do
    {
        if ( _cPara == maxParas )
        {
            // grow array
            unsigned * tmp = new unsigned [maxParas * 2];
            for ( int n = 0; n < maxParas; n++ )
                tmp[n] = _aParaOffset[n];
            delete []_aParaOffset;
            _aParaOffset = tmp;
            maxParas *= 2;
        }
        _aParaOffset [_cPara] = (UINT)(pCur - _buffer);

        pCur = EatPara(pCur);

        _cPara++;

    } while ( pCur < _bufEnd );

    // store end of buffer offset as _aParaOffset[_cPara]

    if ( _cPara == maxParas )
    {
        // grow array
        unsigned * tmp = new unsigned [maxParas + 1];
        for ( int n = 0; n < maxParas; n++ )
            tmp[n] = _aParaOffset[n];
        delete []_aParaOffset;
        _aParaOffset = tmp;
        maxParas += 1;
    }

    _aParaOffset [_cPara] = (UINT)(pCur - _buffer - 1);
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::EatPara, private
//
//  Synopsis:   Skip over one paragraph
//
//  Arguments:  [pCur] -- first character of the paragraph
//
//  Returns:    Pointer just past the paragraph terminator.  A CR LF pair
//              counts as one terminator.  The result can be one past
//              _bufEnd when the text ends without a terminator.
//
//  Notes:      A paragraph ends at a line feed, a carriage return, a
//              null, a Unicode paragraph separator, or _bufEnd.
//              _maxParaLen is raised when this paragraph (terminator
//              excluded) is the longest seen so far.
//
//--------------------------------------------------------------------------

WCHAR * Document::EatPara( WCHAR * pCur )
{
    WCHAR * const pStart = pCur;

    // Advance to the terminator (or the end of the text).

    for ( ; pCur < _bufEnd; pCur++ )
    {
        const int wc = *pCur;

        if ( L'\n' == wc ||
             L'\r' == wc ||
             L'\0' == wc ||
             UNICODE_PARAGRAPH_SEPARATOR == wc )
        {
            break;
        }
    }

    const int cwcPara = (int)(pCur - pStart);
    WCHAR * const pStop = pCur;

    // Step over the terminator; swallow the LF of a CR LF pair as well.

    pCur++;

    if ( pCur < _bufEnd && L'\r' == *pStop && L'\n' == *pCur )
        pCur++;

    if ( cwcPara > _maxParaLen )
        _maxParaLen = cwcPara;

    return pCur;
}

//+-------------------------------------------------------------------------
//
//  Function:   BreakLine
//
//  Synopsis:   Decide how many characters go on the next display line
//
//  Arguments:  [buf]    -- text still to be laid out
//              [cwcBuf] -- number of characters in buf
//              [cwcMax] -- preferred maximum line length (must be > 0)
//
//  Returns:    cwcBuf when everything fits.  Otherwise the length up to
//              and including the last blank or tab within the first
//              cwcMax characters.  When there is none, the length up to
//              (but excluding) the first blank or tab at or after
//              cwcMax, or cwcBuf when no such character exists.
//
//--------------------------------------------------------------------------

int BreakLine ( WCHAR* buf, int cwcBuf, int cwcMax )
{
    if (cwcBuf <= cwcMax)
        return cwcBuf;

    Win4Assert (cwcMax > 0);

    int len;

    // Walk back from the limit to the nearest whitespace; the line then
    // ends right behind it.

    for ( len = cwcMax; len >= 1; len-- )
    {
        const int wc = buf[len-1];

        if ( L' ' == wc || L'\t' == wc )
            return len;
    }

    // A single word larger than the limit: let the line run on until
    // the word ends.

    for ( len = cwcMax; len < cwcBuf; len++ )
    {
        const int wc = buf[len];

        if ( L' ' == wc || L'\t' == wc )
            break;
    }

    return len;
}

const int MAX_LINE_LEN = 110;

void Document::BreakLines()
{
    _aParaLine = new ParaLine [_cPara];
    for (int i = 0; i < _cPara; i++)
    {
        int cwcLeft = _aParaOffset[i+1] - _aParaOffset[i];

        if (cwcLeft < MAX_LINE_LEN)
            _aParaLine[i].offEnd = cwcLeft;
        else
        {
            ParaLine* pParaLine = &_aParaLine[i];
            WCHAR* buf = _buffer + _aParaOffset[i];
            int cwcOffset = 0;

            for (;;)
            {
                int cwcLine = BreakLine ( buf + cwcOffset, cwcLeft, MAX_LINE_LEN );
                cwcOffset += cwcLine;
                pParaLine->offEnd = cwcOffset;
                cwcLeft -= cwcLine;
                if (cwcLeft == 0)
                    break;
                pParaLine->next = new ParaLine;
                pParaLine = pParaLine->next;
            };
        }
    }
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::GetLine, public
//
//  Arguments:  [nPara] -- paragraph number
//              [off] -- offset within paragraph
//              [cwc] -- in/out chars to copy / copied
//              [buf] -- target buffer
//
//  Synopsis:   Copy text from paragraph to buffer
//
//  Returns:    FALSE when nPara is not a valid paragraph number (cwc and
//              buf are left untouched), TRUE otherwise.
//
//  Notes:      At most the characters between off and the end of the
//              paragraph are copied.  When off lies outside of the
//              paragraph, or the caller asks for no characters, nothing
//              is copied and cwc is set to 0.  The count handed to
//              memcpy is therefore never negative.
//
//              The copied text is not null terminated.
//
//--------------------------------------------------------------------------


BOOL Document::GetLine(int nPara, int off, int& cwc, WCHAR* buf)
{
    Win4Assert (_buffer != 0);

    if ( nPara < 0 || nPara >= _cPara )
        return FALSE;

    // _aParaOffset [_cPara] is the offset of the end of buffer
    const int cwcPara = (int) ( _aParaOffset[nPara+1] - _aParaOffset[nPara] );

    if ( off < 0 || off >= cwcPara || cwc <= 0 )
    {
        cwc = 0;
        return TRUE;
    }

    cwc = __min ( cwc, cwcPara - off );

    memcpy ( buf, _buffer + _aParaOffset[nPara] + off, cwc * sizeof(WCHAR) );
    return TRUE;
}

//+-------------------------------------------------------------------------
//
//  Member:     Document::GetWord, public
//
//  Synopsis:
//  Copy the string into buffer
//
//--------------------------------------------------------------------------

void Document::GetWord(int nPara, int offSrc, int cwcSrc, WCHAR* buf)
{
    Win4Assert (_buffer != 0);
    Win4Assert ( nPara < _cPara );

    WCHAR * p = _buffer + _aParaOffset[nPara];

    Win4Assert ( p + offSrc + cwcSrc <= _bufEnd );

    memcpy ( buf, p + offSrc, cwcSrc * sizeof(WCHAR));
}