Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

753 lines
19 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // Microsoft Windows
  4. // Copyright (C) Microsoft Corporation, 1992 - 2000.
  5. //
  6. // File: document.cxx
  7. //
  8. // Contents: The Document part of the browser
  9. //
  10. //--------------------------------------------------------------------------
  11. #include <pch.cxx>
  12. #pragma hdrstop
  // TheSearch is a textual alias for the identifier pSearch.  Document::Init
  // names its ISearchQueryHits parameter pSearch and refers to it as TheSearch.
  13. #define TheSearch pSearch
  // U+2029.  Document::ReadFile writes it into the text buffer at CHUNK_EOP /
  // CHUNK_EOC chunk breaks and Document::EatPara treats it as a paragraph
  // terminator.
  14. const int UNICODE_PARAGRAPH_SEPARATOR=0x2029;
  // Property set GUID that Document::ReadFile compares against each chunk's
  // attribute when selecting the chunks whose text is read.
  15. const GUID guidStorage = PSGUID_STORAGE;
  16. //+-------------------------------------------------------------------------
  17. //
  18. // Member: Position::Compare, public
  19. //
  20. // Synopsis: Compare two positions
  21. //
  22. //--------------------------------------------------------------------------
  23. int Position::Compare( const Position& pos ) const
  24. {
  25. int diff = _para - pos.Para();
  26. if ( diff == 0 )
  27. diff = _begOff - pos.BegOff();
  28. return diff;
  29. }
  30. //+-------------------------------------------------------------------------
  31. //
  32. // Member: Hit::Hit, public
  33. //
  34. // Synopsis: Create hit from an array of positions
  35. //
  36. //--------------------------------------------------------------------------
  37. Hit::Hit( const Position * aPos, unsigned cPos )
  38. : _cPos(cPos)
  39. {
  40. _aPos = new Position[cPos];
  41. memcpy( _aPos, aPos, sizeof(Position) * cPos );
  42. }
  43. Hit::~Hit()
  44. {
  45. delete _aPos;
  46. }
  47. //+-------------------------------------------------------------------------
  48. //
  49. // Member: HitIter::GetPositionCount, public
  50. //
  51. // Synopsis: return number of positions or zero
  52. //
  53. //--------------------------------------------------------------------------
  54. int HitIter::GetPositionCount() const
  55. {
  56. if (_iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit])
  57. return _pDoc->_aHit[_iHit]->Count();
  58. return 0;
  59. }
  60. //+-------------------------------------------------------------------------
  61. //
  62. // Member: HitIter::GetPosition, public
  63. //
  64. // Synopsis: return position by value
  65. //
  66. //--------------------------------------------------------------------------
  67. Position HitIter::GetPosition ( int i ) const
  68. {
  69. if ( _iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit] )
  70. return _pDoc->_aHit[_iHit]->GetPos(i);
  71. else
  72. {
  73. Position pos;
  74. return( pos );
  75. }
  76. }
  77. //+-------------------------------------------------------------------------
  78. //
  79. // Member: Document::Document, public
  80. //
  81. // Synopsis: Initialize document with filename
  82. //
  83. //--------------------------------------------------------------------------
  84. Document::Document(WCHAR const* filename, LONG rank, BOOL fDelete)
  85. : _filename(0),
  86. _rank (rank),
  87. _buffer(0),
  88. _bufLen(0),
  89. _bufEnd(0),
  90. _pFilter(0),
  91. _aParaOffset(0),
  92. _isInit(FALSE),
  93. _cHit(0),
  94. _aParaLine(0),
  95. _maxParaLen(0),
  96. _cPara(0),
  97. _chunkCount(0),
  98. _fDelete( fDelete )
  99. {
  100. _filename = new WCHAR[ wcslen( filename ) + 1 ];
  101. wcscpy( _filename, filename );
  102. }
  103. //+-------------------------------------------------------------------------
  104. //
  105. // Member: Document::Document, public
  106. //
  107. // Synopsis: Initialize document
  108. //
  109. //--------------------------------------------------------------------------
  110. Document::Document()
  111. : _filename(0),
  112. _buffer(0),
  113. _bufLen(0),
  114. _bufEnd(0),
  115. _pFilter(0),
  116. _aParaOffset(0),
  117. _isInit(FALSE),
  118. _cHit(0),
  119. _aParaLine(0),
  120. _maxParaLen(0),
  121. _cPara(0),
  122. _chunkCount(0),
  123. _fDelete( FALSE )
  124. {}
  125. //+-------------------------------------------------------------------------
  126. //
  127. // Member: Document::~Document, public
  128. //
  129. // Synopsis: Free document
  130. //
  131. //--------------------------------------------------------------------------
  132. Document::~Document()
  133. {
  134. Free();
  135. }
  136. //+-------------------------------------------------------------------------
  137. //
  138. // Member: Document::Free, public
  139. //
  140. // Synopsis: Free document storage
  141. //
  142. //--------------------------------------------------------------------------
  143. void Document::Free()
  144. {
  145. if ( 0 != _filename )
  146. {
  147. if ( _fDelete )
  148. DeleteFile( _filename );
  149. delete [] _filename;
  150. }
  151. if (!_isInit)
  152. return;
  153. for ( unsigned i = 0; i < _cHit; i++ )
  154. {
  155. delete _aHit[i];
  156. _aHit[i] = 0;
  157. }
  158. // _aHit is embedded
  159. delete []_aParaOffset;
  160. _aParaOffset = 0;
  161. if (_aParaLine)
  162. {
  163. for (int i = 0; i < _cPara; i++)
  164. {
  165. while (_aParaLine[i].next != 0)
  166. {
  167. ParaLine* p = _aParaLine[i].next;
  168. _aParaLine[i].next = _aParaLine[i].next->next;
  169. delete p;
  170. }
  171. }
  172. delete _aParaLine;
  173. }
  174. delete _buffer;
  175. _buffer = 0;
  176. _bufEnd = 0;
  177. _cHit = 0;
  178. _isInit = FALSE;
  179. } //Free
  180. //+-------------------------------------------------------------------------
  181. //
  182. // Member: Document::Init, public
  183. //
  184. // Synopsis: Read-in file, fill array of hits
  185. //
  186. //--------------------------------------------------------------------------
  187. SCODE Document::Init(ISearchQueryHits *pSearch)
  188. {
  189. BOOL noHits = FALSE;
  190. SCODE sc = S_OK;
  191. TRY
  192. {
  193. AllocBuffer( _filename );
  194. BindToFilter( _filename );
  195. ULONG ulFlags;
  196. sc = _pFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
  197. IFILTER_INIT_CANON_HYPHENS |
  198. IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
  199. 0, 0, &ulFlags );
  200. if (FAILED (sc))
  201. THROW (CException(sc));
  202. ReadFile();
  203. BreakParas();
  204. if (Paras() != 0)
  205. {
  206. BreakLines();
  207. #if 0
  208. // some filters don't behave correctly if you just re-init them,
  209. // so release the filter and re-open it.
  210. _pFilter->Release();
  211. _pFilter = 0;
  212. BindToFilter();
  213. #endif
  214. sc = _pFilter->Init ( IFILTER_INIT_CANON_PARAGRAPHS |
  215. IFILTER_INIT_CANON_HYPHENS |
  216. IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
  217. 0, 0, &ulFlags );
  218. sc = TheSearch->Init( _pFilter, ulFlags );
  219. if (FAILED (sc))
  220. {
  221. if ( QUERY_E_ALLNOISE != sc )
  222. THROW (CException(sc));
  223. // we can still show the file
  224. sc = S_OK;
  225. noHits = TRUE;
  226. }
  227. // SUCCESS
  228. _isInit = TRUE;
  229. }
  230. }
  231. CATCH ( CException, e )
  232. {
  233. _isInit = FALSE;
  234. sc = e.GetErrorCode();
  235. }
  236. END_CATCH;
  237. if (!noHits)
  238. {
  239. //
  240. // pull up all the hits
  241. //
  242. ULONG count;
  243. FILTERREGION* aRegion;
  244. SCODE sc = TheSearch->NextHitOffset ( &count, &aRegion );
  245. while (sc == S_OK)
  246. {
  247. XCoMem<FILTERREGION> xRegion( aRegion );
  248. CDynArrayInPlace<Position> aPos( count );
  249. for (unsigned i = 0; i < count; i++)
  250. aPos [i] = RegionToPos ( aRegion [i] );
  251. xRegion.Free();
  252. XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );
  253. _aHit[_cHit] = xHit.Get();
  254. _cHit++;
  255. xHit.Acquire();
  256. sc = TheSearch->NextHitOffset ( &count, &aRegion );
  257. }
  258. }
  259. else
  260. {
  261. _cHit = 0;
  262. _isInit = (_bufEnd - _buffer) != 0;
  263. }
  264. if ( _pFilter )
  265. {
  266. _pFilter->Release();
  267. _pFilter = 0;
  268. }
  269. return _isInit ? S_OK : sc;
  270. }
  271. Position Document::RegionToPos ( FILTERREGION& region )
  272. {
  273. static int paraHint = 0;
  274. static int iChunkHint = 0;
  275. static Position posNull;
  276. ULONG offset = ULONG (-1);
  277. // translate region to offset into buffer
  278. if (iChunkHint >= _chunkCount || _chunk[iChunkHint].ChunkId() != region.idChunk )
  279. {
  280. iChunkHint = 0;
  281. while ( iChunkHint < _chunkCount && _chunk[iChunkHint].ChunkId() < region.idChunk )
  282. {
  283. iChunkHint++;
  284. }
  285. if (iChunkHint >= _chunkCount || _chunk[iChunkHint].ChunkId() != region.idChunk)
  286. return posNull;
  287. }
  288. Win4Assert ( iChunkHint < _chunkCount );
  289. Win4Assert ( _chunk[iChunkHint].ChunkId() == region.idChunk );
  290. offset = _chunk[iChunkHint].Offset() + region.cwcStart;
  291. if (paraHint >= _cPara || _aParaOffset[paraHint] > offset )
  292. paraHint = 0;
  293. Win4Assert ( _aParaOffset[paraHint] <= offset );
  294. for ( ; paraHint <= _cPara; paraHint++)
  295. {
  296. // _aParaOffset[_cPara] is valid!
  297. if (_aParaOffset[paraHint] > offset)
  298. {
  299. Win4Assert (paraHint > 0);
  300. paraHint--;
  301. return Position ( paraHint,
  302. offset - _aParaOffset[paraHint],
  303. region.cwcExtent );
  304. }
  305. }
  306. return posNull;
  307. }
  308. //+-------------------------------------------------------------------------
  309. //
  310. // Member: Document::AllocBuffer, public
  311. //
  312. // Synopsis: Allocate buffer for file text
  313. //
  314. //--------------------------------------------------------------------------
  315. void Document::AllocBuffer ( WCHAR const * pwcPath )
  316. {
  317. //
  318. // We should keep allocating buffers on demand,
  319. // but for this simple demo we'll just get the
  320. // file size up front and do a single buffer
  321. // allocation of 2.25 the size (to accommodate
  322. // Unicode expansion). THIS IS JUST A DEMO!
  323. //
  324. HANDLE hFile = CreateFile ( pwcPath,
  325. GENERIC_READ,
  326. FILE_SHARE_READ,
  327. 0, // security
  328. OPEN_EXISTING,
  329. FILE_ATTRIBUTE_NORMAL,
  330. 0 ); // template
  331. if ( INVALID_HANDLE_VALUE == hFile )
  332. THROW( CException() );
  333. _bufLen = GetFileSize(hFile, 0 );
  334. CloseHandle ( hFile );
  335. // Unicode from ASCII, twice and then some
  336. _bufLen = 2 * _bufLen + _bufLen / 4 + 1;
  337. _buffer = new WCHAR [_bufLen + 1];
  338. _buffer[ _bufLen ] = 0;
  339. }
  340. typedef HRESULT (__stdcall * PFnLoadTextFilter)( WCHAR const * pwcPath,
  341. IFilter ** ppIFilter );
  342. PFnLoadTextFilter g_pLoadTextFilter = 0;
  343. SCODE MyLoadTextFilter( WCHAR const *pwc, IFilter **ppFilter )
  344. {
  345. if ( 0 == g_pLoadTextFilter )
  346. {
  347. g_pLoadTextFilter = (PFnLoadTextFilter) GetProcAddress( GetModuleHandle( L"query.dll" ), "LoadTextFilter" );
  348. if ( 0 == g_pLoadTextFilter )
  349. return HRESULT_FROM_WIN32( GetLastError() );
  350. }
  351. return g_pLoadTextFilter( pwc, ppFilter );
  352. }
  353. //+-------------------------------------------------------------------------
  354. //
  355. // Member: Document::BindToFilter, public
  356. //
  357. // Synopsis: Bind to appropriate filter for the document
  358. //
  359. //--------------------------------------------------------------------------
  360. void Document::BindToFilter( WCHAR const * pwcPath )
  361. {
  362. //
  363. // Bind to the filter interface
  364. //
  365. SCODE sc = LoadIFilter( pwcPath, 0, (void **)&_pFilter );
  366. if ( FAILED(sc) )
  367. {
  368. sc = MyLoadTextFilter( pwcPath, &_pFilter );
  369. if ( FAILED(sc) )
  370. THROW( CException(sc) );
  371. }
  372. }
  373. //+-------------------------------------------------------------------------
  374. //
  375. // Member: Document::ReadFile, public
  376. //
  377. // Synopsis: Read file into buffer using the filter
  378. //
  379. //--------------------------------------------------------------------------
  380. void Document::ReadFile ()
  381. {
  382. SCODE sc;
  383. ULONG lenSoFar = 0;
  384. int cChunk = 0;
  385. BOOL fSeenProp = FALSE;
  386. STAT_CHUNK statChunk;
  387. sc = _pFilter->GetChunk ( &statChunk );
  388. // what about all these glueing flags?
  389. // Take them into account at some point
  390. // to test more complicated chunking
  391. while (SUCCEEDED(sc)
  392. || FILTER_E_LINK_UNAVAILABLE == sc
  393. || FILTER_E_EMBEDDING_UNAVAILABLE == sc )
  394. {
  395. if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) )
  396. {
  397. // read the contents only
  398. if ( statChunk.attribute.guidPropSet == guidStorage &&
  399. statChunk.attribute.psProperty.ulKind == PRSPEC_PROPID &&
  400. statChunk.attribute.psProperty.propid == PID_STG_CONTENTS )
  401. {
  402. if ( statChunk.breakType != CHUNK_NO_BREAK )
  403. {
  404. switch( statChunk.breakType )
  405. {
  406. case CHUNK_EOW:
  407. case CHUNK_EOS:
  408. _buffer[lenSoFar++] = L' ';
  409. break;
  410. case CHUNK_EOP:
  411. case CHUNK_EOC:
  412. _buffer[lenSoFar++] = UNICODE_PARAGRAPH_SEPARATOR;
  413. break;
  414. }
  415. }
  416. _chunk [cChunk].SetChunkId (statChunk.idChunk);
  417. Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId () );
  418. _chunk [cChunk].SetOffset (lenSoFar);
  419. cChunk++;
  420. do
  421. {
  422. ULONG lenThis = _bufLen - lenSoFar;
  423. if (lenThis == 0)
  424. break;
  425. sc = _pFilter->GetText( &lenThis, _buffer+lenSoFar );
  426. // The buffer may be filled with zeroes. Nice filter.
  427. if ( SUCCEEDED(sc) && 0 != lenThis )
  428. {
  429. lenThis = __min( lenThis,
  430. wcslen( _buffer + lenSoFar ) );
  431. lenSoFar += lenThis;
  432. }
  433. }
  434. while (SUCCEEDED(sc));
  435. }
  436. } // if SUCCEEDED( sc )
  437. // next chunk, please
  438. sc = _pFilter->GetChunk ( &statChunk );
  439. }
  440. _bufEnd = _buffer + lenSoFar;
  441. Win4Assert( lenSoFar <= _bufLen );
  442. _chunkCount = cChunk;
  443. }
  444. //+-------------------------------------------------------------------------
  445. //
  446. // Member: Document::BreakParas, public
  447. //
  448. // Synopsis: Break document into paragraphs separated by line feeds
  449. //
  450. //--------------------------------------------------------------------------
  451. #define PARAS 25
  452. void Document::BreakParas()
  453. {
  454. int maxParas = PARAS;
  455. _aParaOffset = new unsigned [ maxParas ];
  456. WCHAR * pCur = _buffer;
  457. _cPara = 0;
  458. _maxParaLen = 0;
  459. do
  460. {
  461. if ( _cPara == maxParas )
  462. {
  463. // grow array
  464. unsigned * tmp = new unsigned [maxParas * 2];
  465. for ( int n = 0; n < maxParas; n++ )
  466. tmp[n] = _aParaOffset[n];
  467. delete []_aParaOffset;
  468. _aParaOffset = tmp;
  469. maxParas *= 2;
  470. }
  471. _aParaOffset [_cPara] = (UINT)(pCur - _buffer);
  472. pCur = EatPara(pCur);
  473. _cPara++;
  474. } while ( pCur < _bufEnd );
  475. // store end of buffer offset as _aParaOffset[_cPara]
  476. if ( _cPara == maxParas )
  477. {
  478. // grow array
  479. unsigned * tmp = new unsigned [maxParas + 1];
  480. for ( int n = 0; n < maxParas; n++ )
  481. tmp[n] = _aParaOffset[n];
  482. delete []_aParaOffset;
  483. _aParaOffset = tmp;
  484. maxParas += 1;
  485. }
  486. _aParaOffset [_cPara] = (UINT)(pCur - _buffer - 1);
  487. }
  488. //+-------------------------------------------------------------------------
  489. //
  490. // Member: Document::EatPara, private
  491. //
  492. // Synopsis: Skip till the line feed
  493. //
  494. //--------------------------------------------------------------------------
  495. WCHAR * Document::EatPara( WCHAR * pCur )
  496. {
  497. // search for newline or null
  498. int pos = 0;
  499. int c;
  500. while ( pCur < _bufEnd
  501. && (c = *pCur) != L'\n'
  502. && c != L'\r'
  503. && c != L'\0'
  504. && c != UNICODE_PARAGRAPH_SEPARATOR )
  505. {
  506. pos++;
  507. pCur++;
  508. }
  509. // eat newline and/or carriage return
  510. pCur++;
  511. if ( pCur < _bufEnd
  512. && *(pCur-1) == L'\r'
  513. && *pCur == L'\n' )
  514. pCur++;
  515. if ( pos > _maxParaLen )
  516. _maxParaLen = pos;
  517. return pCur;
  518. }
  519. int BreakLine ( WCHAR* buf, int cwcBuf, int cwcMax )
  520. {
  521. if (cwcBuf <= cwcMax)
  522. return cwcBuf;
  523. Win4Assert (cwcMax > 0);
  524. // look backwards for whitespace
  525. int len = cwcMax;
  526. int c = buf[len-1];
  527. while (c != L' ' && c != L'\t')
  528. {
  529. len--;
  530. if (len < 1)
  531. break;
  532. c = buf[len-1];
  533. }
  534. if (len == 0)
  535. {
  536. // a single word larger than screen width
  537. // try scanning forward
  538. len = cwcMax;
  539. c = buf[len];
  540. while (c != L' ' && c != L'\t')
  541. {
  542. len++;
  543. if (len == cwcBuf)
  544. break;
  545. c = buf[len];
  546. }
  547. }
  548. return len;
  549. }
  550. const int MAX_LINE_LEN = 110;
  551. void Document::BreakLines()
  552. {
  553. _aParaLine = new ParaLine [_cPara];
  554. for (int i = 0; i < _cPara; i++)
  555. {
  556. int cwcLeft = _aParaOffset[i+1] - _aParaOffset[i];
  557. if (cwcLeft < MAX_LINE_LEN)
  558. _aParaLine[i].offEnd = cwcLeft;
  559. else
  560. {
  561. ParaLine* pParaLine = &_aParaLine[i];
  562. WCHAR* buf = _buffer + _aParaOffset[i];
  563. int cwcOffset = 0;
  564. for (;;)
  565. {
  566. int cwcLine = BreakLine ( buf + cwcOffset, cwcLeft, MAX_LINE_LEN );
  567. cwcOffset += cwcLine;
  568. pParaLine->offEnd = cwcOffset;
  569. cwcLeft -= cwcLine;
  570. if (cwcLeft == 0)
  571. break;
  572. pParaLine->next = new ParaLine;
  573. pParaLine = pParaLine->next;
  574. };
  575. }
  576. }
  577. }
  578. //+-------------------------------------------------------------------------
  579. //
  580. // Member: Document::GetLine, public
  581. //
  582. // Arguments: [nPara] -- paragraph number
  583. // [off] -- offset within paragraph
  584. // [cwc] -- in/out chars to copy / copied
  585. // [buf] -- target buffer
  586. //
  587. // Synopsis: Copy text from paragraph to buffer
  588. //
  589. //--------------------------------------------------------------------------
  590. BOOL Document::GetLine(int nPara, int off, int& cwc, WCHAR* buf)
  591. {
  592. Win4Assert (_buffer != 0);
  593. if (nPara >= _cPara)
  594. return FALSE;
  595. const WCHAR * pText = _buffer + _aParaOffset[nPara] + off;
  596. // _aParaOffset [_cPara] is the offset of the end of buffer
  597. int cwcPara = _aParaOffset[nPara+1] - (_aParaOffset[nPara] + off);
  598. cwc = __min ( cwc, cwcPara );
  599. memcpy ( buf, pText, cwc * sizeof(WCHAR));
  600. return TRUE;
  601. }
  602. //+-------------------------------------------------------------------------
  603. //
  604. // Member: Document::GetWord, public
  605. //
  606. // Synopsis:
  607. // Copy the string into buffer
  608. //
  609. //--------------------------------------------------------------------------
  610. void Document::GetWord(int nPara, int offSrc, int cwcSrc, WCHAR* buf)
  611. {
  612. Win4Assert (_buffer != 0);
  613. Win4Assert ( nPara < _cPara );
  614. WCHAR * p = _buffer + _aParaOffset[nPara];
  615. Win4Assert ( p + offSrc + cwcSrc <= _bufEnd );
  616. memcpy ( buf, p + offSrc, cwcSrc * sizeof(WCHAR));
  617. }