Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2058 lines
66 KiB

  1. #define VER3
  2. /*************************************************************************
  3. * *
  4. * INDEX.C *
  5. * *
  6. * Copyright (C) Microsoft Corporation 1990-1994 *
  7. * All Rights reserved. *
  8. * *
  9. **************************************************************************
  10. * *
  11. * Module Intent *
  12. * This is the second stage of the index building process. After all *
  13. * of the words have been added in stage 1, IndexBuild will be called. *
  14. * IndexBuild starts the second stage. We will merge-sort the temp file *
  15. * generated in phase 1 to create a second temp file to send to phase 3. *
  16. * *
  17. **************************************************************************
  18. * *
  19. * Current Owner: BinhN *
  20. * *
  21. **************************************************************************/
  22. #include <mvopsys.h>
  23. #include <mem.h>
  24. #include <memory.h>
  25. #include <io.h>
  26. #include <math.h>
  27. #include <mvsearch.h>
  28. #include <orkin.h>
  29. #include "common.h"
  30. #include "index.h"
  31. #ifdef _DEBUG
  32. static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
  33. #endif
  // ESOUTPUT_BUFFER is the byte size of the merge output buffer allocated in
  // ProcessFiles and filled by ESMemory2Disk; two values are provided depending
  // on whether _32BIT is defined.
  34. #ifndef _32BIT
  35. #define ESOUTPUT_BUFFER 0xFFFC // Size of output file buffer
  36. // This must be at least the size of the largest word + 12,
  37. // or word + 14 if OCCF_LENGTH is set
  38. #else
  39. #define ESOUTPUT_BUFFER 0xFFFFC // Size of output file buffer
  40. // This must be at least the size of the largest word + 12,
  41. // or word + 14 if OCCF_LENGTH is set
  42. #endif
  // Bit flags for the 'flag' argument of ESMemory2Disk.
  43. #define FLUSH_NEW_RECORD 1 // Word is complete: flush every topic, final topic count known
  44. #define FLUSH_EXCEPT_LAST 2 // Partial flush: keep the last topic node in memory
  45. /*************************************************************************
  46. *
  47. * INTERNAL PRIVATE FUNCTIONS
  48. *
  49. * All of them should be declared near
  50. *
  51. *************************************************************************/
  // Buffer I/O for the external sort: read a run into an input block, write
  // the output buffer to the temp file, and top up a partly consumed block.
  52. PRIVATE HRESULT NEAR PASCAL FillInputBuffer (LPESB, HFPB);
  53. PRIVATE HRESULT NEAR PASCAL ESFlushBuffer (LPESI);
  54. PRIVATE HRESULT NEAR PASCAL ESFillBuffer (_LPIPB, LPESB);
  // Record emission and the main merge loop.
  55. PRIVATE HRESULT NEAR PASCAL ESMemory2Disk (_LPIPB, PMERGEHEADER, int);
  56. PRIVATE HRESULT NEAR PASCAL ProcessFiles (_LPIPB lpipb, LPMERGEPARAMS);
  // Priority-queue helpers used while merging. Their definitions are not in
  // this part of the file; NOTE(review): presumably a binary heap of input
  // blocks ordered by CompareRecordBuffers -- confirm against the definitions.
  57. PRIVATE int NEAR PASCAL CompareRecordBuffers (_LPIPB, LPB, LPB);
  58. PRIVATE VOID NEAR PASCAL PQueueUp (_LPIPB, LPESB FAR *, LONG);
  59. PRIVATE VOID NEAR PASCAL PQueueDown (_LPIPB);
  // Topic/occurrence list merging and block-pool allocation (defined elsewhere
  // in the file).
  60. PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER, PTOPICDATA, int);
  61. PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA, PTOPICDATA, int);
  62. PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO lpBlockCombo);
  63. PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi);
  64. PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize);
  // Persistence of external-sort state when KEEP_TEMP_FILE is requested
  // (SaveEsiTemp / UpdateEsiTemp are called from MVIndexBuild).
  65. PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
  66. LPB lpbIsiFile, PHRESULT phr);
  67. PRIVATE VOID PASCAL NEAR SaveEsiTemp (LPIPB lpipb, LPESI lpesi);
  68. PRIVATE VOID PASCAL NEAR UpdateEsiTemp (LPIPB lpipb);
  // Membership test against the to-delete topic list passed to ProcessFiles.
  69. PRIVATE BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId);
  70. /*************************************************************************
  71. *
  72. * INTERNAL PUBLIC FUNCTIONS
  73. *
  74. * All of them should be declared far, unless we know they belong to
  75. * the same segment. They should be included in some include file
  76. *
  77. *************************************************************************/
  // Called first by MVIndexBuild to flush the in-memory sort tree to disk.
  78. PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb);
  // Merges the sorted runs of the phase-1 temp file; defined in this file.
  79. PUBLIC HRESULT FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
  // Called by MVIndexBuild when IDXF_NORMALIZE is set, before merging.
  80. HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
  81. /*************************************************************************
  82. *
  83. * @doc EXTERNAL API INDEX
  84. *
  85. * @func BOOL FAR PASCAL | MVIndexBuild |
  86. * This function will build an index file based on the information
  87. * collected in the Index parameter block.
  88. *
  89. * @parm HFPB | hSysFile |
  90. * If it is non-null, it is the handle of an already opened system file.
  91. * In this case the index is a subfile of the opened system file
  92. * If it is 0, the index file is a regular DOS file
  93. *
  94. * @parm LPIPB | lpipb |
  95. * Pointer to Index Parameter Block. This structure contains all the
  96. * information necessary to build the index file
  97. *
  98. * @parm HFPB | hfpb |
  99. * Index hfpb if pstrFile is NULL
  100. *
  101. * @parm LPSTR | pstrFile |
  102. * Index filename if hfpb is NULL
  103. *
  104. * @rdesc S_OK, or other errors
  105. *
  106. * @xref MVIndexInitiate()
  107. *************************************************************************/
  108. /*
  109. * This operates in three main steps:
  110. *
  111. * 1. Send finish to first phase to dump the buffer. Then merge-sort
  112. * that file into a temporary index. Keep statistics on the information
  113. * written to this temporary index.
  114. *
  115. * 2. Analyze the statistics gathered during the temporary index
  116. * building phase. This analysis results in the choice of
  117. * compression processes that will be used in the next step.
  118. *
  119. * 3. Permanent index building phase. During this phase, the
  120. * temporary index is read, compressed like crazy, and written
  121. * to a permanent index file. Unlike the temporary index, the
  122. * permanent index contains directory nodes as well as leaf
  123. * nodes.
  124. *
  125. *************************************************************************/
  126. PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexBuild (HFPB hSysFile,
  127. _LPIPB lpipb, HFPB hfpb, LPSTR pstrFile)
  128. {
  129. ERRB errb;
  130. PHRESULT phr = &errb;
  131. BYTE bKeyIndex = CKEY_OCC_BASE; // Index into cKey array for compression
  132. HRESULT fRet; // Return value from this function.
  133. DWORD loop;
  134. // Sanity check
  135. if (lpipb == NULL || (NULL == hfpb && NULL == pstrFile))
  136. return E_INVALIDARG;
  137. // Flush the internal sort
  138. // Flushes any records in the tree to disk
  139. fRet = FlushTree(lpipb);
  140. // Free all memory blocks
  141. FreeISI (lpipb);
  142. if (fRet != S_OK)
  143. return(fRet);
  144. // lpipb->lcTopics++; // Adjust to base-1 from base-0
  145. if (lpipb->esi.cesb == 0)
  146. // Nothing to process, there will be no index file
  147. return S_OK;
  148. if (lpipb->idxf & KEEP_TEMP_FILE)
  149. SaveEsiTemp (lpipb, &lpipb->esi);
  150. // If we're doing term-weighting, set up a huge array to contain the
  151. // sigma terms. The size of the array depends on the total # of topics
  152. // We also create an array of LOG values to save calculations later
  153. if (lpipb->idxf & IDXF_NORMALIZE)
  154. {
  155. if ((fRet = AllocSigmaTable (lpipb)) != S_OK)
  156. return(fRet);
  157. }
  158. if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
  159. return SetErrCode (phr, fRet);
  160. if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
  161. FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
  162. // If we are doing term-weighting we have to square root all sigma values
  163. if (lpipb->idxf & IDXF_NORMALIZE)
  164. {
  165. // ISBU_IR_CHANGE not needed here 'cos computing sqrt is necessary in both cases
  166. for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
  167. lpipb->wi.hrgsigma[loop] =
  168. (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
  169. }
  170. // Analyze data to get the best compression scheme
  171. // TopicId
  172. // Note: We can't use fixed field compression for topic, since they
  173. // can be modified by update. A fixed field format may become
  174. // insufficient to store larger values of topic differences
  175. VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
  176. &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  177. // Occurrence Count
  178. VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
  179. &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  180. if (lpipb->occf & OCCF_COUNT)
  181. {
  182. VGetBestScheme(&lpipb->cKey[bKeyIndex],
  183. &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  184. bKeyIndex++;
  185. }
  186. if (lpipb->occf & OCCF_OFFSET)
  187. {
  188. VGetBestScheme(&lpipb->cKey[bKeyIndex],
  189. &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  190. bKeyIndex++;
  191. }
  192. if (lpipb->idxf & KEEP_TEMP_FILE)
  193. UpdateEsiTemp (lpipb);
  194. // Build the permanent index
  195. fRet = BuildBTree(hSysFile, lpipb, lpipb->esi.aszTempName, hfpb, pstrFile);
  196. if (lpipb->idxf & IDXF_NORMALIZE)
  197. {
  198. FreeHandle (lpipb->wi.hSigma);
  199. FreeHandle (lpipb->wi.hLog);
  200. }
  201. return fRet;
  202. }
  203. /*************************************************************************
  204. *
  205. * @doc INDEX
  206. *
  207. * @func HRESULT NEAR PASCAL | FillInputBuffer |
  208. * Fills the buffer by reading from the specified file.
  209. *
  210. * @parm PESB | pEsb |
  211. * Pointer to external sort block to fill
  212. *
  213. * @parm HFPB | hFile |
  214. * Handle to the input file
  215. *
  216. * @rdesc S_OK, or errors if failed
  217. *
  218. *************************************************************************/
  219. HRESULT NEAR PASCAL FillInputBuffer(LPESB pEsb, HFPB hFile)
  220. {
  221. ERRB errb;
  222. DWORD dwBytesRead;
  223. // Read in data
  224. if ((dwBytesRead = FileSeekRead (hFile,
  225. (LPB)pEsb->lrgbMem, pEsb->lfo, pEsb->dwEsbSize, &errb)) == 0)
  226. return errb;
  227. // Update utility variables
  228. pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
  229. pEsb->dwEsbSize = (CB)dwBytesRead;
  230. pEsb->ibBuf = 0;
  231. return S_OK;
  232. }
  233. /*************************************************************************
  234. *
  235. * @doc INDEX
  236. *
  237. * @func HRESULT NEAR PASCAL | ESFlushBuffer |
  238. * Flushes the output buffer to disk and resets it.
  239. *
  240. * @parm LPESI | pEsi |
  241. * Pointer to ESI block
  242. *
  243. * @rdesc S_OK, or errors if failed
  244. *
  245. *************************************************************************/
  246. HRESULT NEAR PASCAL ESFlushBuffer(LPESI pEsi)
  247. {
  248. ERRB errb;
  249. DWORD dwLen;
  250. dwLen = pEsi->ibBuf;
  251. if (dwLen != (DWORD)FileWrite (pEsi->hfpb, pEsi->pOutputBuffer,
  252. dwLen, &errb))
  253. return errb;
  254. pEsi->lfoTempOffset = FoAddDw (pEsi->lfoTempOffset, dwLen);
  255. pEsi->ibBuf = 0;
  256. return S_OK;
  257. }
  258. /*************************************************************************
  259. *
  260. * @doc INDEX
  261. *
  262. * @func HRESULT NEAR PASCAL | ESFillBuffer |
  263. * Updates the input buffer with new data from the input file.
  264. *
  265. * @parm _LPIPB | lpipb |
  266. * Pointer to index parameter block
  267. *
  268. * @parm LPESB | pEsb |
  269. * Pointer to ESB block to be filled
  270. *
  271. * @rdesc S_OK, or other errors
  272. *************************************************************************/
  273. HRESULT NEAR PASCAL ESFillBuffer(_LPIPB lpipb, LPESB pEsb)
  274. {
  275. DWORD dwBytesRead;
  276. DWORD dwExtra = pEsb->dwEsbSize - pEsb->ibBuf;
  277. ERRB errb;
  278. // Read either the entire buffer size or whatever is left
  279. dwBytesRead = DwSubFo (pEsb->lfoMax, pEsb->lfo);
  280. if (dwBytesRead > pEsb->dwEsbSize - dwExtra)
  281. dwBytesRead = pEsb->dwEsbSize - dwExtra;
  282. // Save unproccessed information to beginning of buffer
  283. if (dwExtra)
  284. MEMMOVE ((LPB)pEsb->lrgbMem, pEsb->lrgbMem + pEsb->ibBuf, dwExtra);
  285. // Read in the new data
  286. if ((dwBytesRead = FileSeekRead (lpipb->isi.hfpb, (LPB)(pEsb->lrgbMem +
  287. dwExtra), pEsb->lfo, dwBytesRead, &errb)) == 0 &&
  288. errb != S_OK)
  289. return(errb);
  290. pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
  291. pEsb->ibBuf = 0;
  292. pEsb->dwEsbSize = dwBytesRead + dwExtra;
  293. return(S_OK);
  294. }
  295. /*************************************************************************
  296. *
  297. * @doc INTERNAL INDEXING
  298. *
  299. * @func HRESULT FAR PASCAL | MergeSortTree File |
  300. * Sorts the file generated from the tree output into one
  301. * list of sorted elements.
  302. *
  303. * @parm _LPIPB | lpipb |
  304. * Pointer to index parameter block
  305. *
  306. *************************************************************************/
  307. PUBLIC HRESULT PASCAL FAR MergeSortTreeFile (_LPIPB lpipb, LPMERGEPARAMS lpmp)
  308. {
  309. // Local replacement variables
  310. LPESI pEsi; // Pointer to external sort info
  311. LPISI pIsi; // Pointer to internal sort info
  312. HFPB hInputFile; // Handle to input file
  313. ERRB errb;
  314. PHRESULT phr = &errb;
  315. DWORD cesb; // Input buffer count
  316. LPESB FAR* lrgPriorityQueue; // Pointer to Priority Queue
  317. WORD uiQueueSize = 0; // Count of entries in Queue
  318. DWORD dwBufferSize;
  319. // Working variables
  320. HRESULT fRet;
  321. LPESB pEsb; // Temp pointer to linked list
  322. // Sanity check
  323. if (lpipb == NULL)
  324. return E_INVALIDARG;
  325. // Variables initialization
  326. pEsi = &lpipb->esi; // Pointer to external sort info
  327. pIsi = &lpipb->isi; // Pointer to internal sort info
  328. cesb = pEsi->cesb; // Input buffer count
  329. // Open input file
  330. if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName,
  331. REGULAR_FILE, READ, phr)) == NULL)
  332. return *phr;
  333. hInputFile = pIsi->hfpb;
  334. // Allocate & fill input buffers
  335. for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
  336. {
  337. DWORD cbRead;
  338. dwBufferSize = (lpipb->dwMemAllowed * 6) / (8 * pEsi->cesb);
  339. // Alocate buffer space
  340. if ((pEsb->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  341. dwBufferSize)) == NULL)
  342. {
  343. fRet = E_OUTOFMEMORY;
  344. exit1:
  345. FreeEsi (lpipb);
  346. FileClose(hInputFile);
  347. pIsi->hfpb = NULL;
  348. return fRet;
  349. }
  350. pEsb->lrgbMem = (LRGB)_GLOBALLOCK (pEsb->hMem);
  351. if ((cbRead = DwSubFo(pEsb->lfoMax, pEsb->lfo)) > dwBufferSize)
  352. cbRead = dwBufferSize;
  353. // Fill buffer from disk
  354. if (FileSeekRead (hInputFile, pEsb->lrgbMem, pEsb->lfo,
  355. cbRead, phr) != (LONG)cbRead)
  356. {
  357. fRet = *phr;
  358. _GLOBALUNLOCK(pEsb->hMem);
  359. _GLOBALFREE(pEsb->hMem);
  360. pEsb->hMem = NULL;
  361. goto exit1;
  362. }
  363. pEsb->dwEsbSize = cbRead;
  364. pEsb->ibBuf = 0;
  365. pEsb->lfo = FoAddDw (pEsb->lfo, cbRead);
  366. }
  367. // Allocate a priority queue array. The size of the array
  368. // is the number of external sort info blocks plus 1, since
  369. // location 0 is not used.
  370. if ((pEsi->hPriorityQueue = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  371. (DWORD)(pEsi->cesb + 1) * sizeof (LPB))) == NULL)
  372. {
  373. fRet = E_OUTOFMEMORY;
  374. goto exit1;
  375. }
  376. pEsi->lrgPriorityQueue =
  377. (LPESB FAR *)_GLOBALLOCK (pEsi->hPriorityQueue);
  378. lrgPriorityQueue = pEsi->lrgPriorityQueue;
  379. // Attach input buffers to Priority Queue
  380. // Remebering to start at offset 1 NOT 0 (PQ's have a null 0 element)
  381. for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
  382. {
  383. lrgPriorityQueue[++uiQueueSize] = pEsb;
  384. PQueueUp (lpipb, lrgPriorityQueue, uiQueueSize);
  385. }
  386. pEsi->uiQueueSize = uiQueueSize;
  387. // Clear largest Record Size field
  388. // lpipb->dwMaxRecordSize = 0;
  389. fRet = ProcessFiles(lpipb, lpmp);
  390. _GLOBALUNLOCK (pEsi->hPriorityQueue);
  391. _GLOBALFREE (pEsi->hPriorityQueue);
  392. pEsi->hPriorityQueue = NULL;
  393. goto exit1;
  394. }
  395. /*************************************************************************
  396. *
  397. * @doc INDEX
  398. *
  399. * @func HRESULT NEAR PASCAL | ESMemory2Disk |
  400. * Copies temp record to output buffer.
  401. *
  402. * @parm _LPIPB | lpipb |
  403. * Pointer to index parameter block
  404. *
  405. * @parm PMERGEHEADER | pHeader |
  406. * Pointer to header to flush
  407. *
  408. * @parm int | flag |
  409. * - if FLUSH_NEW_RECORD, the flush is due to new record, we flush
  410. * everything, else we may do a partial flush only
  411. * - if FLUSH_EXCEPT_LAST, we don't flush the last topic
  412. *
  413. * @rdesc S_OK, or other errors
  414. *************************************************************************/
  415. PRIVATE HRESULT NEAR PASCAL ESMemory2Disk
  416. (_LPIPB lpipb, PMERGEHEADER pHeader, int flag)
  417. {
  418. // Local replacement variables
  419. LPESI pEsi = &lpipb->esi;
  420. LPB pMax = pEsi->pOutputBuffer + ESOUTPUT_BUFFER - 2 * sizeof(DWORD);
  421. DWORD dwOccCount;
  422. LPB pOutputBuffer = pEsi->pOutputBuffer;
  423. ERRB errb;
  424. PHRESULT phr = &errb;
  425. HRESULT fRet;
  426. BYTE cNumOcc;
  427. OCCF occf;
  428. // Working variables
  429. PTOPICDATA pTopic; // Temp var to traverse the topic linked list
  430. DWORD loop, sub; // Various loop counters
  431. DWORD dwTopicIdDelta;
  432. DWORD OccDelta[5]; // Delta base for all occurrence data
  433. DWORD LastOcc[5];
  434. FLOAT rLog; // (1/n) - IDXF_NORMALIZE is set
  435. FLOAT rLogSquared; // (1/n)^2 - IDXF_NORMALIZE is set
  436. LPB pStart;
  437. LPB pCurPtr;
  438. // Set up pointers
  439. pStart = pCurPtr = pOutputBuffer + pEsi->ibBuf;
  440. // Variable replacement
  441. occf = lpipb->occf;
  442. // Size of string
  443. loop = pHeader->dwStrLen;
  444. // Make sure the string, FileId, Topic Count and Record Size fit
  445. // We add in and extra DWORD for 5 byte compresssion problems and
  446. // to cover the Word Length if there is one.
  447. if ((pStart + loop + sizeof (DWORD) * 5) >= pMax)
  448. {
  449. if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
  450. return(fRet);
  451. pStart = pCurPtr = pOutputBuffer;
  452. }
  453. if (pHeader->fEmitRecord == FALSE)
  454. {
  455. // If we never emitted the record header then we emitted now
  456. // Reset the flag
  457. pHeader->fEmitRecord = TRUE;
  458. // Skip record size field
  459. pCurPtr += sizeof (DWORD);
  460. // Pascal string
  461. MEMCPY (pCurPtr, pHeader->lpbWord, loop);
  462. pCurPtr += loop;
  463. // Word Length
  464. if (occf & OCCF_LENGTH)
  465. pCurPtr += CbBytePack (pCurPtr, pHeader->dwWordLength);
  466. // FieldId
  467. if (occf & OCCF_FIELDID)
  468. pCurPtr += CbBytePack (pCurPtr, pHeader->dwFieldId);
  469. // Topic Count
  470. if (flag & FLUSH_NEW_RECORD)
  471. {
  472. // This is the whole record. dwTopicCount value is correct
  473. SETLONG((LPUL)pCurPtr, pHeader->dwTopicCount);
  474. }
  475. else
  476. {
  477. // Save the offset for backpatching
  478. pHeader->foTopicCount = FoAddDw (pEsi->lfoTempOffset,
  479. (DWORD)(pCurPtr - pOutputBuffer));
  480. pHeader->pTopicCount = pCurPtr;
  481. }
  482. pCurPtr += sizeof(DWORD);
  483. // Write Record Length
  484. *(LPUL)pStart = (DWORD)(pCurPtr - pStart - sizeof (DWORD));
  485. }
  486. else if (flag & FLUSH_NEW_RECORD)
  487. {
  488. // We emit the record before, since pheader->fEmitRecord == TRUE
  489. // We need to backpatch the topic count
  490. if (FoCompare(pHeader->foTopicCount, pEsi->lfoTempOffset) >= 0)
  491. {
  492. // Everything is still in memory, just do local backpatch
  493. SETLONG((LPUL)(pHeader->pTopicCount), pHeader->dwTopicCount);
  494. }
  495. else
  496. {
  497. // Do backpatch in the file by seeking back to the right
  498. // place
  499. if (FileSeekWrite(pEsi->hfpb, &pHeader->dwTopicCount,
  500. pHeader->foTopicCount, sizeof(DWORD), phr) != sizeof(DWORD))
  501. return(*phr);
  502. // Restore the current file offset
  503. FileSeek(pEsi->hfpb, pEsi->lfoTempOffset, 0, phr);
  504. }
  505. }
  506. // Convert all occ data to delta values & compress them
  507. pTopic = pHeader->pTopic;
  508. cNumOcc = lpipb->ucNumOccDataFields;
  509. for (; pTopic;)
  510. {
  511. POCCDATA pOccData;
  512. PTOPICDATA pReleased;
  513. if ((flag & FLUSH_EXCEPT_LAST) && pTopic->pNext == NULL)
  514. break;
  515. // Set TopicId delta
  516. dwTopicIdDelta = pTopic->dwTopicId - pHeader->dwLastTopicId;
  517. pHeader->dwLastTopicId = pTopic->dwTopicId;
  518. // Save bit size to the statistics array
  519. lpipb->BitCount[CKEY_TOPIC_ID][CbitBitsDw (dwTopicIdDelta)] += 1;
  520. // Write TopicID Delta
  521. if (pCurPtr > pMax)
  522. {
  523. pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
  524. if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
  525. return(fRet);
  526. pCurPtr = pOutputBuffer;
  527. }
  528. pCurPtr += CbBytePack (pCurPtr, dwTopicIdDelta);
  529. if (cNumOcc == 0)
  530. {
  531. pReleased = pTopic;
  532. pTopic = pTopic->pNext;
  533. // Add the released to the freed linked list
  534. pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
  535. lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
  536. lpipb->TopicBlock.dwCount--;
  537. continue;
  538. }
  539. if (dwOccCount = pTopic->dwOccCount)
  540. {
  541. // Reset count occdata delta for every new topic
  542. MEMSET (OccDelta, 0, 5 * sizeof (DWORD));
  543. MEMSET (LastOcc, 0, 5 * sizeof (DWORD));
  544. // Copy Occurrence Count
  545. if (pCurPtr > pMax)
  546. {
  547. pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
  548. if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
  549. return(fRet);
  550. pCurPtr = pOutputBuffer;
  551. }
  552. pCurPtr += CbBytePack (pCurPtr, dwOccCount);
  553. // Save bit size to the statistics array
  554. lpipb->BitCount[1][CbitBitsDw (dwOccCount)] += 1;
  555. // Repeat for each occurrence block
  556. for (pOccData = pTopic->pOccData,
  557. sub = dwOccCount; sub > 0 && pOccData; --sub)
  558. {
  559. LPDW lpDw;
  560. int iIndex;
  561. POCCDATA pReleased;
  562. if (pCurPtr + 5 * sizeof(DWORD) > pMax)
  563. {
  564. pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
  565. if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
  566. return(fRet);
  567. pStart = pCurPtr = pOutputBuffer;
  568. }
  569. lpDw = &pOccData->OccData[0];
  570. iIndex = CKEY_OCC_BASE;
  571. if (occf & OCCF_COUNT)
  572. {
  573. // Convert each value to a delta value
  574. OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
  575. LastOcc[iIndex] = *lpDw;
  576. lpDw++;
  577. // Save to bit size to the statistics array
  578. lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;
  579. // Compress occurrence field to buffer
  580. pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
  581. iIndex++;
  582. }
  583. if (occf & OCCF_OFFSET)
  584. {
  585. // Convert each value to a delta value
  586. OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
  587. LastOcc[iIndex] = *lpDw;
  588. lpDw++;
  589. // Save to bit size to the statistics array
  590. lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;
  591. // Compress occurrence field to buffer
  592. pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
  593. iIndex++;
  594. }
  595. pReleased = pOccData;
  596. pOccData = pOccData->pNext;
  597. pReleased->pNext = (POCCDATA)lpipb->OccBlock.pFreeList;
  598. lpipb->OccBlock.pFreeList = (PLIST)pReleased;
  599. lpipb->OccBlock.dwCount--;
  600. }
  601. // Check for mismatch between count and links
  602. #ifdef _DEBUG
  603. if (sub)
  604. SetErrCode (phr, E_ASSERT);
  605. if (pOccData)
  606. SetErrCode (phr, E_ASSERT);
  607. #endif
  608. }
  609. // Update the sigma values if we are doing term weighing
  610. // erinfox: remove test against flag. Sometimes sigma never
  611. // got calculated for a topic and that caused a divide by zero
  612. // later on.
  613. if ((lpipb->idxf & IDXF_NORMALIZE) /* && (flag & FLUSH_NEW_RECORD)*/)
  614. {
  615. if (pTopic->dwTopicId > lpipb->dwMaxTopicId)
  616. {
  617. // Incease the size of the sigma table. This can happen when
  618. // updating with new topics
  619. _GLOBALUNLOCK (lpipb->wi.hSigma);
  620. if ((lpipb->wi.hSigma = _GLOBALREALLOC (lpipb->wi.hSigma,
  621. (pTopic->dwTopicId + 1) * sizeof(float),
  622. DLLGMEM_ZEROINIT)) == NULL)
  623. {
  624. return (SetErrCode(phr, E_OUTOFMEMORY));
  625. }
  626. lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
  627. lpipb->dwMaxTopicId = pTopic->dwTopicId ;
  628. }
  629. if (lpipb->bState == INDEXING_STATE)
  630. {
  631. #ifndef ISBU_IR_CHANGE
  632. FLOAT fOcc;
  633. if (pHeader->dwTopicCount >= cLOG_MAX)
  634. {
  635. // we have to guard against the possibility of the log resulting in
  636. // a value <= 0.0. Very rare, but possible in the future. This happens
  637. // if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
  638. if (pHeader->dwTopicCount >= cNintyFiveMillion)
  639. rLog = cVerySmallWt; // log10(100 mil/ 95 mil) == 0.02
  640. else
  641. //rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
  642. rLog = (float) (8.0 - log10((double)pHeader->dwTopicCount));
  643. rLogSquared = rLog*rLog;
  644. }
  645. else
  646. rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];
  647. // Update sigma value
  648. // NOTE : We are bounding dwOccCount by a value of eTFThreshold
  649. // The RHS of the equation below has an upperbound of 2 power 30.
  650. fOcc = (float) min(cTFThreshold, dwOccCount);
  651. lpipb->wi.hrgsigma[pTopic->dwTopicId] += (SIGMA) fOcc*fOcc*rLogSquared;
  652. //(SIGMA) (fOcc * fOcc * rLogSquared/(float)0xFFFF);
  653. #else
  654. // Failed for update : UNDONE
  655. if (pHeader->dwTopicCount >= cLOG_MAX)
  656. {
  657. rLog = (float)1.0 / (float)pHeader->dwTopicCount;
  658. rLogSquared = rLog * rLog;
  659. }
  660. else
  661. rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];
  662. // Update sigma value
  663. lpipb->wi.hrgsigma[pTopic->dwTopicId] +=
  664. (SIGMA)(dwOccCount * dwOccCount) * rLogSquared;
  665. #endif // ISBU_IR_CHANGE
  666. }
  667. }
  668. pReleased = pTopic;
  669. pTopic = pTopic->pNext;
  670. // Add the released to the freed linked list
  671. pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
  672. lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
  673. lpipb->TopicBlock.dwCount--;
  674. }
  675. pHeader->pTopic = pHeader->pLastTopic = pTopic;
  676. // Update output offset
  677. pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
  678. return(S_OK);
  679. }
  680. /*************************************************************************
  681. *
  682. * @doc INDEX
  683. *
  684. * @func HRESULT NEAR PASCAL | ProcessFiles |
  685. * Sorts the file generated from the tree output into one
  686. * list of sorted elements.
  687. *
  688. * @parm _LPIPB | lpipb |
  689. * Pointer to index parameter block
  690. *
  691. * @rdesc S_OK, or errors if failed
  692. *
  693. * @notes
  694. * This function processes the input buffers and uses dynamic
  695. * memory allocation to sort each word as it comes in. Once a
  696. * word stops repeating, it is flushed to disk and the memory is
  697. * reset for the next word.
  698. *************************************************************************/
  699. HRESULT NEAR PASCAL ProcessFiles(_LPIPB lpipb, LPMERGEPARAMS lpmp)
  700. {
  701. // Local replacement variables
  702. LPISI pIsi = &lpipb->isi;
  703. LPESI pEsi = &lpipb->esi;
  704. LPESB FAR * lrgPriorityQueue = pEsi->lrgPriorityQueue;
  705. LONG uiQueueSize = pEsi->uiQueueSize;
  706. LPB pQueuePtr;
  707. WORD cNumOcc = lpipb->ucNumOccDataFields;
  708. WORD OccSize = sizeof(OCCDATA) - sizeof(DWORD) + cNumOcc *
  709. sizeof(DWORD);
  710. int occf = lpipb->occf;
  711. LPB pBufMax;
  712. HANDLE hWord;
  713. LPB lpbWord;
  714. DWORD dwUniqueTerm = 0; // Used for calback function
  715. #ifdef _DEBUG
  716. BYTE astWord[300];
  717. BYTE astLastWord[300];
  718. #endif
  719. // Working variables
  720. PMERGEHEADER pHeader; // Pointer to merge header
  721. LPESB pEsb; // Temp ESB pointer
  722. PTOPICDATA pNewTopic; // Used to create new topic
  723. DWORD loop; // Temp loop counter
  724. HANDLE hHeader;
  725. HFPB hOutputFile; // Handle to output file
  726. int fRet; // Return value
  727. USHORT uStringSize; // Size of Psacal String
  728. ERRB errb;
  729. PHRESULT phr = &errb;
  730. static long Count = 0;
  731. // Setup Block Manager
  732. if ((fRet = ESBBlockAllocate (lpipb, lpipb->dwMemAllowed / 4)) != S_OK)
  733. return(fRet);
  734. // Allocate output buffer
  735. if ((pEsi->hBuf = _GLOBALALLOC
  736. (DLLGMEM_ZEROINIT, ESOUTPUT_BUFFER)) == NULL)
  737. {
  738. fRet = E_OUTOFMEMORY;
  739. exit1:
  740. return fRet;
  741. }
  742. pEsi->pOutputBuffer = (LPB)_GLOBALLOCK (pEsi->hBuf);
  743. pEsi->ibBuf = 0;
  744. // Create output file
  745. GETTEMPFILENAME ((char)0, "eso", 0, pEsi->aszTempName);
  746. if ((pEsi->hfpb = FileOpen(NULL, pEsi->aszTempName,
  747. REGULAR_FILE, WRITE, &errb)) == NULL)
  748. {
  749. fRet = E_FILECREATE;
  750. exit2:
  751. FreeHandle (pEsi->hBuf);
  752. pEsi->hBuf = NULL;
  753. goto exit1;
  754. }
  755. hOutputFile = pEsi->hfpb;
  756. // Setup new record in memory
  757. if ((hHeader = _GLOBALALLOC
  758. (DLLGMEM_ZEROINIT, sizeof (MERGEHEADER))) == NULL)
  759. {
  760. fRet = E_OUTOFMEMORY;
  761. exit3:
  762. FileClose (hOutputFile);
  763. goto exit2;
  764. }
  765. pHeader = (PMERGEHEADER)_GLOBALLOCK (hHeader);
  766. // Allocate buffer for a word, which include 64K + sizeof(WORD) + slack
  767. if ((hWord = _GLOBALALLOC(DLLGMEM_ZEROINIT, 0x10004)) == NULL)
  768. {
  769. exit4:
  770. _GLOBALUNLOCK(hHeader);
  771. _GLOBALFREE (hHeader);
  772. goto exit3;
  773. }
  774. pHeader->lpbWord = lpbWord = (LPB)_GLOBALLOCK(hWord);
  775. #ifdef _DEBUG
  776. astWord[0] = 0;
  777. #endif
  778. // Process all input buffers
  779. do
  780. {
  781. DWORD dwWordLength;
  782. DWORD dwFieldId;
  783. LPB lpStart;
  784. DWORD dwTopicCount;
  785. #ifdef _DEBUG
  786. Count++;
  787. #endif
  788. // Grab smallest record and send to buffer
  789. pEsb = lrgPriorityQueue[1];
  790. // Set the fill limit
  791. pBufMax = pEsb->lrgbMem + pEsb->dwEsbSize - 256;
  792. if ((pQueuePtr = pEsb->lrgbMem + pEsb->ibBuf) >= pBufMax)
  793. {
  794. if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
  795. goto exit4;
  796. pQueuePtr = pEsb->lrgbMem;
  797. }
  798. // Save the record beginning
  799. pQueuePtr += sizeof(DWORD);
  800. lpStart = pQueuePtr;
  801. // Get string
  802. uStringSize = GETWORD ((LPUW)pQueuePtr) + sizeof (SHORT);
  803. pQueuePtr += uStringSize;
  804. #ifdef _DEBUG
  805. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  806. SetErrCode (phr, E_ASSERT);
  807. #endif
  808. if (occf & OCCF_LENGTH)
  809. pQueuePtr += CbByteUnpack (&dwWordLength, pQueuePtr);
  810. else
  811. dwWordLength = 0;
  812. #ifdef _DEBUG
  813. if (pQueuePtr >= pEsb->lrgbMem + pEsb->dwEsbSize)
  814. SetErrCode (phr, E_ASSERT);
  815. #endif
  816. if (occf & OCCF_FIELDID)
  817. pQueuePtr += CbByteUnpack (&dwFieldId, pQueuePtr);
  818. else
  819. dwFieldId = 0;
  820. #ifdef _DEBUG
  821. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  822. SetErrCode (phr, E_ASSERT);
  823. #endif
  824. // Is the word in the buffer equal to the new word?
  825. // If it is not then flush the old word
  826. if (*(LPUW)pHeader->lpbWord)
  827. {
  828. fRet = (StrCmp2BytePascal (pHeader->lpbWord, lpStart)
  829. || dwWordLength > pHeader->dwWordLength);
  830. if (fRet == 0) // Same word, reduce the unique words count
  831. lpipb->dwUniqueWord--;
  832. if (fRet || dwFieldId > pHeader->dwFieldId)
  833. {
  834. #if defined(_DEBUG) && !defined(_MAC)
  835. // Word out of order
  836. if (StrCmp2BytePascal (pHeader->lpbWord, lpStart) > 0)
  837. assert(FALSE);
  838. #endif
  839. if ((fRet = ESMemory2Disk (lpipb, pHeader, TRUE)) != S_OK)
  840. return(fRet);
  841. // Reset pHeader
  842. MEMSET (pHeader, 0, sizeof (MERGEHEADER));
  843. // Set the word buffer
  844. pHeader->lpbWord = lpbWord;
  845. #ifdef _DEBUG
  846. STRCPY(astLastWord, astWord);
  847. #endif
  848. // Call the user callback every once in a while
  849. if (!(++dwUniqueTerm % 8192L)
  850. && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
  851. {
  852. PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
  853. CALLBACKINFO Info;
  854. Info.dwPhase = 2;
  855. Info.dwIndex = (DWORD)((float)dwUniqueTerm / lpipb->dwUniqueWord * 100);
  856. fRet = (*pCallbackInfo->MessageFunc)
  857. (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
  858. if (S_OK != fRet)
  859. goto exit5;
  860. }
  861. }
  862. }
  863. // Update the data
  864. pHeader->dwFieldId = dwFieldId;
  865. pHeader->dwWordLength = dwWordLength;
  866. pHeader->dwStrLen = uStringSize;
  867. // Copy word and header info
  868. MEMCPY (pHeader->lpbWord, (LPB)lpStart, uStringSize);
  869. #ifdef _DEBUG
  870. if (uStringSize >= 300)
  871. uStringSize = 300;
  872. MEMCPY (astWord, lpStart + 2, uStringSize - 2);
  873. astWord[uStringSize - 2] = 0;
  874. //if (STRCMP(astWord, "87db") == 0)
  875. // _asm int 3;
  876. #endif
  877. pQueuePtr += CbByteUnpack (&dwTopicCount, pQueuePtr);
  878. pHeader->dwTopicCount += dwTopicCount;
  879. #ifdef _DEBUG
  880. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  881. SetErrCode (phr, E_ASSERT);
  882. #endif
  883. pNewTopic = NULL;
  884. // Copy topic(s) to memory
  885. for (loop = dwTopicCount; loop > 0; loop--)
  886. {
  887. DWORD dwTopicId;
  888. // Get the topic id
  889. pQueuePtr += CbByteUnpack (&dwTopicId, pQueuePtr);
  890. // kevynct: if there is a to-delete list, and this topic is on it, skip it
  891. if (lpmp && FindTopic(lpmp, dwTopicId))
  892. {
  893. // Get the occ count
  894. if (cNumOcc)
  895. {
  896. DWORD dwOccCount;
  897. DWORD dwT;
  898. pQueuePtr += CbByteUnpack (&dwOccCount, pQueuePtr);
  899. #ifdef _DEBUG
  900. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  901. SetErrCode (phr, E_ASSERT);
  902. #endif
  903. for (; dwOccCount > 0; dwOccCount--)
  904. {
  905. // Fill up the buffer if run out of data
  906. if (pQueuePtr >= pBufMax)
  907. {
  908. pEsb->ibBuf = (DWORD)(pQueuePtr - pEsb->lrgbMem);
  909. if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
  910. goto exit5;
  911. pQueuePtr = pEsb->lrgbMem;
  912. }
  913. switch (cNumOcc)
  914. {
  915. case 5:
  916. pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
  917. case 4:
  918. pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
  919. case 3:
  920. pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
  921. case 2:
  922. pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
  923. case 1:
  924. pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
  925. }
  926. #ifdef _DEBUG
  927. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  928. SetErrCode (phr, E_ASSERT);
  929. #endif
  930. } // end occ loop
  931. } // end if occ non-zero
  932. pHeader->dwTopicCount--;
  933. continue;
  934. } // end of to-delete condition
  935. // Allocate a topicdata node
  936. if ((pNewTopic == NULL) &&
  937. (pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
  938. {
  939. if ((fRet = ESMemory2Disk(lpipb, pHeader, FLUSH_EXCEPT_LAST)) != S_OK)
  940. {
  941. exit5:
  942. _GLOBALUNLOCK(hWord);
  943. _GLOBALFREE(hWord);
  944. goto exit4;
  945. }
  946. if ((pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
  947. {
  948. // Extremely weird, since we just release a bunch of
  949. // memory
  950. fRet = E_ASSERT;
  951. goto exit5;
  952. }
  953. }
  954. pNewTopic->dwTopicId = dwTopicId;
  955. #ifdef _DEBUG
  956. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  957. SetErrCode (phr, E_ASSERT);
  958. #endif
  959. // Set the other fields
  960. pNewTopic->pOccData = pNewTopic->pLastOccData = NULL;
  961. // Get the occ count
  962. if (cNumOcc)
  963. {
  964. DWORD dwOccCount;
  965. POCCDATA pOccData;
  966. LPDW lpDw;
  967. pQueuePtr += CbByteUnpack (&pNewTopic->dwOccCount,
  968. pQueuePtr);
  969. #ifdef _DEBUG
  970. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  971. SetErrCode (phr, E_ASSERT);
  972. #endif
  973. for (dwOccCount = pNewTopic->dwOccCount; dwOccCount > 0;
  974. dwOccCount--)
  975. {
  976. // Get all occ fields
  977. if ((pOccData = (POCCDATA)GetBlockNode
  978. (&lpipb->OccBlock)) == NULL )
  979. {
  980. if ((fRet = ESMemory2Disk(lpipb, pHeader,
  981. FLUSH_EXCEPT_LAST)) != S_OK)
  982. goto exit5;
  983. if ((pOccData =
  984. (POCCDATA)GetBlockNode(&lpipb->OccBlock)) == NULL)
  985. {
  986. // Extremely weird, since we just release a bunch of
  987. // memory, unless there are so many duplicates of the same word
  988. // in the topic
  989. fRet = E_TOOMANYDUPS;
  990. goto exit5;
  991. }
  992. }
  993. // Fill up the buffer if run out of data
  994. if (pQueuePtr >= pBufMax)
  995. {
  996. pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
  997. if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
  998. goto exit5;
  999. pQueuePtr = pEsb->lrgbMem;
  1000. }
  1001. lpDw = (LPDW)&pOccData->OccData;
  1002. switch (cNumOcc)
  1003. {
  1004. case 5:
  1005. pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
  1006. case 4:
  1007. pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
  1008. case 3:
  1009. pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
  1010. case 2:
  1011. pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
  1012. case 1:
  1013. pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
  1014. }
  1015. #ifdef _DEBUG
  1016. if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
  1017. SetErrCode (phr, E_ASSERT);
  1018. #endif
  1019. // Attach to the linked list
  1020. // Note that we are assumimg that the occurrences are
  1021. // already sorted, so no checking is done here
  1022. if (pNewTopic->pOccData == NULL)
  1023. {
  1024. pNewTopic->pLastOccData = pNewTopic->pOccData
  1025. = pOccData;
  1026. }
  1027. else
  1028. {
  1029. // Add to the end of the linked list
  1030. pNewTopic->pLastOccData->pNext = pOccData;
  1031. pNewTopic->pLastOccData = pOccData;
  1032. }
  1033. pOccData->pNext = NULL;
  1034. }
  1035. }
  1036. if (pNewTopic = MergeTopicNode (pHeader, pNewTopic, cNumOcc))
  1037. pHeader->dwTopicCount --;
  1038. }
  1039. // Update the offset
  1040. pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
  1041. // If next record doesn't fit in buffer
  1042. // Then reset to beginning and load data
  1043. if (pEsb->dwEsbSize - pEsb->ibBuf <= sizeof(DWORD) ||
  1044. pEsb->dwEsbSize - pEsb->ibBuf <= GETLONG((LPUL)pQueuePtr) +
  1045. 2 * sizeof(DWORD))
  1046. {
  1047. if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
  1048. goto exit4;
  1049. }
  1050. // Adjust priority queue
  1051. if (uiQueueSize > 1)
  1052. {
  1053. if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
  1054. pEsb->ibBuf >= pEsb->dwEsbSize)
  1055. {
  1056. // Replace first record with last
  1057. lrgPriorityQueue[1] = lrgPriorityQueue[uiQueueSize];
  1058. lrgPriorityQueue[uiQueueSize] = NULL;
  1059. uiQueueSize--;
  1060. pEsi->uiQueueSize = uiQueueSize;
  1061. }
  1062. #if 0
  1063. else
  1064. { // If the stream still has input add it back into the Queue
  1065. lrgPriorityQueue[uiQueueSize] = pEsb;
  1066. PQueueUp(lpipb, lrgPriorityQueue, uiQueueSize);
  1067. }
  1068. #endif
  1069. PQueueDown(lpipb); // Maintain sort order
  1070. }
  1071. else if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
  1072. pEsb->ibBuf >= pEsb->dwEsbSize)
  1073. {
  1074. uiQueueSize--;
  1075. pEsi->uiQueueSize = uiQueueSize;
  1076. if ((fRet = ESMemory2Disk (lpipb, pHeader, FLUSH_NEW_RECORD)) != S_OK)
  1077. return(fRet);
  1078. }
  1079. } while (uiQueueSize);
  1080. fRet = ESFlushBuffer(pEsi);
  1081. goto exit5;
  1082. }
  1083. BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId)
  1084. {
  1085. register LPDW lpdw;
  1086. LPDW lpdwMac;
  1087. Assert(lpmp->dwCount > 0);
  1088. Assert(lpmp->lpTopicIdLast >= lpmp->rgTopicId);
  1089. Assert(lpmp->lpTopicIdLast < lpmp->rgTopicId + lpmp->dwCount);
  1090. if (lpmp->rgTopicId[0] > dwTopicId
  1091. ||
  1092. *(lpdwMac = lpmp->rgTopicId + lpmp->dwCount - 1) < dwTopicId)
  1093. return FALSE;
  1094. if (*lpmp->lpTopicIdLast == dwTopicId)
  1095. return TRUE;
  1096. if (*lpmp->lpTopicIdLast > dwTopicId)
  1097. {
  1098. // re-start at the beginning
  1099. lpmp->lpTopicIdLast = lpmp->rgTopicId;
  1100. }
  1101. for (lpdw = lpmp->lpTopicIdLast; lpdw < lpdwMac + 1; lpdw++)
  1102. if (*lpdw == dwTopicId)
  1103. {
  1104. lpmp->lpTopicIdLast = lpdw;
  1105. return TRUE;
  1106. }
  1107. return FALSE;
  1108. }
  1109. /*************************************************************************
  1110. *
  1111. * @doc INTERNAL INDEXING
  1112. *
  1113. * @func int | CompareRecordBuffers |
  1114. * Called from PQueueUp/Down to sort the input buffers based first
  1115. * upon the string's, then TopicID's, then word length's, etc.
  1116. *
  1117. * @parm _LPIPB | lpipb |
  1118. * Pointer to the index parameter block
  1119. *
  1120. * @parm LPB | pBuffer A |
  1121. * Pointer to the first input buffer
  1122. *
  1123. * @parm LPB | pBuffer B |
  1124. * Pointer to the second input buffer
  1125. *
  1126. * @rdesc
  1127. * If pBufferA < pBufferB return < 0
  1128. * If pBufferA == pBufferB return = 0
  1129. * If pBufferA > pBufferB return > 0
  1130. *************************************************************************/
  1131. int PASCAL NEAR CompareRecordBuffers (_LPIPB lpipb, LPB pBufferA, LPB pBufferB)
  1132. {
  1133. // Local Replacement Variables
  1134. int occf = lpipb->occf;
  1135. int cNumOcc = lpipb->ucNumOccDataFields;
  1136. DWORD dwOccMin;
  1137. // Working Variables
  1138. int fRet;
  1139. int Len;
  1140. DWORD dwDataA;
  1141. DWORD dwDataB;
  1142. pBufferA += sizeof (DWORD); // Skip record length
  1143. pBufferB += sizeof (DWORD); // Skip record length
  1144. // Compare Pascal strings
  1145. if ((fRet = StrCmp2BytePascal(pBufferA, pBufferB)) != 0)
  1146. return fRet;
  1147. pBufferA += (Len = GETWORD ((LPUW)pBufferA) + sizeof (SHORT));
  1148. pBufferB += Len;
  1149. // Strings equal - compare FieldIds
  1150. // Compare Word Lengths
  1151. if (occf & OCCF_LENGTH)
  1152. {
  1153. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1154. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1155. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1156. return fRet;
  1157. }
  1158. if (occf & OCCF_FIELDID)
  1159. {
  1160. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1161. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1162. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1163. return fRet;
  1164. }
  1165. // Skip topic count
  1166. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1167. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1168. // Compare 1st topic Id
  1169. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1170. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1171. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1172. return fRet;
  1173. // Get the occurrence count
  1174. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1175. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1176. if ((fRet = (int)(dwDataA - dwDataB)) < 0)
  1177. dwOccMin = dwDataA;
  1178. else
  1179. dwOccMin = dwDataB;
  1180. for (; dwOccMin; dwOccMin--)
  1181. {
  1182. switch (cNumOcc)
  1183. {
  1184. case 5:
  1185. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1186. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1187. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1188. return fRet;
  1189. break;
  1190. case 4:
  1191. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1192. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1193. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1194. return fRet;
  1195. break;
  1196. case 3:
  1197. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1198. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1199. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1200. return fRet;
  1201. break;
  1202. case 2:
  1203. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1204. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1205. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1206. return fRet;
  1207. break;
  1208. case 1:
  1209. pBufferA += CbByteUnpack (&dwDataA, pBufferA);
  1210. pBufferB += CbByteUnpack (&dwDataB, pBufferB);
  1211. if ((fRet = (int)(dwDataA - dwDataB)) != 0)
  1212. return fRet;
  1213. break;
  1214. }
  1215. }
  1216. return fRet;
  1217. }
  1218. /*************************************************************************
  1219. *
  1220. * @doc INTERNAL INDEXING
  1221. *
  1222. * @func VOID | PQueueUp |
  1223. * The function restores the heap condition of a PQ, ie. the parent
  1224. * node must be less than the children. When the top node is inserted
  1225. * the heap condition may be violated if the resulting node
  1226. * is smaller than its parent. In this case the nodes have to
  1227. * be switched.
  1228. *
  1229. * @parm LPESI | lpesi |
  1230. * Pointer to external sort info, which contains all info
  1231. *
  1232. * @parm LONG | index |
  1233. * Index of the inserted node
  1234. *
  1235. *************************************************************************/
  1236. VOID PASCAL NEAR PQueueUp
  1237. (_LPIPB lpipb, LPESB FAR *lrgPriorityQueue, LONG index)
  1238. {
  1239. LPESB lpesbTemp; // Pointer to the inserted node
  1240. LPESB lpesbHalf; // Pointer to the parent node
  1241. WORD uiHalf; // Index of the parent's node
  1242. lpesbTemp = lrgPriorityQueue [index];
  1243. if ((uiHalf = (WORD) (index/2)) == 0)
  1244. return;
  1245. lpesbHalf = lrgPriorityQueue [uiHalf];
  1246. /* If the parent node is greated than the child, then exchange the
  1247. * nodes, The condition uiHalf != index makes sure that we stop
  1248. * at node 0 (top node)
  1249. */
  1250. while (uiHalf && CompareRecordBuffers (lpipb, (LPB)lpesbHalf->lrgbMem +
  1251. lpesbHalf->ibBuf, (LPB)lpesbTemp->lrgbMem + lpesbTemp->ibBuf) > 0)
  1252. {
  1253. lrgPriorityQueue [index] = lpesbHalf;
  1254. index = uiHalf;
  1255. uiHalf = (WORD)(index/2);
  1256. lpesbHalf = lrgPriorityQueue [uiHalf];
  1257. }
  1258. lrgPriorityQueue[index] = lpesbTemp;
  1259. #if BINHN
  1260. SetQueue (&lpipb->esi);
  1261. #endif
  1262. }
  1263. /*************************************************************************
  1264. *
  1265. * @doc INTERNAL INDEXING
  1266. *
  1267. * @func VOID | PQueueDown |
  1268. * The function restores the heap condition of a PQ, ie. the parent
  1269. * node must be less than the children. When the top node is removed
  1270. * the heap condition may be violated if the resulting node
  1271. * is greater than its children. In this case the nodes have to
  1272. * be switched.
  1273. *
  1274. * @parm LPESI | lpesi |
  1275. * Pointer to external sort info, which contains all info
  1276. *
  1277. *************************************************************************/
  1278. PRIVATE VOID PASCAL NEAR PQueueDown (_LPIPB lpipb)
  1279. {
  1280. LPESI lpesi = &lpipb->esi;
  1281. LPESB FAR *lrgPriorityQueue;
  1282. int CurIndex;
  1283. int ChildIndex;
  1284. int MaxCurIndex;
  1285. int MaxChildIndex;
  1286. LPESB lpesbSaved;
  1287. LPESB lpesbTemp;
  1288. LPESB lpesbChild;
  1289. lrgPriorityQueue = lpesi->lrgPriorityQueue;
  1290. lpesbSaved = lrgPriorityQueue[1];
  1291. MaxCurIndex = (MaxChildIndex = lpesi->uiQueueSize) / 2;
  1292. for (CurIndex = 1; CurIndex <= MaxCurIndex; CurIndex = ChildIndex)
  1293. {
  1294. // Get child index
  1295. ChildIndex = CurIndex * 2;
  1296. // Find the minimum of the two children
  1297. if (ChildIndex < MaxChildIndex)
  1298. {
  1299. if ((lpesbTemp = lrgPriorityQueue[ChildIndex + 1]) != NULL)
  1300. {
  1301. lpesbChild = lrgPriorityQueue[ChildIndex];
  1302. // The two children exist. Take the smallest
  1303. if (CompareRecordBuffers
  1304. (lpipb, (LPB)lpesbChild->lrgbMem + lpesbChild->ibBuf,
  1305. (LPB)lpesbTemp->lrgbMem + lpesbTemp->ibBuf) >= 0)
  1306. ChildIndex++;
  1307. }
  1308. }
  1309. // If the parent's node is less than the child, then break
  1310. // (heap condition met)
  1311. if (ChildIndex > MaxChildIndex)
  1312. break;
  1313. lpesbTemp = lrgPriorityQueue [ChildIndex];
  1314. if (CompareRecordBuffers (lpipb, (LPB)lpesbSaved->lrgbMem +
  1315. lpesbSaved->ibBuf, (LPB)lpesbTemp->lrgbMem+lpesbTemp->ibBuf) < 0)
  1316. break;
  1317. // Replace the node
  1318. lrgPriorityQueue [CurIndex] = lpesbTemp;
  1319. }
  1320. lrgPriorityQueue [CurIndex] = lpesbSaved;
  1321. #if _BINHN
  1322. SetQueue (lpesi);
  1323. #endif
  1324. }
  1325. PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER pHeader,
  1326. PTOPICDATA pNewTopic, int cNumOcc)
  1327. {
  1328. // PTOPICDATA pLastTopic;
  1329. PTOPICDATA pTopic, pPrevTopic;
  1330. int fResult;
  1331. if ((pTopic = pHeader->pLastTopic) == NULL)
  1332. {
  1333. // The list is empty
  1334. pHeader->pTopic = pHeader->pLastTopic = pNewTopic;
  1335. pNewTopic->pNext = NULL;
  1336. return(NULL);
  1337. }
  1338. fResult = pTopic->dwTopicId - pNewTopic->dwTopicId;
  1339. if (fResult < 0)
  1340. {
  1341. // New node. Add to the end
  1342. pNewTopic->pNext = NULL;
  1343. pHeader->pLastTopic->pNext = pNewTopic;
  1344. pHeader->pLastTopic = pNewTopic;
  1345. // Reset pNewTopic for next node allocation
  1346. return NULL;
  1347. }
  1348. if (fResult == 0)
  1349. {
  1350. // Same topic. Return pNewTopic for reuse
  1351. if (cNumOcc)
  1352. MergeOccurrence (pTopic, pNewTopic, cNumOcc);
  1353. return(pNewTopic);
  1354. }
  1355. // If we get to this point, the list is out of order
  1356. // Try to find the insertion point
  1357. pTopic = pHeader->pTopic;
  1358. pPrevTopic = NULL;
  1359. for (; pTopic->pNext; pTopic = pTopic->pNext)
  1360. {
  1361. if (pTopic->dwTopicId >= pNewTopic->dwTopicId)
  1362. {
  1363. /* We pass the inserted point */
  1364. break;
  1365. }
  1366. pPrevTopic = pTopic;
  1367. }
  1368. if (pTopic->dwTopicId == pNewTopic->dwTopicId)
  1369. {
  1370. // Same topic. Return pNewTopic for reuse
  1371. if (cNumOcc)
  1372. MergeOccurrence (pTopic, pNewTopic, cNumOcc);
  1373. return(pNewTopic);
  1374. }
  1375. // Handle empty case
  1376. if (pPrevTopic == NULL)
  1377. {
  1378. /* Insert at the beginning */
  1379. pNewTopic->pNext = pHeader->pTopic;
  1380. pHeader->pTopic = pNewTopic;
  1381. }
  1382. else
  1383. {
  1384. /* Inserted at the middle or the end */
  1385. pNewTopic->pNext = pPrevTopic->pNext;
  1386. pPrevTopic->pNext = pNewTopic;
  1387. }
  1388. // Update the last topic
  1389. while (pTopic->pNext)
  1390. {
  1391. pTopic = pTopic->pNext;
  1392. }
  1393. pHeader->pLastTopic = pTopic;
  1394. return(NULL);
  1395. }
  1396. /*************************************************************************
  1397. * @doc PRIVATE
  1398. * @func void | MergeOccurrence |
  1399. * Merge the occurrence by adding them in order
  1400. *************************************************************************/
  1401. PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA pOldTopic,
  1402. PTOPICDATA pNewTopic, int cOccNum)
  1403. {
  1404. ERRB errb;
  1405. if (CompareOccurrence (&pOldTopic->pLastOccData->OccData[0],
  1406. &pNewTopic->pOccData->OccData[0], cOccNum) <= 0)
  1407. {
  1408. // The whole last list is less than the current list. This is
  1409. // what I expect
  1410. // We just linked the 2 lists together
  1411. pOldTopic->pLastOccData->pNext = pNewTopic->pOccData;
  1412. pOldTopic->pLastOccData = pNewTopic->pLastOccData;
  1413. pOldTopic->dwOccCount += pNewTopic->dwOccCount;
  1414. return;
  1415. }
  1416. // The current list is less than the old list.
  1417. // This is weird, but still we can handle it
  1418. if (CompareOccurrence (&pNewTopic->pOccData->OccData[0],
  1419. &pOldTopic->pOccData->OccData[0], cOccNum) <= 0)
  1420. {
  1421. pNewTopic->pLastOccData->pNext = pOldTopic->pOccData;
  1422. pOldTopic->pOccData = pNewTopic->pOccData;
  1423. pOldTopic->dwOccCount += pNewTopic->dwOccCount;
  1424. return;
  1425. }
  1426. SetErrCode (&errb, E_ASSERT);
  1427. }
  1428. /*====================================================================*/
  1429. #ifdef BINHN
  1430. PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi)
  1431. {
  1432. unsigned int i = 0;
  1433. LPESB FAR *lrgPriorityQueue;
  1434. lrgPriorityQueue = pEsi->lrgPriorityQueue;
  1435. for (i = 0; i < 20 && i < pEsi->cesb ; i++)
  1436. {
  1437. if (lrgPriorityQueue[i])
  1438. pEsi->lpbQueueStr[i] = lrgPriorityQueue[i]->lrgbMem +
  1439. lrgPriorityQueue[i]->ibBuf + 6;
  1440. }
  1441. }
  1442. #endif
  1443. /************************************************************************
  1444. * @doc PRIVATE
  1445. * @func HRESULT PASCAL NEAR | ESBBlockAllocate |
  1446. * Set the memory allocation based on the memory of the machine
  1447. * @parm DWORD | lMemSize |
  1448. * Memory allocated for the indexer
  1449. * @rdesc S_OK, or E_OUTOFMEMORY
  1450. ************************************************************************/
  1451. PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize)
  1452. {
  1453. DWORD dwTopicSize;
  1454. DWORD dwOccSize;
  1455. WORD OccNodeSize = sizeof (OCCDATA) - 1 + sizeof(DWORD) *
  1456. lpipb->ucNumOccDataFields; // About 24bytes
  1457. OccNodeSize = (OccNodeSize + 3) & ~3;
  1458. /* The memory is for topic block and occurrence blocks, which
  1459. * should be in the ratio 1:1.5
  1460. */
  1461. dwTopicSize = (lMemSize * 2) / 5;
  1462. dwOccSize = lMemSize - dwTopicSize;
  1463. #if 0
  1464. /* Don't do anything if things are too small */
  1465. if (dwTopicSize < MAX_BLOCK_SIZE || dwOccSize < MAX_BLOCK_SIZE)
  1466. return(E_OUTOFMEMORY);
  1467. #endif
  1468. // Allocate a block manager for topic node
  1469. if ((lpipb->TopicBlock.pBlockMgr =
  1470. BlockInitiate ((MAX_BLOCK_SIZE * sizeof(TOPICDATA)/sizeof(TOPICDATA)),
  1471. sizeof(TOPICDATA),
  1472. (WORD)(dwTopicSize/MAX_BLOCK_SIZE),
  1473. USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
  1474. {
  1475. exit2:
  1476. return SetErrCode (NULL, E_OUTOFMEMORY);
  1477. }
  1478. lpipb->TopicBlock.pFreeList =
  1479. (PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr);
  1480. // Allocate a block manager for occ node
  1481. if ((lpipb->OccBlock.pBlockMgr =
  1482. BlockInitiate((MAX_BLOCK_SIZE * OccNodeSize)/OccNodeSize,
  1483. OccNodeSize, (WORD)(lMemSize / MAX_BLOCK_SIZE),
  1484. USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
  1485. {
  1486. BlockFree(lpipb->BTNodeBlock.pBlockMgr);
  1487. lpipb->BTNodeBlock.pBlockMgr = NULL;
  1488. goto exit2;
  1489. }
  1490. lpipb->OccBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr);
  1491. return (S_OK);
  1492. }
  1493. PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO pBlockCombo)
  1494. {
  1495. PLIST pList;
  1496. if (pBlockCombo->pFreeList == NULL)
  1497. {
  1498. if ((BlockGrowth (pBlockCombo->pBlockMgr) != S_OK))
  1499. return (NULL);
  1500. pBlockCombo->pFreeList =
  1501. (PLIST)BlockGetLinkedList(pBlockCombo->pBlockMgr);
  1502. }
  1503. pList = pBlockCombo->pFreeList;
  1504. pBlockCombo->pFreeList = pList->pNext;
  1505. pBlockCombo->dwCount ++;
  1506. // pList->pNext = NULL;
  1507. return (pList);
  1508. }
  1509. /*************************************************************************
  1510. *
  1511. * @doc INTERNAL
  1512. *
  1513. * @func BOOL FAR PASCAL | BuildIndexFile |
  1514. * This function is for debugging purpose only. In normal indexing,
  1515. * it will never be called. Since collecting words and indexing can
  1516. * take a long time, debugging the index phase can become a hassle that
  1517. * take several hours per shot. To minimize the index time for debugging,
  1518. * all the intermediate files are saved, which are:
  1519. * - the internal sorted result file, which contains all words and
  1520. * their occurrences sorted
  1521. * - the external sorted result file, which is a snap shot of the
  1522. * ESI structures and its ESB blocks
  1523. * The only steps left will be processing the occurrence list and doing
  1524. * permanent index
  1525. *
  1526. * To use the function, add the following lines in the app:
  1527. *
  1528. * extern HRESULT PASCAL FAR BuildIndexFile (LPSTR, LPSTR, LPSTR, WORD, WORD,
  1529. * WORD, INTERRUPT_FUNC, VOID FAR *, STATUS_FUNC, VOID FAR*, PHRESULT);
  1530. *
  1531. * int fDotest;
  1532. *
  1533. * if (fDotest) {
  1534. * return BuildIndexFile ((LPSTR)"c:/tmp/test.mvb!MVINDEX",
  1535. * (LPSTR)"c:/tmp/esi.tmp", (LPSTR)"c:/tmp/iso.tmp",
  1536. * OCCF_TOPICID, IDXF_NORMALIZE, 0, (INTERRUPT_FUNC)lpfnInterruptFunc,
  1537. * (LPV)NULL,
  1538. * (STATUS_FUNC)lpfnStatusFunc, (LPV)hwndGlobal,
  1539. * NULL);
  1540. * }
  1541. *
  1542. * @parm HFPB | hfpb |
  1543. * HFPB for index file if pstrIndexFile is NULL
  1544. *
  1545. * @parm LPB | pstrIndexFile |
  1546. * The .MVB + index file, usually with the format TEST.MVB!MVINDEX
  1547. *
  1548. * @parm LPB | lpbEsiFile |
  1549. * The external sort info file
  1550. *
  1551. * @parm LPB | lpbIsiFile |
  1552. * The internal sorted info filename
  1553. *
  1554. * @parm PINDEXINFO | pIndexInfo |
  1555. * IndexInfo
  1556. *
  1557. * @rdesc S_OK if succeeded, else other non-zero error codes
  1558. *************************************************************************/
  1559. PUBLIC HRESULT PASCAL EXPORT_API FAR BuildIndexFile
  1560. (HFPB hfpb, LPSTR pstrIndexFile,
  1561. LPB lpbEsiFile, LPB lpbIsiFile, PINDEXINFO pIndexInfo)
  1562. {
  1563. _LPIPB lpipb;
  1564. LPESI lpesi;
  1565. BOOL fRet;
  1566. ERRB errb;
  1567. DWORD loop;
  1568. FLOAT rLog;
  1569. BYTE bKeyIndex = 0;
  1570. if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
  1571. return E_FAIL;
  1572. lpesi = &lpipb->esi;
  1573. if (LoadEsiTemp (lpipb, lpesi, lpbEsiFile, lpbIsiFile, NULL) != S_OK)
  1574. {
  1575. fRet = E_FAIL;
  1576. exit0:
  1577. MVIndexDispose (lpipb);
  1578. return fRet;
  1579. }
  1580. if (lpipb->idxf & IDXF_NORMALIZE)
  1581. {
  1582. // Allocate a huge buffer to contain all the sigma terms
  1583. if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  1584. (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
  1585. return SetErrCode (&errb, E_OUTOFMEMORY);
  1586. lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);
  1587. // Small buffer containing pre-calculated values
  1588. if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  1589. (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
  1590. {
  1591. SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
  1592. FreeHandle (lpipb->wi.hSigma);
  1593. goto exit0;
  1594. }
  1595. lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);
  1596. // Initialize the array
  1597. for (loop = cLOG_MAX - 1; loop > 0; --loop)
  1598. {
  1599. #ifndef ISBU_IR_CHANGE
  1600. rLog = (float) log10(cHundredMillion/(double)loop);
  1601. #else
  1602. rLog = (float)1.0 / (float)loop;
  1603. #endif // ISBU_IR_CHANGE
  1604. lpipb->wi.lrgrLog[loop] = rLog * rLog;
  1605. }
  1606. }
  1607. if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
  1608. return SetErrCode (&errb, (HRESULT)fRet);
  1609. if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
  1610. FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);
  1611. // If we are doing term-weighting we have to square root all sigma values
  1612. if (lpipb->idxf & IDXF_NORMALIZE)
  1613. {
  1614. // ISBU_IR_CHANGE not necessary 'cos sqrt computation is necessary in both cases
  1615. for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
  1616. lpipb->wi.hrgsigma[loop] =
  1617. (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
  1618. }
  1619. // Analyze data to get the best compression scheme
  1620. // TopicId
  1621. VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
  1622. &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  1623. // Occurrence Count
  1624. VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
  1625. &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  1626. if (lpipb->occf & OCCF_COUNT)
  1627. {
  1628. VGetBestScheme(&lpipb->cKey[bKeyIndex],
  1629. &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  1630. bKeyIndex++;
  1631. }
  1632. if (lpipb->occf & OCCF_OFFSET)
  1633. {
  1634. VGetBestScheme(&lpipb->cKey[bKeyIndex],
  1635. &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
  1636. bKeyIndex++;
  1637. }
  1638. // Call the user callback every once in a while
  1639. if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
  1640. {
  1641. PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
  1642. CALLBACKINFO Info;
  1643. Info.dwPhase = 2;
  1644. Info.dwIndex = 100;
  1645. fRet = (*pCallbackInfo->MessageFunc)
  1646. (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
  1647. if (S_OK != fRet)
  1648. goto exit0;
  1649. }
  1650. // Build the permanent index
  1651. fRet = BuildBTree(NULL, lpipb, lpipb->esi.aszTempName, hfpb, pstrIndexFile);
  1652. if (lpipb->idxf & IDXF_NORMALIZE)
  1653. {
  1654. FreeHandle (lpipb->wi.hLog);
  1655. FreeHandle (lpipb->wi.hSigma);
  1656. }
  1657. goto exit0;
  1658. }
  1659. PRIVATE VOID PASCAL NEAR SaveEsiTemp (_LPIPB lpipb, LPESI lpesi)
  1660. {
  1661. GHANDLE hfpb;
  1662. LPESB lpesb;
  1663. char szEsi[100];
  1664. GETTEMPFILENAME ((char)0, "foo", 0, szEsi);
  1665. if ((hfpb = FileOpen(NULL, szEsi, REGULAR_FILE, READ_WRITE, NULL)) == NULL)
  1666. return;
  1667. FileWrite(hfpb, lpipb, sizeof(IPB), NULL);
  1668. for (lpesb = lpesi->lpesbRoot; lpesb; lpesb = lpesb->lpesbNext)
  1669. {
  1670. if (FileWrite(hfpb, lpesb, sizeof(ESB), NULL) != sizeof(ESB))
  1671. {
  1672. FileClose (hfpb);
  1673. FileUnlink (NULL, szEsi, REGULAR_FILE);
  1674. return;
  1675. }
  1676. }
  1677. FileClose (hfpb);
  1678. MEMCPY (lpipb->szEsiTemp, szEsi, 100);
  1679. }
  1680. PRIVATE VOID PASCAL NEAR UpdateEsiTemp (_LPIPB lpipb)
  1681. {
  1682. GHANDLE hfpb;
  1683. if ((hfpb = FileOpen(NULL, lpipb->szEsiTemp, REGULAR_FILE,
  1684. READ_WRITE, NULL)) == NULL)
  1685. return;
  1686. FileWrite(hfpb, lpipb, sizeof(IPB), NULL);
  1687. FileClose (hfpb);
  1688. }
  1689. PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
  1690. LPB lpbIsiFile, PHRESULT phr)
  1691. {
  1692. LPESB lpesb;
  1693. HFILE hFile;
  1694. ESB esb;
  1695. HANDLE hesb;
  1696. HRESULT fRet;
  1697. IPB ipb;
  1698. LPISI pIsi = &lpipb->isi; // Pointer to internal sort info
  1699. /* Copy the internal sort info filename */
  1700. MEMCPY (pIsi->aszTempName, lpbIsiFile, lstrlen(lpbIsiFile));
  1701. /* Read in the external sort buffer info */
  1702. if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
  1703. return E_NOTEXIST;
  1704. /* Read old IPB info */
  1705. _lread (hFile, &ipb, sizeof(IPB));
  1706. /* Transfer meaningful data */
  1707. lpipb->dwIndexedWord = ipb.dwIndexedWord;
  1708. lpipb->dwUniqueWord = ipb.dwUniqueWord;
  1709. lpipb->dwByteCount = ipb.dwByteCount;
  1710. lpipb->dwOccOffbits = ipb.dwOccOffbits;
  1711. lpipb->dwOccExtbits = ipb.dwOccExtbits;
  1712. lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
  1713. lpipb->dwMaxWCount = ipb.dwMaxWCount;
  1714. lpipb->dwMaxOffset = ipb.dwMaxOffset;
  1715. lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
  1716. lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
  1717. lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
  1718. lpipb->lcTopics = ipb.lcTopics;
  1719. lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
  1720. // lpipb->dwMemAllowed = ipb.dwMemAllowed;
  1721. lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
  1722. lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
  1723. lpipb->dwMaxWLen = ipb.dwMaxWLen;
  1724. lpipb->idxf = ipb.idxf;
  1725. while ((_lread (hFile, &esb, sizeof(ESB))) == sizeof(ESB))
  1726. {
  1727. if ((hesb = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
  1728. sizeof(ESB))) == NULL) {
  1729. fRet = SetErrCode (phr,E_OUTOFMEMORY);
  1730. exit0:
  1731. _lclose (hFile);
  1732. return fRet;
  1733. }
  1734. lpesb = (LPESB)_GLOBALLOCK (hesb);
  1735. /* Copy the ESB information */
  1736. *lpesb = esb;
  1737. /* Update the structure */
  1738. lpesb->hStruct = hesb;
  1739. lpesb->lpesbNext = lpesi->lpesbRoot;
  1740. lpesi->lpesbRoot= lpesb;
  1741. lpesi->cesb ++;
  1742. }
  1743. _lclose (hFile);
  1744. fRet = S_OK;
  1745. goto exit0;
  1746. }
  1747. HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb)
  1748. {
  1749. ERRB errb;
  1750. DWORD loop;
  1751. float rLog;
  1752. if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  1753. (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
  1754. return SetErrCode (&errb, E_OUTOFMEMORY);
  1755. lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);
  1756. if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
  1757. (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
  1758. {
  1759. FreeHandle (lpipb->wi.hSigma);
  1760. return SetErrCode (&errb, E_OUTOFMEMORY);
  1761. }
  1762. lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);
  1763. // Initialize the array
  1764. for (loop = cLOG_MAX - 1; loop > 0; --loop)
  1765. {
  1766. #ifndef ISBU_IR_CHANGE
  1767. rLog = (float) log10(cHundredMillion/(double)loop);
  1768. #else
  1769. rLog = (float)1.0 / (float)loop;
  1770. #endif // ISBU_IR_CHANGE
  1771. lpipb->wi.lrgrLog[loop] = rLog * rLog;
  1772. }
  1773. return(S_OK);
  1774. }