Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

884 lines
34 KiB

  1. /*************************************************************************
  2. * *
  3. * Copyright (C) Microsoft Corporation 1990-1994 *
  4. * All Rights reserved. *
  5. * *
  6. **************************************************************************
  7. * *
  8. * Module Intent *
  9. * All typedefs and defines needed for user's retrieval *
  10. * *
  11. *************************************************************************/
  12. #ifndef __MVSEARCH_H_
  13. #define __MVSEARCH_H_
  14. #ifdef __cplusplus
  15. extern "C" {
  16. #endif
  17. #pragma pack(1) // Guard against Zp problems.
  18. #include <iterror.h>
  19. /*************************************************************************
  20. * Basic defines.
  21. *************************************************************************/
  22. #ifdef _32BIT // 32-bit build: the export/huge decorations expand to nothing
  23. #define EXPORT_API
  24. #define HUGE
  25. #else // Otherwise they map to the compiler's _export / huge keywords
  26. #define EXPORT_API _export
  27. #define HUGE huge
  28. #endif
  29. #ifdef PRIVATE // Drop any earlier definition so the one below always applies
  30. #undef PRIVATE
  31. #endif
  32. #define PRIVATE static // File-local linkage
  33. #ifdef PUBLIC // Drop any earlier definition so the one below always applies
  34. #undef PUBLIC
  35. #endif
  36. #define PUBLIC // Expands to nothing; used as a marker on the API prototypes below
  37. #define cbMAX_PATH (CB)256 // Maximum pathname length.
  38. /* Maximum year's value that can be passed to MediaView's FBreakEpoch() */
  39. #define MAX_YEAR ((unsigned long)0xFFFFFFFF / 366)
  40. /* Maximum word length that is accepted by MediaView's breaker. */
  41. #define CB_MAX_WORD_LEN ((CB)1000) // Longest legal word.
  42. /*************************************************************************
  43. * Typedef
  44. *************************************************************************/
  45. #ifndef LPB // NOTE(review): this only tests for a macro named LPB; it cannot detect an existing typedef
  46. typedef BYTE FAR * LPB; // Far pointer to bytes
  47. #endif
  48. typedef WORD IB; // Index into an array of bytes.
  49. #ifdef _32BIT
  50. typedef DWORD CB; // Count of bytes (DWORD-sized when _32BIT is defined).
  51. #else
  52. typedef WORD CB; // Count of bytes (WORD-sized otherwise).
  53. #endif
  54. typedef DWORD LCB; // Count of bytes (always DWORD-sized).
  55. typedef WORD CBIT; // Count of bits.
  56. typedef WORD FAR * LPW; // Far pointer to word
  57. typedef BYTE FAR *LSZ; // 0-terminated string far pointer
  58. typedef BYTE FAR *LST; // Pascal style string far pointer
  59. typedef void FAR *LPV; // Far void pointer
  60. typedef DWORD LFO; // 32-bit file offset.
  61. typedef DWORD LCF; // 32-bit file count bytes
  62. typedef void NEAR *NPV; // Void near pointers.
  63. typedef NPV NPIBI; // Near pointer to internal break info
  64. typedef WORD IDXF; // Index flag bits (see the IDXF_* values)
  65. typedef LPV LPIBI; // Far pointer to internal break info
  66. typedef LPV LPSIPB; // Stop information parameter block.
  67. typedef LPV LPCAT; // Pointer to catalog
  68. typedef LPV LPGROUP; // pointer to a group.
  69. typedef LPV LPIDX; // Pointer index block.
  70. typedef LPV LPQT; // Pointer to Query tree.
  71. typedef LPV LPIPB; // Pointer to Index parameter block.
  72. typedef LPV LPWHEEL; // Pointer to wheel parameter block.
  73. typedef LPV LPHL; // Pointer to hitlist block.
  74. typedef LPV LPCTAB; // Pointer to chartab
  75. typedef LPV LPOPTAB; // Pointer to operator table
  76. typedef LPV LPBRKI; // Pointer to breaker info
  77. typedef WORD OCCF; // Occurrence field flag bits (see the OCCF_* values)
  78. typedef HANDLE GHANDLE; // Generic handle; base type of HIDX and HFPB below
  79. typedef HANDLE HGPOUP; // Handle to Group list. NOTE(review): name looks like a misspelling of HGROUP; kept for compatibility
  80. typedef DWORD IDGROUP; // Group's ID
  81. typedef GHANDLE HIDX; // presumably a handle to an index -- confirm
  82. typedef GHANDLE HFPB; // Handle type used for the file arguments of the load/build APIs below
  83. /*************************************************************************
  84. * Word-breaker API and associated defines.
  85. *************************************************************************/
  86. typedef HANDLE HIBI; // "Internal break info". The individual
  87. // word breakers allocate this
  88. /*
  89. * FWORDCB
  90. * Call back function needed for MediaView breaker. All the LST strings
  91. * are special 2-byte length preceded strings.
  92. *
  93. * LST lstRawWord:
  94. * Word as it appears originally. MediaView only uses its length, which
  95. * is needed for highlighting
  96. *
  97. * LST lstNormWord:
  98. * Normalized word, which will be indexed. Normalized are words that
  99. * are modified (such as stemmed, changed to lower case, etc)
  100. *
  101. * DWORD dwOffset:
  102. * Offset in the topic (or from the beginning of the buffer passed to
  103. * MediaView breakers) where the word occurs
  104. *
  105. * LPV lpUser:
  106. * User's data, propagated down to the user's callback function
  107. */
  108. typedef ERR (FAR PASCAL * FWORDCB)(LST lstRawWord, LST lstNormWord,
  109. DWORD dwOffset, LPV lpUser);
  110. /* BREAKER_INIT
  111. * This is the breaker's initialization routine. This routine will
  112. * be called only by MediaView's Title Builder, before any calls to
  113. * the breaker are made
  114. */
  115. typedef LPIBI (FAR PASCAL * BREAKER_INIT)(VOID);
  116. /* BREAKER_FREE
  117. * Termination routine for the breaker. This will allow the breaker
  118. * to free any internal buffer used by it
  119. */
  120. typedef void (FAR PASCAL * BREAKER_FREE)(LPIBI);
  121. /*
  122. * Breaker function's parameter structure:
  123. *
  124. * BRK_PARMS structure
  125. * LPIBI lpInternalBreakInfo:
  126. * This points to internal information associated with a breaker
  127. * (such as memory buffer, flags, etc). It is solely used by
  128. * that breaker
  129. *
  130. * BYTE FAR * lpbBuf;
  131. * Buffer containing the strings to be broken into individual words
  132. * This buffer is allocated by the application
  133. *
  134. * DWORD cbBufCount;
  135. * The size of the buffer
  136. *
  137. * DWORD lcbBufOffset;
  138. * The offset of the strings from the topic. This is needed if
  139. * OCCF_OFFSET is used, since the MV breaker will return offsets
  140. * of the words based on this offset
  141. *
  142. * LPV lpvUser
  143. * Anything that the application's callback function needs. The way
  144. * the breaker works is that:
  145. * - The application calls the breaker with some buffers to be broken
  146. * into words
  147. * - For each word the breaker will call the app's callback function
  148. * to return the word and its associated information (length, offset)
  149. *
  150. * FWORDCB lpfnOutWord;
  151. * Pointer to application callback function
  152. *
  153. * LPSIPB lpStopInfoBlock;
  154. * Stop word information. This contains a list of words that the
  155. * application wants the breaker to ignore. This pertains to
  156. * MediaView's breaker only
  157. *
  158. * LPVOID lpCharTab;
  159. * Character table information. This pertains to MediaView's breaker
  160. * only
  161. *
  162. * WORD fFlags;
  163. * Internal flags set and used by MediaView's breaker only
  164. *
  165. * } BRK_PARMS, FAR *LPBRK_PARMS;
  166. */
  167. typedef struct BRK_PARMS // Parameter block taken by every BREAKER_FUNC
  168. {
  169. LPIBI lpInternalBreakInfo; // Breaker-private state; used solely by that breaker
  170. BYTE FAR * lpbBuf; // Application-allocated buffer holding the text to break
  171. DWORD cbBufCount; // Size of the buffer
  172. DWORD lcbBufOffset; // Offset of the text within the topic (needed when OCCF_OFFSET is used)
  173. LPV lpvUser; // Application data for the callback
  174. FWORDCB lpfnOutWord; // Application callback, called for each word found
  175. LPSIPB lpStopInfoBlock; // Stop-word list (MediaView's breaker only)
  176. LPVOID lpCharTab; // Character table (MediaView's breaker only)
  177. WORD fFlags; // Internal flags set and used by MediaView's breaker only
  178. WORD Pad; // Padding to make DWORD align
  179. } BRK_PARMS, FAR *LPBRK_PARMS;
  180. /* BREAKER_FUNC
  181. * Breaker's function prototype for various MediaView's breaker functions
  182. * such as FBreakWords()
  183. */
  184. typedef ERR (FAR PASCAL * BREAKER_FUNC) (LPBRK_PARMS);
  185. /*
  186. * BRKLIST structure
  187. * For internal use only
  188. */
  189. typedef struct BreakList // Per-breaker record (internal use only)
  190. {
  191. HANDLE hnd; // handle to this structure
  192. HANDLE hLib; // presumably the library the breaker function comes from -- confirm
  193. BREAKER_FUNC lpfnBreakFunc; // Breaker entry point
  194. LPSIPB lpStopListInfo; // Stop information parameter block
  195. LPVOID lpCharTab; // presumably the character table used with this breaker -- confirm
  196. } BRKLIST, FAR *LPBRKLIST;
  197. /*************************************************************************
  198. *
  199. * The following breaker functions are internal functions.
  200. * They can serve as template prototypes for user-supplied functions.
  201. *************************************************************************/
  202. PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void); // Same signature as BREAKER_INIT
  203. PUBLIC void EXPORT_API FAR PASCAL BreakerFree(LPIBI); // Same signature as BREAKER_FREE
  204. PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS); // The FBreak* functions share the BREAKER_FUNC signature
  205. PUBLIC ERR EXPORT_API FAR PASCAL FBreakNumber(LPBRK_PARMS);
  206. PUBLIC ERR EXPORT_API FAR PASCAL FBreakDate(LPBRK_PARMS);
  207. PUBLIC ERR EXPORT_API FAR PASCAL FBreakTime(LPBRK_PARMS);
  208. PUBLIC ERR EXPORT_API FAR PASCAL FBreakEpoch(LPBRK_PARMS); // Year values are limited by MAX_YEAR
  209. // This exists only to enable MVJK to link statically.
  210. // We must have the same function names for the static build.
  211. PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms); // Unlike the others, declared without EXPORT_API
  212. // (EX)ternal (BR)ea(K)er (P)ara(M)eter structure that old .c search code
  213. // can pass to ExtBreakText() in order to configure and call the
  214. // new COM breakers. The breaker control params have been purposely defined
  215. // to mimic those in BRK_PARMS as much as possible. Note that the ones
  216. // missing are now internal to the COM breaker implementation.
  217. typedef struct _exbrkpm
  218. {
  219. // This section is to be filled in by the .c caller of the breaker.
  220. DWORD dwBreakWordType; // Reg. text, number, date, etc.
  221. LPBYTE lpbBuf; // Text buffer.
  222. DWORD cbBufCount; // No. of chars in buffer.
  223. LPVOID lpvUser; // Caller data that gets passed through
  224. // to *lpfnOutWord.
  225. FWORDCB lpfnOutWord; // Pointer to word callback function.
  226. WORD fFlags; // Breaker flags.
  227. // This section is owned by the index COM object and should not be
  228. // modified by the .c caller.
  229. LPVOID lpvIndexObjBridge;
  230. } EXBRKPM, *PEXBRKPM;
  231. PUBLIC HRESULT EXPORT_API FAR PASCAL ExtBreakText(PEXBRKPM pexbrkpm);
  232. PUBLIC HRESULT EXPORT_API FAR PASCAL ExtStemWord(LPVOID lpvIndexObjBridge,
  233. LPBYTE lpbStemWord, LPBYTE lpbRawWord);
  234. PUBLIC HRESULT EXPORT_API FAR PASCAL ExtLookupStopWord(
  235. LPVOID lpvIndexObjBridge, LPBYTE lpbStopWord);
  236. PUBLIC HRESULT EXPORT_API FAR PASCAL ExtAddQueryResultTerm(
  237. LPVOID lpvIndexObjBridge, LPBYTE lpbTermHit, LPVOID *ppvTermHit);
  238. /*************************************************************************
  239. * @doc API EXTERNAL
  240. * @func ERR FAR PASCAL | fInterrupt |
  241. * Function to support interrupt (cancel) feature.
  242. * @parm LPVOID | lpV |
  243. * Parameter used by the callback interrupt function
  244. * @rdesc ERR_SUCCESS if there is no interrupt, else ERR_INTERRUPT
  245. *************************************************************************/
  246. typedef ERR (FAR PASCAL *INTERRUPT_FUNC)(LPVOID);
  247. /*************************************************************************
  248. * @doc API EXTERNAL
  249. * @func VOID FAR PASCAL | fStatus |
  250. * Function to support status messaging feature.
  251. * @parm LPSTR | lpStr |
  252. * Message to be displayed
  253. * @rdesc None (the STATUS_FUNC callback type below is declared as returning VOID)
  254. *************************************************************************/
  255. typedef VOID (FAR PASCAL *STATUS_FUNC)(LPSTR);
  256. #define BREAKERBUFFERSIZE 1024 // Size of breaker's state info struct
  257. /*
  258. * Breaker Table Constants
  259. */
  260. #define MAXNUMBRKRS 16 // maximum number of breakers.
  261. #define MAXBRKRLEN 1024 // maximum size of breaker line in |SYSTEM.
  262. #ifndef ISBU_IR_CONSTS
  263. #define ISBU_IR_CONSTS
  264. #define cHundredMillion ((float) 100000000.0)
  265. #define cVerySmallWt ((float) 0.02)
  266. #define cNintyFiveMillion 95000000
  267. #define cTFThreshold 4096
  268. #endif // ISBU_IR_CONSTS
  269. /*************************************************************************
  270. * Stop list retrieval API.
  271. *************************************************************************/
  272. typedef ERR (FAR PASCAL * STOPLKUP)(LPSIPB, LST);
  273. PUBLIC LPSIPB EXPORT_API FAR PASCAL MVStopListInitiate (WORD wTabSize,
  274. LPERRB lperrb);
  275. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListIndexLoad(HFPB hSysFile,
  276. LPSIPB lpsipb, LSZ szStopFilename);
  277. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListLookup(LPSIPB lpsipb,
  278. LST sPascalString);
  279. PUBLIC void EXPORT_API FAR PASCAL MVStopListDispose(LPSIPB lpsipb);
  280. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListAddWord(LPSIPB lpsipb,
  281. LST sPascalString);
  282. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListLoad(HFPB hfpbIn, LPSIPB lpsipb,
  283. LSZ szFilename, BREAKER_FUNC lpfnBreakerFunc,
  284. LPV lpCharTab);
  285. PUBLIC ERR EXPORT_API PASCAL FAR MVStopFileBuild (HFPB hSysFile,
  286. LPSIPB lpsipb, LSZ szStopfilename);
  287. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListEnumWords(LPSIPB lpsipb,
  288. LST *plstWord, LONG *plWordInfo, LPVOID *ppvWordInfo);
  289. PUBLIC ERR EXPORT_API FAR PASCAL MVStopListFindWordPtr(LPSIPB lpsipb,
  290. LST lstWord, LST *plstWordInList);
  291. /*************************************************************************
  292. * Query's defines and API
  293. *************************************************************************/
  294. /* User's operators for retrieval */
  295. #define AND_OP 0
  296. #define OR_OP 1
  297. #define NOT_OP 2
  298. #define PHRASE_OP 3
  299. #define NEAR_OP 4
  300. #define DEF_PROX_DIST 8 // Default prox distance
  301. // In strings with embedded font tag, this number denotes that the next byte
  302. // is an index into a charmap table
  303. #define EMBEDFONT_BYTE_TAG 3
  304. #define TL_QKEY 0x8000 // Use with cDefOp to treat operators as words
  305. /*
  306. * Parser function's parameter structure:
  307. *
  308. * PARSE_PARMS structure
  309. * This structure contains all the information necessary for
  310. * parsing a line.
  311. *
  312. * LPCSTR lpbQuery:
  313. * Pointer to buffer containing the query to be parsed
  314. *
  315. * EXBRKPM *pexbrkpm:
  316. * External breaker parameter structure (see EXBRKPM)
  317. *
  318. * DWORD cbQuery:
  319. * Query's buffer length
  320. */
  321. typedef struct PARSE_PARMS // Passed to MVQueryParse() via LPPARSE_PARMS
  322. {
  323. LPCSTR lpbQuery; /* Pointer to query buffer */
  324. EXBRKPM *pexbrkpm; /* External breaker param structure */
  325. DWORD cbQuery; /* Query buffer's length */
  326. /* Note: all the following fields may be gone in the future if
  327. * we provide new operators to support them
  328. */
  329. LPGROUP lpGroup; /* Group */
  330. LPVOID lpOpTab; /* Operator table */
  331. WORD wCompoundWord; /* NOTE(review): meaning not documented in this header */
  332. WORD cProxDist; /* Proximity distance (see DEF_PROX_DIST for the default) */
  333. WORD cDefOp; /* Default operator; TL_QKEY is used with it to treat operators as words */
  334. char padding[2]; /* Explicit padding bytes */
  335. } PARSE_PARMS, FAR *LPPARSE_PARMS;
  336. PUBLIC LPQT EXPORT_API FAR PASCAL MVQueryParse(LPPARSE_PARMS, LPERRB);
  337. PUBLIC void EXPORT_API FAR PASCAL MVQueryFree(LPQT);
  338. /*************************************************************************
  339. * Index & Hitlist Retrieval API
  340. *************************************************************************/
  341. /*
  342. * This is an information buffer structure that you pass to "HitListGetTopic"
  343. * which fills in its fields. You can look at the fields not marked as
  344. * "internal", and also pass it to other API functions.
  345. */
  346. typedef struct TopicInfo // Filled in by MVHitListGetTopic(); also passed to MVHitListGetHit()
  347. {
  348. DWORD dwTopicId; // Topic-ID associated with this hit.
  349. DWORD lcHits; // Number of hits in this document.
  350. union // Anonymous union: the two internal members below share storage
  351. {
  352. DWORD liFirstHit; // Index in the ROCC file of the first
  353. // hit in this document (internal).
  354. LPV lpTopicList; // Pointer to TopicList (internal)
  355. };
  356. WORD wWeight; // Document-weight.
  357. WORD Pad; // presumably alignment padding, as in BRK_PARMS -- confirm
  358. } TOPICINFO,
  359. FAR *PTOPICINFO;
  360. /*
  361. * This is an information buffer structure that you pass to "HitListGetHit"
  362. * which fills in its fields.
  363. */
  364. typedef struct HitInfo // Filled in by MVHitListGetHit(); element type of MATCHCACHE.match
  365. {
  366. DWORD dwOffset; // Byte-offset associated with this hit.
  367. DWORD dwFieldId; // Field-ID associated with this hit.
  368. DWORD dwCount; // presumably the word count (position), as in OCC.dwCount -- confirm
  369. DWORD dwLength; // Word-length associated with this hit.
  370. LPVOID lpvTerm; // Pointer to a term in WORD-prefix length
  371. // Unicode format, i.e. a "wide ST".
  372. } HIT,
  373. FAR *LPHIT;
  374. // SLOP is extra bytes to handle diacritics. Each byte represents an
  375. // occurrence of one diacritic. 5 of them should be more than enough to
  376. // handle all diacritics in a word. This setup allows us to
  377. // simplify the checking by just doing it for RawWord only
  378. #define SLOP 5
  379. /* FBreakWord states */
  380. #define SCAN_WHITE_STATE 0
  381. #define SCAN_WORD_STATE 1
  382. #define SCAN_NUM_STATE 2
  383. #define SCAN_SEP_STATE 3
  384. #define SCAN_LEADBYTE_STATE 4
  385. #define SCAN_SBKANA_STATE 5
  386. /* Other breaker functions' states */
  387. #define INITIAL_STATE 0
  388. #define COLLECTING_STATE 1
  389. // The following defines have an impact on the speed of the breaker.
  390. // The character class that appears the most should have the lowest
  391. // value (eg. 1), since the compiler will generate DEC AX, JE Lab
  392. // We want that class to be executed first. Since most documents have
  393. // more lower case (normalized) characters, CLASS_NORM should be 1
  394. #define NO_CLASS 0
  395. #define CLASS_NORM 0x01 // The char is already normalized
  396. #define CLASS_CHAR 0x02 // The char needs to be normalized
  397. #define CLASS_DIGIT 0x03 // The char is a digit
  398. #define CLASS_NSTRIP 0x04 // Strip from number (like comma)
  399. #define CLASS_NKEEP 0x05 // Keep with number (eg. decimal point)
  400. #define CLASS_STRIP 0x06 // Strip from the word (eg. apostrophe)
  401. #define CLASS_TYPE 0x07 // Reserved
  402. #define CLASS_CONTEXTNSTRIP 0x08 // Stripped or not depending on context
  403. #define CLASS_WILDCARD 0x09 // This is a wildcard char
  404. #define CLASS_LEADBYTE 0x0A // This is a DBCS lead-byte
  405. #define CLASS_SBKANA 0x0B // This is a single Kana byte
  406. #define CLASS_LIGATURE 0x0C // This is a ligature char
  407. /* Map to extract the class for special characters */
  408. #define SPECIAL_CHAR_MAP 0xFF00
  409. #define CLASS_BULLET 0x0100
  410. #define CLASS_ENDASH 0x0200
  411. #define CLASS_EMDASH 0x0300
  412. #define CLASS_LQUOTE 0x0400
  413. #define CLASS_RQUOTE 0x0500
  414. #define CLASS_LDBLQUOTE 0x0600
  415. #define CLASS_RDBLQUOTE 0x0700
  416. #define CLASS_TERMINATOR 0x0800 // Ignore whatever from this char on
  417. // for word wheel sorting
  418. /* Special reserved wildcard character */
  419. #define WILDCARD_STAR '*'
  420. #define WILDCARD_CHAR '?'
  421. /*
  422. Those fields will be imbedded in the beginning of the date, time, etc.
  423. string. This insures that only same types are compared together
  424. correctly
  425. */
  426. /* All special data types will have that byte at the beginning */
  427. #define SPECIAL_TYPE 0x1
  428. #ifdef TEST
  429. #define DATE_FORMAT 0x3131
  430. #define EPOCH_FORMAT 0x3231
  431. #define TIME_FORMAT 0x3331
  432. #define NUMBER_FORMAT 0x3431
  433. #else
  434. #define DATE_FORMAT 0x01
  435. #define EPOCH_FORMAT 0x02
  436. #define TIME_FORMAT 0x03
  437. #define NUMBER_FORMAT 0x04
  438. #endif
  439. /* Sign byte of number */
  440. #define NEGATIVE '1'
  441. #define POSITIVE '2'
  442. // - - - - - - - - -
  443. typedef struct InternalBreakInfo // Breaker working state ("internal break info")
  444. {
  445. HANDLE hibi; // Handle to this structure
  446. LCB lcb; // Byte offset of the start of the word that's
  447. // being constructed. This is equal to the
  448. // user-specified offset of the start of the
  449. // block being processed plus the offset into
  450. // the block at which the word starts. More
  451. // simply, this is the "byte offset" field of
  452. // the occurrence element.
  453. CB cbNormPunctLen; // When processing something that I'm calling
  454. // a "number", I have to deal with characters
  455. // such as ".", which I have to strip if
  456. // they're at the end of the word, but have
  457. // to keep if they're in the middle. This
  458. // keeps track of the number of characters
  459. // that I have to remove if the word ends.
  460. // For instance, for the word "162..", this
  461. // value will be 2 at the time the word ends,
  462. // which would tell me to remove the last two
  463. // characters (the "..").
  464. CB cbRawPunctLen; // This works like "cbNormPunctLen", with a
  465. // small difference. This field handles
  466. // characters that are stripped, but which
  467. // affect the "length" of the "number" being
  468. // processed. An example case is the ","
  469. // character. For instance, "12,345" is 6
  470. // characters long, even though the "," gets
  471. // stripped, but "12345," is five characters
  472. // long, because the trailing comma doesn't
  473. // affect the length.
  474. BYTE state; // presumably one of the *_STATE values defined above -- confirm
  475. BYTE fGotType; // Flag to denote we have got the 1st byte
  476. // of a 2-byte special type
  477. BYTE astNormWord[CB_MAX_WORD_LEN + 1 + SLOP]; // presumably the normalized word; sized with SLOP extra bytes
  478. BYTE astRawWord[CB_MAX_WORD_LEN + 1]; // presumably the raw word; no SLOP bytes in its size
  479. } IBI,
  480. FAR *_LPIBI;
  481. PUBLIC LPIDX EXPORT_API FAR PASCAL MVIndexOpen(HANDLE, LSZ, LPERRB);
  482. PUBLIC ERR EXPORT_API FAR PASCAL MVSearchSetCallback (LPQT, LPVOID);
  483. PUBLIC void EXPORT_API FAR PASCAL MVIndexClose(LPIDX);
  484. typedef struct SearchInfo // Search options; a PSRCHINFO is taken by MVIndexSearch() and MVIndexFindSimilar()
  485. {
  486. DWORD dwMemAllowed; // Maximum memory allowed
  487. DWORD dwTopicCount; // Maximum topics that the user wants to return
  488. DWORD Flag; // Search flags (presumably the QUERYRESULT_* / *_SEARCH bits defined below -- confirm)
  489. DWORD dwValue; // Internal use (should be set to 0)
  490. DWORD dwTopicFullCalc; // Maximum topics of dwTopicCount which are guaranteed
  491. // to have fully calculated top N similarity scores.
  492. LPVOID lpvIndexObjBridge; // Allows internal .c query code to indirectly call
  493. // pluggable COM stemmers.
  494. } SRCHINFO, FAR *PSRCHINFO;
  495. // Parameter passed to indexSearch. This should match with medv.h
  496. #define QUERYRESULT_RANK 0x0100 // Ranked the result. If not highest hit 1st
  497. //#define QUERYRESULT_UNSORTED 0x0200 // Result topics are 1st in 1st out (in UID order)
  498. #define QUERYRESULT_UIDSORT 0x0200 // Topics are returned in UID order
  499. #define QUERYRESULT_IN_MEM 0x0400 // Result should be kept in mem
  500. #define QUERYRESULT_GROUPCREATE 0x0800 // Create a group from the hitlist
  501. #define QUERYRESULT_NORMALIZE 0x1000 // Normalize result. Short topic 1st
  502. #define QUERYRESULT_LONGFIRST 0x2000 // Long topic 1st (not supported yet)
  503. #define QUERYRESULT_ALPHASORT 0x4000 // Alphabetical sort (not supported yet)
  504. #define QUERYRESULT_SKIPOCCINFO 0x8000 // Topic list only, no occurrence info
  505. #define STEMMED_SEARCH 0x00010000 // Perform runtime stemming (English only)
  506. #define LARGEQUERY_SEARCH 0x00020000 // Perform large query search
  507. #define SIMILAR_SEARCH 0x00040000 // Perform "find similar" search:
  508. // currently memory-optimized ranked boolean OR
  509. #define QUERY_GETTERMS 0x00080000 // Return with each set of occurrence
  510. // data a pointer to the term string
  511. // that the data is associated with.
  512. PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexSearch(LPIDX, LPQT, PSRCHINFO,
  513. LPGROUP, LPERRB);
  514. #ifndef SIMILARITY
  515. PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (LPIDX lpidx,
  516. LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, LPGROUP lpResGroup, LPVOID pCallback,
  517. LPERRB lperrb);
  518. #endif // SIMILARITY
  519. PUBLIC ERR EXPORT_API PASCAL FAR MVHitListFlush (LPHL, DWORD);
  520. PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetTopic(LPHL, DWORD, PTOPICINFO);
  521. PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetHit(LPHL, PTOPICINFO, DWORD, LPHIT);
  522. PUBLIC void EXPORT_API FAR PASCAL MVHitListDispose(LPHL);
  523. PUBLIC DWORD EXPORT_API PASCAL FAR MVHitListEntries (LPHL);
  524. PUBLIC LONG EXPORT_API PASCAL FAR MVHitListMax (LPHL);
  525. PUBLIC ERR EXPORT_API PASCAL FAR MVHitListGroup (LPVOID, LPHL);
  /* Forward declaration. struct IndexInfo is only defined further down in this
   * header; without a file-scope declaration here, a C compiler would treat the
   * "struct IndexInfo" named in the parameter list below as a new type with
   * prototype scope, incompatible with the real struct IndexInfo (INDEXINFO). */
  struct IndexInfo;
  /* Takes an index pointer (LPIDX) and a pointer to a struct IndexInfo;
   * presumably fills the structure with that index's settings -- confirm. */
  526. PUBLIC void EXPORT_API PASCAL FAR MVGetIndexInfoLpidx(LPIDX, struct IndexInfo *);
  527. /*************************************************************************
  528. * Character Table Retrieval API
  529. *************************************************************************/
  530. PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableDispose (LPCTAB); // Takes the LPCTAB returned by the functions below
  531. PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableIndexLoad(HFPB, LSZ, LPERRB);
  532. PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableLoad (HFPB, LPB, LPERRB);
  533. PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableGetDefault (LPERRB);
  534. PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableDispose (LPCTAB); // NOTE(review): repeats the first declaration in this group
  535. PUBLIC ERR EXPORT_API PASCAL FAR MVCharTableFileBuild (HFPB, LPCTAB, LSZ);
  536. PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableSetWildcards (LPCTAB);
  537. /*************************************************************************
  538. * Operator Table Index & Retrieval API
  539. *************************************************************************/
  540. PUBLIC LPOPTAB EXPORT_API PASCAL FAR MVOpTableLoad (LSZ, LPERRB);
  541. PUBLIC VOID EXPORT_API PASCAL FAR MVOpTableDispose (LPOPTAB);
  542. PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableIndexLoad(HANDLE, LSZ, LPERRB);
  543. PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableGetDefault(LPERRB);
  544. PUBLIC ERR EXPORT_API PASCAL FAR MVOpTableFileBuild(HFPB, LPOPTAB, LSZ);
  545. /*************************************************************************
  546. * Index API
  547. *************************************************************************/
  548. /*************************************************************************
  549. INDEXINFO structure
  550. This structure is used to give the indexer information on how the
  551. the index should be built given a certain amount of resource
  552. DWORD dwMemSize :
  553. Specifies approximately how much memory the indexer is allowed to use.
  554. As a rule of thumb, allowing more memory increases indexing speed,
  555. provided the operating system can actually give the indexer that much
  556. memory. The catch is that the operating system may not be able to
  557. allocate that amount of memory for the indexer, so once some
  558. limit is reached, the O/S will start swapping to disk (using Virtual
  559. Memory). This will slow down the indexer significantly. It is best to
  560. do some trial-and-error testing to see how much memory gives
  561. an optimal indexing speed.
  562. If dwMemSize = 0, a minimum of 4M is used.
  563. DWORD dwBlockSize:
  564. Desired block size used by the index. The minimum size is 4096. By
  565. varying the block size, the user can make the overall size of the index
  566. smaller or larger, and the retrieval speed slower or faster. Again, there
  567. is no strict rule. A large block will cause slowness in comparison,
  568. while a small block will cause extra seeks to be performed.
  569. DWORD Occf:
  570. Occurrence field flags. Those describe what data are to be included
  571. in the index. The currently supported flags are:
  572. OCCF_FIELDID
  573. OCCF_TOPICID
  574. OCCF_COUNT
  575. OCCF_OFFSET
  576. OCCF_LENGTH
  577. Those flags can be OR'ed together
  578. DWORD Idxf
  579. Flags to denote how the index is built in terms of ranking. The only
  580. supported flag is
  581. IDXF_NORMALIZE
  582. which tells the indexer to do normalized ranking. This causes the
  583. indexer to generate a huge table, proportional to the maximum
  584. topic id, that resides in memory at index time and is saved with the
  585. index. This flag may cause the index's size to increase significantly,
  586. especially when the topic ids are non-sequential (i.e. random). Using
  587. this flag with non-sequential topic ids may cause the indexer to
  588. fail because of lack of memory. Normalized searches will have a tendency
  589. to return short topics first.
  590. LCID lcid
  591. This is used mostly for runtime stemming. Currently, runtime
  592. stemming is supported for English only. Any other language doesn't
  593. support runtime stemming yet
  594. *************************************************************************/
  595. typedef struct IndexInfo // Index build settings; a PINDEXINFO is taken by MVIndexInitiate()
  596. {
  597. DWORD dwMemSize; // Memory allocated for indexing to use
  598. DWORD dwBlockSize; // Unit block size of the index
  599. DWORD Occf; // Occurrence field flags (OCCF_* values, can be OR'ed together)
  600. DWORD Idxf; // Various index flags (IDXF_* values)
  601. //--------------- New Members for File Version 4.0 ----------------
  602. DWORD dwCodePageID; // ANSI code page no. specified at build time
  603. LCID lcid; // WIN32 locale ID specified at build time
  604. DWORD dwBreakerInstID; // breaker instance that was used to parse
  605. // terms for the index at build time.
  606. } INDEXINFO, FAR *PINDEXINFO;
  607. // Various idxf flags. They can be OR'ed together
  608. #define IDXF_NONE ((IDXF)0x0000) // Nothing, just do straight boolean.
  609. #define IDXF_NORMALIZE ((IDXF)0x0001) // Index is normalized
  610. #define IDXF_NOSLACK ((IDXF)0x0002) // Not supported yet
  611. #define KEEP_TEMP_FILE ((IDXF)0x0100) // Keep flag
  612. #ifdef OLD_LANG_INFO
  613. // Various language flags
  614. #define LANGUAGE_ENGLISH 0x00
  615. #define LANGUAGE_JAPANESE 0x01
  616. #define LANGUAGE_TRAD_CHINESE 0x02 // Mapped to old LANGUAGE_CHINESE
  617. #define LANGUAGE_KOREAN 0x03
  618. #define LANGUAGE_ANSI 0x04
  619. #define LANGUAGE_SIMP_CHINESE 0x05
  620. #define SZ_LANGUAGE_ENGLISH L"English"
  621. #define SZ_LANGUAGE_JAPANESE L"Japanese"
  622. #define SZ_LANGUAGE_TRAD_CHINESE L"TradChinese"
  623. #define SZ_LANGUAGE_KOREAN L"Korean"
  624. #define SZ_LANGUAGE_ANSI L"ANSI"
  625. #define SZ_LANGUAGE_SIMP_CHINESE L"SimpChinese"
  626. #define CSZ_LANGUAGE_ENGLISH 7 // strlen (SZ_LANGUAGE_ENGLISH)
  627. #define CSZ_LANGUAGE_JAPANESE 8 // strlen (SZ_LANGUAGE_JAPANESE)
  628. #define CSZ_LANGUAGE_TRAD_CHINESE 11 // strlen (SZ_LANGUAGE_TRAD_CHINESE)
  629. #define CSZ_LANGUAGE_KOREAN 6 // strlen (SZ_LANGUAGE_KOREAN)
  630. #define CSZ_LANGUAGE_ANSI 4 // strlen (SZ_LANGUAGE_ANSI)
  631. #define CSZ_LANGUAGE_SIMP_CHINESE 11 // strlen (SZ_LANGUAGE_SIMP_CHINESE)
  632. #endif // OLD_LANG_INFO - obsoleted by IT 4.0's use of locale ids.
  633. /*
  634. * Occurrence field flags. These are initially used to indicate which
  635. * occurrence fields are to be indexed. Once an index is created, the
  636. * flags used to create an index are stored in it. During retrieval
  637. * these flags are examined in order to determine how to decode the
  638. * index format. Any combination of these fields is legal, but the
  639. * OCCF_TOPICID field must always be present.
  640. *
  641. * OCCF_NONE
  642. * Basically, this is equivalent to OCCF_TOPICID, since the indexer
  643. * always turns OCCF_TOPICID on
  644. *
  645. * OCCF_FIELDID
  646. * Save information about the field that the word belongs to. This must
  647. * be set if the application is using VFLD
  648. *
  649. * OCCF_TOPICID
  650. * Topic ID is to be saved. Basically, a topic id is just a number
  651. * associated with the topic to uniquely identify it. This flag is always
  652. * set by the indexer
  653. *
  654. * OCCF_COUNT
  655. * Save information about the positions of the words relative to the
  656. * first word in a topic. This flag must be set if the application wants
  657. * to use NEAR or phrase operator in the search
  658. *
  659. * OCCF_OFFSET
  660. * Save the information about the offset of the word in the topic. This
  661. * is mostly used if the application wants to do search result highlighting
  662. *
  663. * OCCF_LENGTH
  664. * Save the information about the length of the word in the topic. This
  665. * is mostly used if the application wants to do search result highlighting
  666. * The length of a word can be different from the real length if stemming
  667. * or aliasing is used
  668. *
  669. * OCCF_LANGUAGE
  670. * Currently ignored. This is for future multi-lingual topics support
  671. */
  672. #define OCCF_NONE ((OCCF)0x0000) // Blank.
  673. #define OCCF_FIELDID ((OCCF)0x0001) // Field-ID is present.
  674. #define OCCF_TOPICID ((OCCF)0x0002) // Topic ID is present. This should
  675. // always be set.
  676. #define OCCF_COUNT ((OCCF)0x0004) // Word-count is present.
  677. #define OCCF_OFFSET ((OCCF)0x0008) // Byte-offset is present.
  678. #define OCCF_LENGTH ((OCCF)0x0010) // Word-length is present.
  679. #define OCCF_LANGUAGE ((OCCF)0x0020) // Language (not supported yet)
  680. #define OCCF_HAVE_OCCURRENCE (OCCF_OFFSET | OCCF_COUNT)
  681. /*
  682. OCC structure
  683. This structure contains all the information related to a word to be
  684. added to the index
  685. DWORD dwFieldId:
  686. Field Id value associated with the word. This field is added to
  687. the index if OCCF_FIELDID flag is set
  688. DWORD dwTopicID:
  689. Id of the topic that the word belongs to. Used if OCCF_TOPICID is set
  690. DWORD dwCount:
  691. Starting at 0, this is the position of the word compared to the first
  692. word in the topic (eg.the 11th word in the topic). Used if OCCF_COUNT
  693. is set
  694. DWORD dwOffset:
  695. Starting at 0, this is the offset of the word compared to the first
  696. byte in the topic. Used if OCCF_OFFSET is set
  697. WORD wWordLen:
  698. Real length of the word (unstemmed or alias). Used if OCCF_LENGTH is
  699. set
  700. WORD wLanguage:
  701. Currently ignored
  702. This structure is passed to MVIndexAddWord
  703. */
  704. typedef struct Occurence // Per-word occurrence data; an LPOCC is taken by MVIndexAddWord()
  705. {
  706. DWORD dwFieldId; // Field-ID. Added to the index if OCCF_FIELDID is set.
  707. DWORD dwTopicID; // TopicID. Used if OCCF_TOPICID is set.
  708. DWORD dwCount; // Word-count, starting at 0. Used if OCCF_COUNT is set.
  709. DWORD dwOffset; // Byte-offset, starting at 0. Used if OCCF_OFFSET is set.
  710. WORD wWordLen; // Word-length (real, i.e. unstemmed). Used if OCCF_LENGTH is set.
  711. WORD wLanguage; // Language (not supported yet; currently ignored)
  712. } OCC, FAR *LPOCC;
  713. PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO, LPERRB);
  714. PUBLIC ERR EXPORT_API FAR PASCAL MVIndexAddWord(LPIPB, LST, LPOCC);
  715. PUBLIC ERR EXPORT_API FAR PASCAL MVIndexBuild(HFPB, LPIPB, HFPB, LPSTR);
  716. PUBLIC ERR EXPORT_API FAR PASCAL MVIndexTopicDelete(HFPB, LPIPB, LSZ,
  717. DWORD FAR [], DWORD);
  718. PUBLIC ERR EXPORT_API FAR PASCAL MVIndexUpdate(HFPB, LPIPB, LSZ);
  719. PUBLIC ERR EXPORT_API FAR PASCAL MVIndexUpdateEx(HFPB, LPIPB, LSZ, DWORD FAR [], DWORD);
  720. PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose(LPIPB);
  721. /*************************************************************************
  722. * Structures for internal uses only
  723. *************************************************************************/
  724. /*****************************************************************************
  725. *
  726. * MatchCache Structure
  727. *
  728. * The layout engine expects the matches to be in sorted order and
  729. * non-overlapping. Neither assumption is maintained by the search
  730. * engine. So, when the layout asks for matches for a particular
  731. * topic, they are sorted and combined and stored in a match cache.
  732. *
  733. *****************************************************************************/
  734. typedef struct _MATCHCACHE // Sorted, combined matches for one topic (internal use only)
  735. {
  736. long lTopic; // the topic number
  737. long lItem; // the item in the topic list
  738. long lMatches; // the number of matches
  739. HIT match[1]; // array of match information; declared with one element,
  // presumably over-allocated to hold lMatches entries -- confirm
  740. } MATCHCACHE, NEAR *PMATCHCACHE, FAR *LPMATCHCACHE;
  741. #pragma pack() // Guard against Zp problems.
  742. #ifdef __cplusplus
  743. }
  744. #endif
  745. #endif //__MVSEARCH_H_