Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

473 lines
17 KiB

  1. //+--------------------------------------------------------------------------
  2. //
  3. // Copyright (C) 1994, 1995, 1996 Microsoft Corporation. All Rights Reserved.
  4. //
  5. // File: thammer.h
  6. //
  7. // This include file defines the exported APIs and their callbacks that provide
  8. // word-breaking functionality for non-spaced Asian languages (Japanese, Chinese)
  9. //
  10. // Summary of the main exports (further entry points are declared below):
  11. // EnumSelectionOffsets - This function returns the offsets for the
  12. // selection chunks as specified in the Selection Profile (set at compile-time)
  13. // EnumSummarizationOffsets - This function returns the offsets for the
  14. // prefix (if any), the stem, and bound morphemes (fuzokugo).
  15. // EnumStemOffsets - This function returns the offsets for the stem only.
  16. // Offsets corresponding to any prefix or postfix characters will not
  17. // be returned.
  18. //
  19. // History: pathal Created.
  20. // 25-Jun-97 pathal Add TH_ERROR_INIT_FAILED
  21. // 05-Jul-97 pathal Add EnumSentenceOffsets, etc.
  22. //---------------------------------------------------------------------------
  23. // Return errors: the following error codes can be returned from any of
  24. // T-Hammer's exported APIs (EnumSelectionOffsets, EnumSummarizationOffsets,
  25. // and EnumStemOffsets). The meaning of each code is noted beside its definition.
  26. //
  27. #define TH_ERROR_SUCCESS 0 // the call completed successfully
  28. #define TH_ERROR_NOHPBS 1 // there were no HPBs (hard phrase breaks)
  29. #define TH_ERROR_INVALID_INPUT 2 // the input buffer was bad
  30. #define TH_ERROR_INVALID_CALLBACK 3 // the input callback was bad
  31. #define TH_ERROR_INIT_FAILED 4 // NOTE(review): not described in this header; presumably initialization failed - confirm
  32. #define TH_ERROR_NOT_IMPLEMENTED 5 // NOTE(review): not described in this header; presumably the API is not implemented - confirm
  33. // Offset delimiter: the following code is used to delimit the end of a list of
  34. // token offsets returned to one of the Enum* callback routines. This is not
  35. // an error code.
  36. #define TH_SELECTION_INVALID_OFFSET 0xFFFFFFFF
  37. // TOKENIZE_MODE: Begin and End HPB Modes
  38. //
  39. // Begin and End HPB modes signify that a hard phrase break comes before the
  40. // first character in the string and/or follows after the last character in the string.
  41. // If these flags are not set, then the default behavior of EnumTokens is to start
  42. // enumerating tokens to the right of the leftmost HPB, which probably won't
  43. // be at the first character (unless it is a punctuation symbol) and to conclude
  44. // enumeration at the rightmost HPB, which likely will not be the true end of the
  45. // string. So, these flags in effect force HPBs at the 0th and nth offsets, where
  46. // n is the number of characters in the input buffer.
  47. //
  48. // WARNING: Since Tokenize operates in batch mode, it assumes that the
  49. // start and end of the input buffer are HPBs. These flags are only used for
  50. // EnumTokens.
  51. //
  52. #define TOKENIZE_MODE_BEGIN_HPB 0x00000001
  53. #define TOKENIZE_MODE_END_HPB 0x00000002
  54. // Note on HPBs: HPB = hard phrase break.
  55. // HPBs are statistically determined from analyzing a tagged corpus.
  56. // Roughly, they correspond to places where you can break with 100%
  57. // precision (=confidence). Mostly this is around punctuation characters
  58. // and certain conspicuous [case markers | character type] bigrams.
  59. // When the Hide Punctuation mode is set in the tokenize flag parameter,
  60. // T-Hammer strips punctuation out of the Stem Offsets and Summarization Offsets
  61. // callbacks.
  62. //
  63. #define TOKENIZE_MODE_HIDE_PUNCTUATION 0x00000004
  64. //+--------------------------------------------------------------------------
  65. // Routine: EnumSelectionOffsetsCallback
  66. //
  67. // Synopsis: client-side callback that receives a list of offsets for selection chunks
  68. //
  69. // Parameters:
  70. // pichOffsets - pointer to first element in an array of offsets into client
  71. // text buffer. NOTE: callback is not allowed to stash pichOffsets for
  72. // later processing. pichOffsets will not persist between successive
  73. // callbacks. If the callback wants to use the data pointed to by pichOffsets
  74. // it must copy it to its own store
  75. // cOffsets - number of offsets passed to client (always > 1)
  76. // lpData - client defined data
  77. //
  78. // Return:
  79. // TRUE - to abort token enumeration
  80. // FALSE - to continue
  81. //---------------------------------------------------------------------------
  82. // BOOL
  83. // EnumSelectionOffsetsCallback (
  84. // IN CONST DWORD *pichOffsets,
  85. // IN CONST DWORD cOffsets,
  86. // IN OUT LPARAM lpData);
  87. typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_CALLBACK)(
  88. IN CONST DWORD *pichOffsets,
  89. IN CONST DWORD cOffsets,
  90. IN OUT LPARAM lpData);
  91. //+--------------------------------------------------------------------------
  92. // Routine: EnumSelectionOffsets
  93. //
  94. // Synopsis: This is the main entry point for tokenizing text. Sends tokens,
  95. // which can either be offsets or zero delimited strings, to the callback.
  96. // NOTE(review): the callback type declared in this header receives offsets only - confirm.
  97. // Parameters:
  98. // pwszText - pointer to wide-character text buffer to be tokenized,
  99. // cchText - count of characters in text buffer,
  100. // fBeginEndHPBMode - flag describing the callback mode (see TOKENIZE_MODE_* above),
  101. // pcbEnumSelectionOffsets - pointer to callback procedure handling token
  102. // enumeration,
  103. // lpData - client defined data
  104. //
  105. // Returns:
  106. // TH_ERROR_SUCCESS - if the call completed successfully
  107. // TH_ERROR_NOHPBS - if there were no HPBs
  108. // TH_ERROR_INVALID_INPUT - if the input buffer was bad
  109. // TH_ERROR_INVALID_CALLBACK - if the input callback was bad
  110. //---------------------------------------------------------------------------
  111. INT
  112. APIENTRY
  113. EnumSelectionOffsets(
  114. IN PCWSTR pwszText,
  115. IN DWORD cchText,
  116. IN DWORD fBeginEndHPBMode,
  117. IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
  118. IN LPARAM lpData);
  119. typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS)( // function-pointer type with the same signature as EnumSelectionOffsets
  120. IN PCWSTR pwszText,
  121. IN DWORD cchText,
  122. IN DWORD fBeginEndHPBMode,
  123. IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
  124. IN LPARAM lpData);
  125. //+--------------------------------------------------------------------------
  126. // Routine: EnumSummarizationOffsetsCallback
  127. //
  128. // Synopsis: client-side callback that receives a list of offsets for each stem
  129. // in the free morpheme (jiritsugo) phrase. The last offset always delimits
  130. // the complete string of bound morphemes (fuzokugo). For example,
  131. // for "kaisan shite nai sou desu", offsets are returned for "kaisan" and
  132. // "shite nai sou desu". So, counting the first initial offset, there are three
  133. // offsets.
  134. //
  135. // Parameters:
  136. // pichOffsets - pointer to first element in an array of offsets into client
  137. // text buffer. NOTE: callback is not allowed to stash pichOffsets for
  138. // later processing. pichOffsets will not persist between successive
  139. // callbacks. If the callback wants to use the data pointed to by pichOffsets
  140. // it must copy it to its own store
  141. // cOffsets - number of offsets passed to client (always > 1)
  142. // lpData - client defined data
  143. //
  144. // Return:
  145. // TRUE - to abort token enumeration
  146. // FALSE - to continue
  147. //---------------------------------------------------------------------------
  148. // BOOL
  149. // EnumSummarizationOffsetsCallback (
  150. // IN CONST DWORD *pichOffsets,
  151. // IN CONST DWORD cOffsets,
  152. // IN OUT LPARAM lpData);
  153. typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_CALLBACK)(
  154. IN CONST DWORD *pichOffsets,
  155. IN CONST DWORD cOffsets,
  156. IN OUT LPARAM lpData);
  157. typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK)( // extended variant: additionally receives pwzPOS and pwzMCat
  158. IN CONST DWORD *pichOffsets,
  159. IN CONST DWORD cOffsets,
  160. IN PCWSTR pwzPOS, // NOTE(review): undocumented; presumably POS info string as in EnumStemInfoCallback - confirm
  161. IN PCWSTR pwzMCat, // NOTE(review): undocumented; presumably MCat info string as in EnumStemInfoCallback - confirm
  162. IN OUT LPARAM lpData);
  163. //+--------------------------------------------------------------------------
  164. // Routine: EnumSummarizationOffsets
  165. //
  166. // Synopsis: This is the entry point for returning offsets for tokens used
  167. // in summarization. These tokens correspond to stems and bound morphemes
  168. // (fuzokugo) in the text. A list of offsets (and a count) is sent to the
  169. // EnumSummarizationOffsets callback (see above)
  170. //
  171. // Parameters:
  172. // pwszText - pointer to wide-character text buffer to be tokenized,
  173. // cchText - count of characters in text buffer,
  174. // fBeginEndHPBMode - flag describing the callback mode (see TOKENIZE_MODE_* above),
  175. // pcbEnumSummarizationOffsets - pointer to callback procedure handling token
  176. // enumeration,
  177. // lpData - client defined data
  178. //
  179. // Returns:
  180. // TH_ERROR_SUCCESS - if the call completed successfully
  181. // TH_ERROR_NOHPBS - if there were no HPBs
  182. // TH_ERROR_INVALID_INPUT - if the input buffer was bad
  183. // TH_ERROR_INVALID_CALLBACK - if the input callback was bad
  184. //---------------------------------------------------------------------------
  185. INT
  186. APIENTRY
  187. EnumSummarizationOffsets(
  188. IN PCWSTR pwszText,
  189. IN DWORD cchText,
  190. IN DWORD fBeginEndHPBMode,
  191. IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
  192. IN LPARAM lpData);
  193. INT
  194. APIENTRY
  195. EnumSummarizationOffsetsEx1( // same parameter list as EnumSummarizationOffsets, but takes the EX1 callback type
  196. IN PCWSTR pwszText,
  197. IN DWORD cchText,
  198. IN DWORD fBeginEndHPBMode,
  199. IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
  200. IN LPARAM lpData);
  201. typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS)( // function-pointer type with the same signature as EnumSummarizationOffsets
  202. IN PCWSTR pwszText,
  203. IN DWORD cchText,
  204. IN DWORD fBeginEndHPBMode,
  205. IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
  206. IN LPARAM lpData);
  207. typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS_EX1)( // function-pointer type with the same signature as EnumSummarizationOffsetsEx1
  208. IN PCWSTR pwszText,
  209. IN DWORD cchText,
  210. IN DWORD fBeginEndHPBMode,
  211. IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
  212. IN LPARAM lpData);
  213. //+--------------------------------------------------------------------------
  214. // Routine: EnumStemOffsetsCallback
  215. //
  216. // Synopsis: client-side callback that receives a list of offsets for stems.
  217. // NOTE(review): earlier text described "a zero-terminated stem per SPB" (pwszStem); the typedef below takes offsets - confirm.
  218. // Parameters:
  219. // pichOffsets - pointer to first element in an array of offsets (presumably into the client text buffer, as for the other offset callbacks - confirm)
  220. // cOffsets - number of offsets passed to client
  221. // lpData - client defined data
  222. // Return:
  223. // TRUE - to abort token enumeration
  224. // FALSE - to continue
  225. //---------------------------------------------------------------------------
  226. // BOOL
  227. // EnumStemOffsetsCallback (
  228. // IN CONST DWORD *pichOffsets, IN CONST DWORD cOffsets,
  229. // IN OUT LPARAM lpData);
  230. typedef BOOL (CALLBACK * ENUM_STEM_OFFSETS_CALLBACK)(
  231. IN CONST DWORD *pichOffsets,
  232. IN CONST DWORD cOffsets,
  233. IN OUT LPARAM lpData);
  234. //+--------------------------------------------------------------------------
  235. // Routine: EnumStemOffsets
  236. //
  237. // Synopsis: This is the entry point for tokenizing stems. Sends offsets
  238. // for stems to the EnumStemOffsets callback (see above)
  239. //
  240. // Parameters:
  241. // pwszText - pointer to wide-character text buffer to be tokenized,
  242. // cchText - count of characters in text buffer,
  243. // fBeginEndHPBMode - flag describing the callback mode (see TOKENIZE_MODE_* above),
  244. // pcbEnumStemOffsets - pointer to callback procedure handling token
  245. // enumeration,
  246. // pcchTextProcessed - NOTE(review): not described in the original header; presumably receives the count of characters processed - confirm
  247. // lpData - client defined data
  248. // Returns:
  249. // TH_ERROR_SUCCESS - if the call completed successfully
  250. // TH_ERROR_NOHPBS - if there were no HPBs
  251. // TH_ERROR_INVALID_INPUT - if the input buffer was bad
  252. // TH_ERROR_INVALID_CALLBACK - if the input callback was bad
  253. //---------------------------------------------------------------------------
  254. INT
  255. APIENTRY
  256. EnumStemOffsets(
  257. IN PCWSTR pwszText,
  258. IN DWORD cchText,
  259. IN DWORD fBeginEndHPBMode,
  260. IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
  261. IN OUT DWORD *pcchTextProcessed,
  262. IN LPARAM lpData);
  263. typedef INT (APIENTRY *LP_ENUM_STEM_OFFSETS)( // function-pointer type with the same signature as EnumStemOffsets
  264. IN PCWSTR pwszText,
  265. IN DWORD cchText,
  266. IN DWORD fBeginEndHPBMode,
  267. IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
  268. IN OUT DWORD *pcchTextProcessed,
  269. IN LPARAM lpData);
  270. //+--------------------------------------------------------------------------
  271. // Routine: EnumStemInfoCallback
  272. //
  273. // Synopsis: client-side callback that receives offsets and stem information
  274. //
  275. // Parameters:
  276. // ichOffset - offset to first character in stem
  277. // cchLen - length of the stem
  278. // pwszPOS - string containing POS (part-of-speech) info
  279. // pwszMCat - string containing MCat info
  280. // pwszDictionaryForm - string containing the dictionary form
  281. // lpData - client defined data
  282. //
  283. // Return:
  284. // TRUE - to abort token enumeration
  285. // FALSE - to continue
  286. //---------------------------------------------------------------------------
  287. // BOOL
  288. // EnumStemInfoCallback (
  289. // IN CONST DWORD ichOffset,
  290. // IN CONST DWORD cchLen,
  291. // IN PCWSTR pwszPOS,
  292. // IN PCWSTR pwszMCat,
  293. // IN PCWSTR pwszDictionaryForm,
  294. // IN OUT LPARAM lpData);
  295. typedef BOOL (CALLBACK * ENUM_STEM_INFO_CALLBACK)(
  296. IN CONST DWORD ichOffset,
  297. IN CONST DWORD cchLen,
  298. IN PCWSTR pwszPOS,
  299. IN PCWSTR pwszMCat,
  300. IN PCWSTR pwszDictionaryForm,
  301. IN OUT LPARAM lpData);
  302. //+--------------------------------------------------------------------------
  303. // Routine: EnumStemInfo
  304. //
  305. // Synopsis: Call this routine to get information about stems.
  306. // For example, if you want the dictionary form, part-of-speech or
  307. // MCat information for a stem, then this is the API for you
  308. //
  309. // Parameters:
  310. // pwszText - pointer to wide-character text buffer to be tokenized,
  311. // cchText - count of characters in text buffer,
  312. // fBeginEndHPBMode - flag describing the callback mode (see TOKENIZE_MODE_* above),
  313. // pcbEnumStemInfo - pointer to callback procedure handling stem info
  314. // enumeration
  315. // pcchTextProcessed - NOTE(review): not described in the original header; presumably receives the count of characters processed - confirm
  316. // lpData - client defined data
  317. // Returns:
  318. // TH_ERROR_SUCCESS - if the call completed successfully
  319. // TH_ERROR_NOHPBS - if there were no HPBs
  320. // TH_ERROR_INVALID_INPUT - if the input buffer was bad
  321. // TH_ERROR_INVALID_CALLBACK - if the input callback was bad
  322. //---------------------------------------------------------------------------
  323. INT
  324. APIENTRY
  325. EnumStemInfo(
  326. IN PCWSTR pwszText,
  327. IN DWORD cchText,
  328. IN DWORD fBeginEndHPBMode,
  329. IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
  330. IN OUT DWORD *pcchTextProcessed,
  331. IN LPARAM lpData);
  332. typedef INT (APIENTRY *LP_ENUM_STEM_INFO)( // function-pointer type with the same signature as EnumStemInfo
  333. IN PCWSTR pwszText,
  334. IN DWORD cchText,
  335. IN DWORD fBeginEndHPBMode,
  336. IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
  337. IN OUT DWORD *pcchTextProcessed,
  338. IN LPARAM lpData);
  339. //+--------------------------------------------------------------------------
  340. // Routine: EnumSentenceOffsetsCallback
  341. //
  342. // Synopsis: client-side callback that receives the start and end offsets of a sentence
  343. //
  344. // Parameters:
  345. // ichOffsetStart - offset to start of sentence
  346. // ichOffsetEnd - offset to end of sentence (includes terminating punctuation)
  347. // lpData - client defined data
  348. //
  349. // Return:
  350. // TRUE - to abort token enumeration
  351. // FALSE - to continue
  352. //---------------------------------------------------------------------------
  353. // BOOL
  354. // EnumSentenceOffsetsCallback (
  355. // IN DWORD ichOffsetStart,
  356. // IN DWORD ichOffsetEnd,
  357. // IN OUT LPARAM lpData);
  358. typedef BOOL (CALLBACK * ENUM_SENTENCE_OFFSETS_CALLBACK)(
  359. IN DWORD ichOffsetStart,
  360. IN DWORD ichOffsetEnd,
  361. IN OUT LPARAM lpData);
  362. //+--------------------------------------------------------------------------
  363. // Routine: EnumSentenceOffsets
  364. //
  365. // Synopsis: This is the main entry point for breaking sentences.
  366. // Sends offsets delimiting sentences to the callback.
  367. //
  368. // Parameters:
  369. // pwszText - pointer to wide-character text buffer to be tokenized,
  370. // cchText - count of characters in text buffer,
  371. // fTokenizeMode - not used. Later this will be used to control how
  372. // partial sentences are handled.
  373. // pcbEnumSentenceOffsets - pointer to callback procedure handling offsets
  374. // lpData - client defined data
  375. //
  376. // Returns:
  377. // TH_ERROR_SUCCESS - if the call completed successfully
  378. // TH_ERROR_NOHPBS - if there were no HPBs
  379. // TH_ERROR_INVALID_INPUT - if the input buffer was bad
  380. // TH_ERROR_INVALID_CALLBACK - if the input callback was bad
  381. //---------------------------------------------------------------------------
  382. INT
  383. APIENTRY
  384. EnumSentenceOffsets(
  385. IN PCWSTR pwszText,
  386. IN DWORD cchText,
  387. IN DWORD fTokenizeMode,
  388. IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
  389. IN LPARAM lpData);
  390. typedef INT (APIENTRY *LP_ENUM_SENTENCE_OFFSETS)( // function-pointer type with the same signature as EnumSentenceOffsets
  391. IN PCWSTR pwszText,
  392. IN DWORD cchText,
  393. IN DWORD fTokenizeMode,
  394. IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
  395. IN LPARAM lpData);
  396. //+--------------------------------------------------------------------------
  397. // Routine: FEMorphCallback
  398. //
  399. // Synopsis: The callback that gets a text stream from T-Hammer.
  400. //
  401. // Parameters:
  402. // pwszMorphRecs - a pointer to wide character text stream,
  403. // which contains morphological analyses of a given sentence
  404. // pvData - pointer to private data
  405. //
  406. // Returns:
  407. // TRUE if no more analysis is needed
  408. //---------------------------------------------------------------------------
  409. // BOOL
  410. // FEMorphCallback(
  411. // IN PWSTR pwszMorphRecs, IN VOID *pvData);
  412. typedef BOOL (CALLBACK * FEMORPH_CALLBACK)(
  413. IN PWSTR pwszMorphRecs,
  414. IN VOID *pvData);
  415. //+--------------------------------------------------------------------------
  416. // Routine: FEMorph
  417. //
  418. // Synopsis: This is the entry point for NLPWIN morpheme analysis.
  419. // Sends a morpheme record string back to the lex callback in NLPWIN
  420. //
  421. // Parameters:
  422. // pwszText - pointer to wide-character text buffer to be tokenized,
  423. // pcbFEMorphCB - pointer to callback procedure handling morph rec enumeration
  424. // pvData - pointer to private data
  425. //
  426. // Returns: NOTE(review): not documented in this header; presumably one of the TH_ERROR_* codes - confirm
  427. //---------------------------------------------------------------------------
  428. INT
  429. APIENTRY
  430. FEMorph(
  431. IN PCWSTR pwszText,
  432. IN FEMORPH_CALLBACK pcbFEMorphCB,
  433. IN VOID *pvData);
  434. typedef INT (APIENTRY *LP_FEMORPH)( // function-pointer type with the same signature as FEMorph
  435. IN PCWSTR pwszText,
  436. IN FEMORPH_CALLBACK pcbFEMorphCB,
  437. IN VOID *pvData);