Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

998 lines
31 KiB

  1. /*************************************************************************
  2. * *
  3. * BREAKER.C *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Module Intent *
  11. * Word breaker module *
  12. * This module provides word-breaking routines applicable to the ANSI *
  13. * character-set. This means American English. *
  14. * Note that ANSI does not mean ASCII. *
  15. * *
  16. * WARNING: Tab setting is 4 for this file *
  17. * *
  18. **************************************************************************
  19. * *
  20. * Current Owner: BinhN *
  21. * *
  22. **************************************************************************
  23. * *
  24. * Released by Development: (date) *
  25. * *
  26. *************************************************************************/
  27. #include <verstamp.h>
  28. SETVERSIONSTAMP(MVBK);
  29. #include <mvopsys.h>
  30. #include <iterror.h>
  31. #include <mvsearch.h>
  32. #include "common.h"
  /*
   * Accessors for one character-property entry.  `p` is any expression that
   * yields the address of an entry; it is cast to LPCMAP before use.
   *
   * The argument is fully parenthesized so that the cast applies to the whole
   * expression (e.g. `base + n`) rather than to its first operand only.
   *
   *  CP_CLASS(p)  low 8 bits of the entry's Class field
   *  CP_NORMC(p)  the entry's Norm field
   */
  #define CP_CLASS(p) (((LPCMAP)(p))->Class & 0xff)
  #define CP_NORMC(p) (((LPCMAP)(p))->Norm)
  36. /*************************************************************************
  37. *
  38. * INTERNAL PRIVATE FUNCTIONS
  39. * All of them should be declared near
  40. *************************************************************************/
  41. PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS, WORD);
  42. PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
  43. LPCMAP lpCharPropTab, LPB lpbLigatureTab, WORD wcLigature);
  /*************************************************************************
   *
   *  HALF-WIDTH TO FULL-WIDTH KATAKANA MAPPING TABLE
   *
   *  Entry i holds the two output bytes {first, second} produced for the
   *  single input byte (161 + i); the table has 63 entries, one for each
   *  input byte in 161..223.  The range comment on each row gives the
   *  input bytes that row covers.
   *
   *************************************************************************/
  static const int mtable[][2] = {
      /* 161-168 */
      {129, 66}, {129,117}, {129,118}, {129, 65},
      {129, 69}, {131,146}, {131, 64}, {131, 66},
      /* 169-176 */
      {131, 68}, {131, 70}, {131, 72}, {131,131},
      {131,133}, {131,135}, {131, 98}, {129, 91},
      /* 177-184 */
      {131, 65}, {131, 67}, {131, 69}, {131, 71},
      {131, 73}, {131, 74}, {131, 76}, {131, 78},
      /* 185-192 */
      {131, 80}, {131, 82}, {131, 84}, {131, 86},
      {131, 88}, {131, 90}, {131, 92}, {131, 94},
      /* 193-200 */
      {131, 96}, {131, 99}, {131,101}, {131,103},
      {131,105}, {131,106}, {131,107}, {131,108},
      /* 201-208 */
      {131,109}, {131,110}, {131,113}, {131,116},
      {131,119}, {131,122}, {131,125}, {131,126},
      /* 209-216 */
      {131,128}, {131,129}, {131,130}, {131,132},
      {131,134}, {131,136}, {131,137}, {131,138},
      /* 217-223 */
      {131,139}, {131,140}, {131,141}, {131,143},
      {131,147}, {129, 74}, {129, 75}
  };
  62. /*************************************************************************
  63. * @doc API INDEX RETRIEVAL
  64. *
  65. * @func LPIBI FAR PASCAL | BreakerInitiate |
  66. * Allocates a breaker parameter block. This parameter block keeps
  67. * track of the breaker's "global" variables.
  68. *
  69. * @rdesc NULL if the call fails (ie. no more memory)
  70. * a pointer to the block if it succeeds.
  71. *************************************************************************/
  72. PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void)
  73. {
  74. _LPIBI lpibi;
  75. register HANDLE hibi;
  76. if ((hibi = GlobalAlloc(GMEM_MOVEABLE | GMEM_ZEROINIT,
  77. sizeof(IBI))) == NULL) {
  78. return NULL;
  79. }
  80. //
  81. // All variables not explicitly initialized are assumed to be
  82. // initialized as zero.
  83. //
  84. lpibi = (_LPIBI)GlobalLock(hibi);
  85. lpibi->hibi = hibi;
  86. return lpibi;
  87. }
  88. /*************************************************************************
  89. * @doc API INDEX RETRIEVAL
  90. *
  91. * @func void FAR PASCAL | BreakerFree |
  92. * Frees a word-breaker parameter block.
  93. *
  94. * @parm LPIBI | lpibi |
  95. * Pointer to the InternalBreakInfo Structure containing all the
  96. * informations about states
  97. *************************************************************************/
  98. PUBLIC void EXPORT_API FAR PASCAL BreakerFree(_LPIBI lpibi)
  99. {
  100. HANDLE hibi;
  101. /* Do sanity check */
  102. if (lpibi == NULL)
  103. return;
  104. hibi = lpibi->hibi;
  105. GlobalUnlock(hibi);
  106. GlobalFree(hibi);
  107. }
  108. // - - - - - - - - -
  109. // Break words out from a block of standard text characters.
  110. //
  111. // This routine is incredibly important. Any change in the performance
  112. // of this function will have immediate and obvious influence upon the
  113. // performance of the indexing system as a whole. Consequently, the
  114. // function should be very fast.
  115. //
  116. // This function uses a simple state machine to try to achieve the
  117. // necessary speed. It's in a different loop depending upon what kind
  118. // of characters it's trying to find, and it uses "goto" statements to
  119. // shift back and forth between "states".
  120. //
  121. /*************************************************************************
  122. * @doc API RETRIEVAL INDEX
  123. *
  124. * @func ERR | FBreakWords |
  125. * This function break a string into a sequence of words.
  126. *
  127. * @parm LPBRK_PARMS | lpBrkParms |
  128. * Pointer to structure containing all the parameters needed for
  129. * the breaker. They include:
  130. * 1/ Pointer to the InternalBreakInfo
  131. * 2/ Pointer to input buffer containing the word stream
  132. * 3/ Size of the input bufer
  133. * 4/ Offset in the source text of the first byte of the input buffer
  134. * 5/ Pointer to user's parameter block for the user's function
  135. * 6/ User's function to call with words. The format of the call should
  136. * be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord, LCB lcb,
  137. * LPV lpvUser)
  138. * The function should return S_OK if succeeded
  139. * The function can be NULL
  140. * 7/ Pointer to stop word table. This table contains stop words specific
  141. * to this breaker. If this is non-null, then the function
  142. * will flag errors for stop word present in the query
  143. * 8/ Pointer to character table. If NULL, then the default built-in
  144. * character table will be used
  145. *
  146. * @rdesc
  147. * The function returns S_OK if succeeded. The failure's causes
  148. * are:
  149. * @flag E_WORDTOOLONG | Word too long
  150. * @flag errors | returned by the lpfnfOutWord
  151. *************************************************************************/
  152. PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS lpBrkParms)
  153. {
  154. return (WordBreakStem(lpBrkParms, FALSE));
  155. }
  #if 0
  /*************************************************************************
   *  @doc    API RETRIEVAL INDEX
   *
   *  @func   ERR | FBreakAndStemWords |
   *      DISABLED (compiled out).  Same contract as FBreakWords, except
   *      that WordBreakStem is called with the stem flag set to TRUE.
   *
   *  @parm   LPBRK_PARMS | lpBrkParms |
   *      Parameter block for the breaker; see FBreakWords.
   *
   *  @rdesc  S_OK on success, otherwise:
   *  @flag   E_WORDTOOLONG | Word too long
   *  @flag   Other errors  | returned by the lpfnfOutWord callback
   *************************************************************************/
  PUBLIC ERR EXPORT_API FAR PASCAL FBreakAndStemWords(LPBRK_PARMS lpBrkParms)
  {
      ERR errBreak;

      errBreak = WordBreakStem(lpBrkParms, TRUE);
      return errBreak;
  }
  #endif
  194. PUBLIC ERR EXPORT_API FAR PASCAL BreakerVersion (void)
  195. {
  196. return CHARTABVER;
  197. }
  198. // This exists only to enable MVJK to link statically.
  199. // We must have the same function names for the static build.
  200. PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms)
  201. {
  202. return E_NOTSUPPORTED;
  203. }
  204. // This exists only to enable MVJK to link statically.
  205. // We must have the same function names for the static build.
  206. PUBLIC ERR FAR PASCAL FSelectWord (LPCSTR pBuffer, DWORD dwCount,
  207. DWORD dwOffset, LPDWORD pStart, LPDWORD pEnd)
  208. {
  209. return E_NOTSUPPORTED;
  210. }
  211. /*************************************************************************
  212. * @doc INTERNAL
  213. *
  214. * @func ERR | WordBreakStem |
  215. * This function breaks a string into a sequence of words and
  216. * stems each resulting word
  217. *
  218. * @parm BYTE | fStem |
  219. * If set, stem the word
  220. *
  221. * @rdesc
  222. * The function returns S_OK if succeeded. The failure's causes
  223. * are:
  224. * @flag E_WORDTOOLONG | Word too long
  225. * @flag Other errors | returned by the lpfnfOutWord
  226. *************************************************************************/
  227. PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem)
  228. {
  229. register LPB lpbRawWord; // Pointer to RawWord buffer
  230. register LPB lpbNormWord; // Pointer to NormWord buffer
  231. LPCMAP lpCharPropTab; // Pointer to the char property table
  232. LPB lpbInBuffer; // Buffer to groot through.
  233. LPB lpbRawWordLimit; // Limit of RawWord buffer
  234. #if 0
  235. LPB lpbNormWordLimit; // Limit of NormWord buffer
  236. #endif
  237. BYTE bCurChar; // Current character.
  238. BYTE fScan = TRUE;
  239. ERR fRet;
  240. #if 0
  241. BYTE astStemmed[CB_MAX_WORD_LEN + 2]; // Temporary buffer for stemming
  242. #endif
  243. LPB lpbLigature = NULL;
  244. WORD wcLigature = 0;
  245. LPCHARTAB lpCharTab;
  246. LPB astNormWord;
  247. LPB astRawWord;
  248. BYTE fAcceptWildCard;
  249. /* Breakers parameters break out */
  250. _LPIBI lpibi;
  251. LPB lpbInBuf;
  252. CB cbInBufSize;
  253. LCB lcbInBufOffset;
  254. LPV lpvUser;
  255. FWORDCB lpfnfOutWord;
  256. _LPSIPB lpsipb;
  257. LPCMAP lpCMap = NULL;
  258. /*
  259. * Initialize variables
  260. */
  261. if (lpBrkParms == NULL ||
  262. (lpibi = lpBrkParms->lpInternalBreakInfo) == NULL)
  263. return E_INVALIDARG;
  264. astNormWord = (LPB)lpibi->astNormWord;
  265. astRawWord = (LPB)lpibi->astRawWord;
  266. lpbInBuf = lpBrkParms->lpbBuf;
  267. lpvUser = lpBrkParms->lpvUser;
  268. lpfnfOutWord = lpBrkParms->lpfnOutWord;
  269. lpsipb = lpBrkParms->lpStopInfoBlock;
  270. fAcceptWildCard = (BYTE)(lpBrkParms->fFlags & ACCEPT_WILDCARD);
  271. /*
  272. * Restore to the proper state. This is in place to handle
  273. * words that cross block boundaries, and to deal with explicit
  274. * buffer-flush commands.
  275. */
  276. if ((lpbInBuffer = lpbInBuf) != NULL) {
  277. cbInBufSize = lpBrkParms->cbBufCount;
  278. lcbInBufOffset = lpBrkParms->lcbBufOffset;
  279. if (lpCharTab = lpBrkParms->lpCharTab) {
  280. lpCMap = (LPCMAP)(lpCharTab->lpCMapTab);
  281. lpbLigature = lpCharTab->lpLigature;
  282. wcLigature = lpCharTab->wcLigature;
  283. }
  284. else {
  285. return(E_INVALIDARG);
  286. }
  287. lpbRawWordLimit = (LPB)&astRawWord[CB_MAX_WORD_LEN];
  288. switch (lpibi->state) {
  289. case SCAN_WHITE_STATE:
  290. goto ScanWhite; // Running through white space.
  291. case SCAN_WORD_STATE:
  292. lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
  293. lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
  294. goto ScanWord; // Found one 'a'..'z', collecting.
  295. case SCAN_NUM_STATE:
  296. lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
  297. lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
  298. goto ScanNumber;// Found one '0'..'9', collecting.
  299. case SCAN_LEADBYTE_STATE:
  300. lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
  301. lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
  302. goto ScanLeadByte;
  303. case SCAN_SBKANA_STATE:
  304. lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
  305. lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
  306. goto ScanSbKana;
  307. }
  308. }
  309. else {
  310. cbInBufSize = fScan = 0;
  311. switch (lpibi->state) {
  312. case SCAN_WHITE_STATE:
  313. return S_OK; // Still stuck in white space.
  314. case SCAN_WORD_STATE:
  315. goto FlushWord; // Flush a word.
  316. case SCAN_NUM_STATE:
  317. goto FlushNumber; // Flush a number.
  318. case SCAN_LEADBYTE_STATE:
  319. goto ScanLeadByte;
  320. case SCAN_SBKANA_STATE:
  321. goto ScanSbKana;
  322. }
  323. }
  324. //
  325. // W H I T E - S P A C E S T A T E
  326. //
  327. // While in this state the code is hunting through white-space,
  328. // searching for an alpha character or a digit character. If
  329. // it finds one, it initializes the word and goes to either the
  330. // word-collection state or the number-collection state.
  331. //
  332. ScanWhite:
  333. for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
  334. //
  335. // Get the character and its class.
  336. //
  337. switch (CP_CLASS(&lpCMap[*lpbInBuffer])) {
  338. case CLASS_WILDCARD:
  339. if (fAcceptWildCard == FALSE)
  340. continue;
  341. case CLASS_TYPE: // Found the 1st byte of the special string
  342. case CLASS_CHAR: // Found a non-normalized char
  343. case CLASS_NORM: // Found a normalized character
  344. case CLASS_LIGATURE: // Found a ligature
  345. // jump to the word-collection state.
  346. lpibi->lcb = (DWORD)(lcbInBufOffset +
  347. (lpbInBuffer - lpbInBuf));
  348. lpbRawWord = (LPB)&astRawWord[2];
  349. lpbNormWord = (LPB)&astNormWord[2];
  350. goto ScanWord;
  351. case CLASS_DIGIT: // Found a digit.
  352. lpibi->lcb = (DWORD)(lcbInBufOffset +
  353. (lpbInBuffer - lpbInBuf));
  354. lpibi->cbNormPunctLen = lpibi->cbRawPunctLen = 0;
  355. lpbRawWord = (LPB)&astRawWord[2];
  356. lpbNormWord = (LPB)&astNormWord[2];
  357. goto ScanNumber;
  358. case CLASS_LEADBYTE:
  359. lpibi->lcb = (DWORD)(lcbInBufOffset +
  360. (lpbInBuffer - lpbInBuf));
  361. lpbRawWord = (LPB)&astRawWord[2];
  362. lpbNormWord = (LPB)&astNormWord[2];
  363. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  364. goto ScanLeadByte;
  365. case CLASS_SBKANA:
  366. lpibi->lcb = (DWORD)(lcbInBufOffset +
  367. (lpbInBuffer - lpbInBuf));
  368. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  369. lpbRawWord = (LPB)&astRawWord[2];
  370. lpbNormWord = (LPB)&astNormWord[2];
  371. goto ScanSbKana;
  372. }
  373. }
  374. //
  375. // If I run out of data, set things up so I'll come back
  376. // to this state if the user provides more data.
  377. //
  378. lpibi->state = SCAN_WHITE_STATE;
  379. return S_OK;
  380. ScanWord:
  381. //
  382. // W O R D S T A T E
  383. //
  384. // While in this state the code is attempting to append alpha
  385. // and digit characters to the alpha character it's already
  386. // found. Apostrophes are stripped.
  387. //
  388. for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
  389. //
  390. // Get the character and its class.
  391. //
  392. lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
  393. switch (CP_CLASS(lpCharPropTab)) {
  394. case CLASS_NORM :
  395. case CLASS_DIGIT :
  396. case CLASS_CHAR:
  397. //
  398. // Found a normalized character or a digit.
  399. // Append it to the output buffer.
  400. //
  401. if (lpbRawWord >= lpbRawWordLimit)
  402. return (E_WORDTOOLONG);
  403. *lpbRawWord++ = bCurChar;
  404. *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
  405. break;
  406. case CLASS_LIGATURE:
  407. //
  408. // Found an ligature character. Normalize
  409. // it and append it to the output buffer.
  410. //
  411. if (lpbRawWord >= lpbRawWordLimit)
  412. return (E_WORDTOOLONG);
  413. *lpbRawWord++ = bCurChar;
  414. lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
  415. lpCMap, lpbLigature, wcLigature);
  416. break;
  417. case CLASS_STRIP:
  418. //
  419. // Found an apostrophe or somesuch. Ignore
  420. // this character, but increment the word length,
  421. // since it counts as part of the un-normalized
  422. // word's length.
  423. //
  424. if (lpbRawWord >= lpbRawWordLimit)
  425. return (E_WORDTOOLONG);
  426. *lpbRawWord++ = bCurChar;
  427. break;
  428. case CLASS_TYPE :
  429. /* Set the flag to remind us to get the
  430. second byte.
  431. */
  432. lpibi->fGotType = TRUE;
  433. *lpbRawWord++ = *lpbNormWord++ = bCurChar;
  434. break;
  435. case CLASS_WILDCARD:
  436. //
  437. // Found a wildcard character
  438. // Append it to the output buffer if we accept wildcard
  439. //
  440. if (fAcceptWildCard) {
  441. if (lpbRawWord >= lpbRawWordLimit)
  442. return (E_WORDTOOLONG);
  443. *lpbRawWord++ = bCurChar;
  444. *lpbNormWord++ = bCurChar;
  445. break;
  446. }
  447. default:
  448. if (lpibi->fGotType == TRUE) {
  449. lpibi->fGotType = FALSE;
  450. /* Found a the 2nd byte of a special type
  451. Append it to the output buffer. */
  452. *lpbRawWord++ = *lpbNormWord++ = bCurChar;
  453. break;
  454. }
  455. //
  456. // Found something weird, or I have been ordered
  457. // to flush the output buffer. Flush the output
  458. // buffer and go back to the "grooting through
  459. // white space" state (#0).
  460. //
  461. FlushWord:
  462. if (fScan)
  463. {
  464. /* Recalculate the length only if scanning */
  465. *(LPW)astRawWord = (WORD)(lpbRawWord -
  466. (LPB)&astRawWord[2]);
  467. *(LPW)astNormWord = (WORD)(lpbNormWord -
  468. (LPB)&astNormWord[2]);
  469. }
  470. /* Check for stop word if required */
  471. if (lpsipb)
  472. {
  473. if (lpsipb->lpfnStopListLookup(lpsipb,
  474. astNormWord) == S_OK)
  475. {
  476. goto ScanWhite; // Ignore stop words
  477. }
  478. }
  479. #if 0
  480. if (fStem)
  481. {
  482. /* Do stemming if requested */
  483. if (FStem(astStemmed, astNormWord) == S_OK)
  484. {
  485. MEMCPY(astNormWord, astStemmed, GETWORD(astStemmed)
  486. + sizeof(WORD));
  487. }
  488. }
  489. #endif
  490. /* Execute user's function */
  491. if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord,
  492. lpibi->astNormWord, lpibi->lcb, lpvUser)) != S_OK)
  493. return fRet;
  494. goto ScanWhite;
  495. }
  496. }
  497. //
  498. // If I run out of data, set things up so I'll come back
  499. // to this state if the user provides more data. If they
  500. // just want me to flush, I come back to the "flush a
  501. // word" state (#1f), since at this time I already have
  502. // a valid word, since I got an alpha-char in state #0,
  503. // and may have gotten more since.
  504. //
  505. lpibi->state = SCAN_WORD_STATE;
  506. *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
  507. *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
  508. return S_OK;
  509. ScanLeadByte:
  510. if(!cbInBufSize)
  511. {
  512. // no character - we may have lost a DBC
  513. //
  514. lpibi->state = SCAN_WHITE_STATE;
  515. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  516. return S_OK;
  517. }
  518. if(!GETWORD(astNormWord))
  519. {
  520. // process lead byte
  521. //
  522. *(LPW)astNormWord = *(LPW)astRawWord = 1;
  523. astNormWord[2] = *lpbInBuffer++;
  524. --cbInBufSize;
  525. }
  526. if(!cbInBufSize)
  527. {
  528. // no more characters - set up state so we come back to get trail byte.
  529. //
  530. lpibi->state = SCAN_LEADBYTE_STATE;
  531. return S_OK;
  532. }
  533. // process trail byte
  534. //
  535. *(LPW)astNormWord = *(LPW)astRawWord = 2;
  536. astNormWord[3] = *lpbInBuffer++;
  537. --cbInBufSize;
  538. // flush the DBC
  539. //
  540. if (*lpfnfOutWord &&
  541. (fRet = (*lpfnfOutWord)(astRawWord,astNormWord, lpibi->lcb,lpvUser))
  542. != S_OK)
  543. return fRet;
  544. if(!cbInBufSize)
  545. {
  546. // no more characters - we have already flushed our DBC so we will just
  547. // set the state back to scanning for white space.
  548. //
  549. lpibi->state = SCAN_WHITE_STATE;
  550. return S_OK;
  551. }
  552. // all done - go back to scanning white space.
  553. //
  554. goto ScanWhite;
  555. ScanSbKana:
  556. if(!cbInBufSize)
  557. {
  558. // Buffer is empty. Flush the buffer if we are holding a character.
  559. //
  560. if(GETWORD(astNormWord))
  561. {
  562. if (*lpfnfOutWord &&
  563. (fRet = (*lpfnfOutWord)(astRawWord,astNormWord, lpibi->lcb,lpvUser))
  564. != S_OK)
  565. return fRet;
  566. }
  567. lpibi->state = SCAN_WHITE_STATE;
  568. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  569. return S_OK;
  570. }
  571. // Note: The basic algorithm (including the mapping table) used here to
  572. // convert half-width Katakana characters to full-width Katakana appears
  573. // in the book "Understanding Japanese Information Systems" by
  574. // O'Reily & Associates.
  575. // If the RawWord buffer is empty then we will process this as a first
  576. // character (we are not looking for an diacritic mark).
  577. //
  578. if(!GETWORD(astRawWord))
  579. {
  580. // Verify that we have a half-width Katakana character. This check is
  581. // a good safeguard against erroneous information in a user defined
  582. // charmap.
  583. //
  584. if(*lpbInBuffer >= 161 && *lpbInBuffer <= 223)
  585. {
  586. // We have a half-width Katakana character. Now compute the equivalent
  587. // full-width character via the mapping table.
  588. //
  589. astNormWord[2] = (BYTE)(mtable[*lpbInBuffer-161][0]);
  590. astNormWord[3] = (BYTE)(mtable[*lpbInBuffer-161][1]);
  591. *(LPW)astNormWord = 2;
  592. }
  593. else
  594. {
  595. // This is an error condition. For some reason the charmap has
  596. // *lpbInBuffer tagged as CLASS_SBKANA when in fact it's not
  597. // a single byte Katakana character. This is probably the result
  598. // of an improperly formed user defined charmap.
  599. //
  600. // Since there's no way to determine the real class of this character
  601. // we will send it to the bit bucket.
  602. //
  603. lpbInBuffer++;
  604. cbInBufSize--;
  605. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  606. lpibi->state = SCAN_WHITE_STATE;
  607. goto ScanWhite;
  608. }
  609. *(LPW)astRawWord = 1; // we have processed one character so far
  610. astRawWord[2] = *lpbInBuffer; // we will need the original character later
  611. lpbInBuffer++;
  612. cbInBufSize--;
  613. }
  614. // Check if we have more characters in the buffer.
  615. //
  616. if(!cbInBufSize)
  617. {
  618. // Return because the buffer is empty.
  619. //
  620. lpibi->state = SCAN_SBKANA_STATE;
  621. return S_OK;
  622. }
  623. // check if the second character is nigori mark.
  624. //
  625. if(*lpbInBuffer == 222)
  626. {
  627. // see if we have a half-width katakana that can be modified by nigori.
  628. //
  629. if((astRawWord[1] >= 182 && astRawWord[1] <= 196) ||
  630. (astRawWord[1] >= 202 && astRawWord[1] <= 206) || (astRawWord[1] == 179))
  631. {
  632. // transform kana into kana with maru
  633. //
  634. if((astNormWord[2] >= 74 && astNormWord[2] <= 103) ||
  635. (astNormWord[2] >= 110 && astNormWord[2] <= 122))
  636. astNormWord[2]++;
  637. else if(astNormWord[2] == 131 && astNormWord[3] == 69)
  638. astNormWord[3] = 148;
  639. // set the word lengths and advance the buffer.
  640. //
  641. *(LPW)astNormWord=2;
  642. *(LPW)astRawWord =2;
  643. lpbInBuffer++;
  644. cbInBufSize--;
  645. }
  646. }
  647. // check if following character is maru mark
  648. //
  649. else if(*lpbInBuffer==223)
  650. {
  651. // see if we have a half-width katakana that can be modified by maru.
  652. //
  653. if((astRawWord[2] >= 202 && astRawWord[2] <= 206))
  654. {
  655. // transform kana into kana with nigori
  656. //
  657. if(astNormWord[3] >= 110 && astNormWord[3] <= 122)
  658. astNormWord[3]+=2;
  659. // set the word lengths and advance the buffer.
  660. //
  661. *(LPW)astNormWord=2;
  662. *(LPW)astRawWord=2;
  663. lpbInBuffer++;
  664. cbInBufSize--;
  665. }
  666. }
  667. // Note: If the character at *lpbInBuffer wasn't a diacritic mark, then it
  668. // will be processed when ScanWhite is re-entered.
  669. //
  670. // Another note: The above code only combines diacritic marks with
  671. // single-width Katakana characters that can be modifed
  672. // by these marks (not all can). If we happen to encounter
  673. // a situation where the diacritic can't be combined
  674. // into the character, we let the character continue
  675. // back to ScanWhite where it will be re-sent to
  676. // ScanSbKana, however this time it will be a first
  677. // character and be converted into its stand-alone
  678. // full-width equivalent (maru and nigori have full-width
  679. // character equilalents that contain just the mark).
  680. // flush the buffer
  681. //
  682. if (*lpfnfOutWord &&
  683. (fRet = (*lpfnfOutWord)(astRawWord,astNormWord, lpibi->lcb,lpvUser))
  684. != S_OK)
  685. return fRet;
  686. // reset word lengths and return to scanning for white space.
  687. //
  688. *(LPW)astNormWord = *(LPW)astRawWord = 0;
  689. lpibi->state = SCAN_WHITE_STATE;
  690. // Return if buffer is empty
  691. //
  692. if(!cbInBufSize)
  693. return S_OK;
  694. // all done - go back to scanning white space.
  695. //
  696. goto ScanWhite;
  697. ScanNumber:
  698. //
  699. // N U M B E R S T A T E
  700. //
  701. // While in this state the code is attempting to append alpha
  702. // and digit characters to the digit character it's already
  703. // found. This state is more complex than the word grabbing
  704. // state, because it deals with slashes and hyphens in a weird
  705. // way. They're allowed in a number unless they appear at the
  706. // end. Extra variables have to account for these conditions.
  707. //
  708. for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
  709. //
  710. // Get the character and its class.
  711. //
  712. lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
  713. switch (CP_CLASS(lpCharPropTab)) {
  714. case CLASS_DIGIT :
  715. case CLASS_NORM :
  716. case CLASS_CHAR:
  717. //
  718. // Found a normalized character or a digit.
  719. // Append it to the output buffer.
  720. //
  721. if (lpbRawWord >= lpbRawWordLimit)
  722. return (E_WORDTOOLONG);
  723. *lpbRawWord++ = bCurChar;
  724. *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
  725. lpibi->cbRawPunctLen = 0;
  726. lpibi->cbNormPunctLen = 0;
  727. break;
  728. case CLASS_LIGATURE:
  729. //
  730. // Found an ligature character. Normalize
  731. // it and append it to the output buffer.
  732. //
  733. if (lpbRawWord >= lpbRawWordLimit)
  734. return (E_WORDTOOLONG);
  735. *lpbRawWord++ = bCurChar;
  736. lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
  737. lpCMap, lpbLigature, wcLigature);
  738. lpibi->cbRawPunctLen = 0;
  739. lpibi->cbNormPunctLen = 0;
  740. break;
  741. case CLASS_NKEEP:
  742. //
  743. // Found a hyphen or a slash. These are kept
  744. // as part of the number unless they appear at
  745. // the end of the number.
  746. //
  747. if (lpbRawWord >= lpbRawWordLimit)
  748. return (E_WORDTOOLONG);
  749. *lpbRawWord++ = bCurChar;
  750. *lpbNormWord++= bCurChar;
  751. lpibi->cbRawPunctLen++;
  752. lpibi->cbNormPunctLen++;
  753. break;
  754. case CLASS_NSTRIP:
  755. //
  756. // Found a comma or somesuch. Ignore this
  757. // character, but increment the word length,
  758. // since it counts as part of the un-normalized
  759. // number's length.
  760. //
  761. if (lpbRawWord >= lpbRawWordLimit)
  762. return (E_WORDTOOLONG);
  763. *lpbRawWord++= bCurChar;
  764. lpibi->cbRawPunctLen++;
  765. break;
  766. case CLASS_CONTEXTNSTRIP:
  767. //
  768. // Found special character used for number separator. This
  769. // may be a space in French, ie. 100 000. The problem here
  770. // is that we must differentiate it from a regular word
  771. // separator. In the meantime, ignore this character, but
  772. // increment the word length
  773. //
  774. if (lpbRawWord >= lpbRawWordLimit)
  775. return (E_WORDTOOLONG);
  776. *lpbRawWord++= bCurChar;
  777. lpibi->cbRawPunctLen++;
  778. cbInBufSize--;
  779. lpbInBuffer++;
  780. goto ScanSeparator; // Found a "possible" separator
  781. break;
  782. case CLASS_WILDCARD:
  783. //
  784. // Found a wildcard character
  785. // Append it to the output buffer if we accept wildcard
  786. //
  787. if (fAcceptWildCard) {
  788. if (lpbRawWord >= lpbRawWordLimit)
  789. return (E_WORDTOOLONG);
  790. *lpbRawWord++ = bCurChar;
  791. *lpbNormWord++ = bCurChar;
  792. break;
  793. }
  794. default:
  795. //
  796. // Found something weird, or I have been ordered
  797. // to flush the output buffer. Flush the output
  798. // buffer and go back to the "grooting through
  799. // white space" state (#0).
  800. //
  801. // This is a little more complicated than the
  802. // analogous routine for dealing with words.
  803. // This has to deal with words that have some
  804. // number of trailing punctuation characters.
  805. // These need to be stripped from the word, and
  806. // the un-normalized word length value needs to
  807. // be adjusted as well.
  808. //
  809. FlushNumber:
  810. if (fScan)
  811. {
  812. /* Recalculate the length only if scanning */
  813. *(LPW)astRawWord = (WORD)(lpbRawWord -
  814. (LPB)&astRawWord[2] -
  815. lpibi->cbRawPunctLen);
  816. *(LPW)astNormWord = (WORD)(lpbNormWord -
  817. (LPB)&astNormWord[2] -
  818. lpibi->cbNormPunctLen);
  819. }
  820. /* Check for stop word if required */
  821. if (lpsipb)
  822. {
  823. if (lpsipb->lpfnStopListLookup(lpsipb,
  824. astNormWord) == S_OK)
  825. {
  826. goto ScanWhite; // Ignore stop words
  827. }
  828. }
  829. if (*lpfnfOutWord && (fRet = (*lpfnfOutWord)(astRawWord,
  830. astNormWord, lpibi->lcb, lpvUser)) != S_OK)
  831. return fRet;
  832. goto ScanWhite;
  833. }
  834. }
  835. //
  836. // If I run out of data, set things up so I'll come back
  837. // to this state if the user provides more data. If they
  838. // just want me to flush, I come back to the "flush a
  839. // number" state (#2f), since at this time I already have
  840. // a valid word, since I got an digit-char in state #0,
  841. // and may have gotten more since.
  842. //
  843. lpibi->state = SCAN_NUM_STATE;
  844. *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
  845. *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
  846. return S_OK;
  847. ScanSeparator:
  848. // S E P A R A T O R S T A T E
  849. //
  850. // This state deals with special character used to separate digits
  851. // of numbers. Example:
  852. // 100 000 ' ' is used to separate the digits in French(??)
  853. // In some sense, comma belongs to this class, when we
  854. // deal with US numbers. Because of compability with Liljoe, they
  855. // are set to be CLASS_NSTRIP. The rules to distinguish between
  856. // a digit separator from regular word separator is: If there is a
  857. // digit thats follows, then this is a digit separator, else it is
  858. // a regular word separator
  859. //
  860. if (cbInBufSize) {
  861. //
  862. // Get the character and its class.
  863. //
  864. lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
  865. if (CP_CLASS(lpCharPropTab) == CLASS_DIGIT) {
  866. /* The followed character is a digit, so this must be a digit
  867. * separator. Continue to get the number */
  868. goto ScanNumber;
  869. }
  870. else {
  871. /* Back out the change since this is a word separator */
  872. lpbRawWord--;
  873. *(LPW)astRawWord = (WORD)(lpbRawWord -
  874. (LPB)&astRawWord[2]);
  875. lpibi->cbRawPunctLen--;
  876. goto FlushNumber;
  877. }
  878. }
  879. //
  880. // If I run out of data, set things up so I'll come back
  881. // to this state if the user provides more data.
  882. //
  883. lpibi->state = SCAN_SEP_STATE;
  884. *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
  885. *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
  886. return S_OK;
  887. }
  888. PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
  889. LPCMAP lpCMap, LPB lpbLigatureTab, WORD wcLigature)
  890. {
  891. for (;wcLigature > 0; wcLigature --) {
  892. if (*lpbLigatureTab == c) {
  893. *lpbNormWord++ = CP_NORMC(&lpCMap[lpbLigatureTab[1]]);
  894. *lpbNormWord++ = CP_NORMC(&lpCMap[lpbLigatureTab[2]]);
  895. return 2;
  896. }
  897. lpbLigatureTab += 3;
  898. }
  899. /* Not a ligature */
  900. *lpbNormWord++ = CP_NORMC(&lpCMap[c]);
  901. return 1;
  902. }