Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

696 lines
22 KiB

  1. /*************************************************************************
  2. * *
  3. * STEM.C *
  4. * *
  5. * Copyright (C) Microsoft Corporation 1990-1994 *
  6. * All Rights reserved. *
  7. * *
  8. **************************************************************************
  9. * *
  10. * Module Intent *
  11. * This module contains the functions to strip off the suffix of a word *
  12. * It is based on the research paper of Dr. Porter, published in *
  13. * An algorithm for suffix stripping *
  14. * Program, Vol.14, no.3,pp 130-137, July 1980 *
  15. * *
  16. * Description: *
  17. * *
  18. * The full description of the algorithm can be found in that document *
  19. * Basically, the algorithm consists of: *
  20. * - Matching the suffix from a table of suffixes *
  21. * - Applies the rule that comes with the suffix *
  22. * - If the rule matches, then change the suffix to the new one *
  23. * *
  24. * Comments: *
  25. * *
  26. * 1/ There are some misconceptions about stripping the suffix. *
  27. * People tend to think in terms of a super-smart algorithm that can *
  28. * strip a word to its stem. The fact is that this is not necessarily *
  29. * true. For example, DIED is stripped to DI, but not DIE. *
  30. * *
  31. * 2/ The current code is SLOW, but it is easy to understand in terms *
  32. * of implementation, since it follows straightforwardly from the *
  33. * algorithm description. The impact at run time is negligible. At *
  34. * compile time, stemming 5,000,000 words will take less than 1 hour, *
  35. * which is acceptable, since a project that large requires 1-2 days *
  36. * to compile. *
  37. * *
  38. * To improve the speed (up to 2 times), we can scan the suffix: *
  39. * if one letter doesn't match, we can jump past all stems that have *
  40. * this letter *
  41. * WARNING: Tab setting is 4 for this file *
  42. * *
  43. **************************************************************************
  44. * *
  45. * Current Owner: BinhN *
  46. * *
  47. **************************************************************************/
  48. #include <mvopsys.h>
  49. #include <memory.h>
  50. #include <mvsearch.h>
  51. #include "common.h"
  52. #define VOWEL 0
  53. #define CONSONANT 1
  54. #define MIXED 2
  55. #define MIN_LENGTH_FOR_STEM 3
  56. /* Rule table structure */
  57. typedef struct RULE
  58. {
  59. LPB szInitSuffix; // Initial suffix
  60. LPB szNewSuffix; // New suffix
  61. LPB szCondition; // Stemming condition
  62. short NextTable; // Next table to jump to
  63. } RULE, FAR *LPRULE;
  64. /* The conventional letter used for the stemming condition are:
  65. *
  66. * '1': Measure == 1
  67. * '2': Measure > 1
  68. * 'd': Double consonant at the end (*d in the document)
  69. * 'o': Form cvc , and 2nd c is not W, X or Y (*o in the document)
  70. * 'p': Measure > 0
  71. * 's': Remove the last consonant (used with 'd')
  72. * 'v': Word contains vowels (*v* in the document)
  73. * '*': Terminated with the next letter (*S in the document)
  74. * '&': AND operation
  75. * '|': OR operation
  76. * '!': NOT operation
  77. * The rule operation is based on a postfix notation, so "m=1 and *o*" is
  78. * described as "1o&"
  79. */
  80. RULE RuleTab0[] =
  81. {
  82. "\4sses", "\2ss", NULL, 1,
  83. "\3ies", "\1i", NULL, 1,
  84. "\2ss", "\2ss", NULL, 1,
  85. "\1s", "\0", NULL, 1,
  86. NULL, NULL, NULL, 1,
  87. };
  88. RULE RuleTab1[] =
  89. {
  90. "\3eed", "\2ee", "p", 3,
  91. "\2ed", "\0", "v", 2,
  92. "\3ing", "\0", "v", 2,
  93. NULL, NULL, NULL, 3,
  94. };
  95. RULE RuleTab2[] =
  96. {
  97. "\2at", "\3ate", NULL, 3,
  98. "\2bl", "\3ble", NULL, 3,
  99. "\2iz", "\3ize", NULL, 3,
  100. /* The following szNewSuffix has a negative \377
  101. * (-1) length. It is to be used to reduce a
  102. * double consonant ending to single consonant
  103. */
  104. "\0", "\377\0", "*l*s|*z|!d&s", 3,
  105. "\0", "\1e", "1o&", 3,
  106. NULL, NULL, NULL, 3,
  107. };
  108. RULE RuleTab3[] =
  109. {
  110. "\1y", "\1i", "v", 4,
  111. NULL, NULL, NULL, 4,
  112. };
  113. RULE RuleTab4[] =
  114. {
  115. "\7ational", "\3ate", "p", 5,
  116. "\6tional", "\4tion", "p", 5,
  117. "\4enci", "\4ence", "p", 5,
  118. "\4anci", "\4ance", "p", 5,
  119. "\4izer", "\3ize", "p", 5,
  120. "\4abli", "\4able", "p", 5,
  121. "\4alli", "\2al", "p", 5,
  122. "\5entli", "\3ent", "p", 5,
  123. "\3eli", "\1e", "p", 5,
  124. "\5ousli", "\3ous", "p", 5,
  125. "\7ization", "\3ize", "p", 5,
  126. "\5ation", "\3ate", "p", 5,
  127. "\4ator", "\3ate", "p", 5,
  128. "\5alism", "\2al", "p", 5,
  129. "\7iveness", "\3ive", "p", 5,
  130. "\7fulness", "\3ful", "p", 5,
  131. "\7ousness", "\3ous", "p", 5,
  132. "\5aliti", "\2al", "p", 5,
  133. "\5iviti", "\3ive", "p", 5,
  134. "\6biliti", "\3ble", "p", 5,
  135. NULL, NULL, NULL, 5,
  136. };
  137. RULE RuleTab5[] =
  138. {
  139. "\5icate", "\2ic", "p", 6,
  140. "\5ative", "\0", "p", 6,
  141. "\5alize", "\2al", "p", 6,
  142. "\5iciti", "\2ic", "p", 6,
  143. "\4ical", "\2ic", "p", 6,
  144. "\3ful", "\0", "p", 6,
  145. "\4ness", "\0", "p", 6,
  146. NULL, NULL, NULL, 6,
  147. };
  148. RULE RuleTab6[] =
  149. {
  150. "\2al", "\0", "2", 7,
  151. "\4ance", "\0", "2", 7,
  152. "\4ence", "\0", "2", 7,
  153. "\2er", "\0", "p", 7,
  154. "\2ic", "\0", "2", 7,
  155. "\4able", "\0", "2", 7,
  156. "\4ible", "\0", "2", 7,
  157. "\3ant", "\0", "2", 7,
  158. "\5ement", "\0", "2", 7,
  159. "\4ment", "\0", "2", 7,
  160. "\3ent", "\0", "2", 7,
  161. "\3ion", "\0", "2*s*t|&", 7,
  162. "\2ou", "\0", "2", 7,
  163. "\3ism", "\0", "2", 7,
  164. "\3ate", "\0", "2", 7,
  165. "\3iti", "\0", "2", 7,
  166. "\3ous", "\0", "2", 7,
  167. "\3ive", "\0", "2", 7,
  168. "\3ize", "\0", "2", 7,
  169. NULL, NULL, NULL, 7,
  170. };
  171. RULE RuleTab7[] =
  172. {
  173. "\1e", "\0", "2", 8,
  174. "\1e", "\0", "1o!&", 8,
  175. NULL, NULL, NULL, 8,
  176. };
  177. RULE RuleTab8[] =
  178. {
  179. "\2ll", "\1l", "2", 9,
  180. "\0", "\377\0", "2*l&d&s", 9,
  181. NULL, NULL, NULL, 9,
  182. };
  183. char CharTypeTab[] =
  184. {
  185. VOWEL, //a
  186. CONSONANT, //b
  187. CONSONANT, //c
  188. CONSONANT, //d
  189. VOWEL, //e
  190. CONSONANT, //f
  191. CONSONANT, //g
  192. CONSONANT, //h
  193. VOWEL, //i
  194. CONSONANT, //j
  195. CONSONANT, //k
  196. CONSONANT, //l
  197. CONSONANT, //m
  198. CONSONANT, //n
  199. VOWEL, //o
  200. CONSONANT, //p
  201. CONSONANT, //q
  202. CONSONANT, //r
  203. CONSONANT, //s
  204. CONSONANT, //t
  205. VOWEL, //u
  206. CONSONANT, //v
  207. CONSONANT, //w
  208. CONSONANT, //x
  209. MIXED, //y, consonant, but may be vowel if after consonant
  210. CONSONANT, //z
  211. };
  212. LPRULE RuleTables[] =
  213. {
  214. RuleTab0,
  215. RuleTab1,
  216. RuleTab2,
  217. RuleTab3,
  218. RuleTab4,
  219. RuleTab5,
  220. RuleTab6,
  221. RuleTab7,
  222. RuleTab8,
  223. NULL,
  224. };
  225. /*************************************************************************
  226. *
  227. * INTERNAL PRIVATE FUNCTIONS
  228. * All of them should be declared near
  229. *************************************************************************/
  230. int PRIVATE PASCAL NEAR MeasureCalc (LPB, int);
  231. int PRIVATE PASCAL NEAR ConditionMet (LPB, LPB, LPB, int);
  232. int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
  233. HRESULT PRIVATE PASCAL NEAR MarkType (LPB, LPB, int);
  234. /*************************************************************************
  235. *
  236. * @doc API INDEX RETRIEVAL
  237. *
  238. * @func HRESULT PASCAL FAR | FStem |
  239. * This function will strip the suffix from a word, ie, "stem" it
  240. *
  241. * @parm LPB | lpbStemWord |
  242. * Buffer to contain the stemmed word
  243. *
  244. * @parm LPB | lpbWord |
  245. * Word to be stemmed
  246. *
  247. * @rdesc S_OK if succeeded, or E_INVALIDARG if the null argument is
  248. * passed
  249. *
  250. * @comm The word passed must have all the letters in lower case for
  251. * The function to work with. WARNING: There is no checking about
  252. * case, so thing can go wrong if the word contains upper case letter
  253. * or non alphabetic letter.
  254. *
  255. *************************************************************************/
  256. PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)
  257. {
  258. register int wLength; // Length of the word
  259. register int i; // Scratch variable
  260. LPRULE lpRuleTab; // Pointer to rule table
  261. LPRULE lpRule; // Pointer to rule
  262. int wLengthSaved;
  263. int wNewSuffixLength; // This must be signed!
  264. int wInitSuffixLength;
  265. char lpbWordType [CB_MAX_WORD_LEN];
  266. LPB szInitSuffix;
  267. LPB szNewSuffix;
  268. int TableIndex; // For debugging purpose only
  269. int RuleIndex; // For debugging purpose only
  270. LPB lpbTmp;
  271. if (lpbWord == NULL)
  272. return E_INVALIDARG;
  273. wLength = (*(LPW)lpbWordType = *((LPW)lpbWord));
  274. if (wLength >= CB_MAX_WORD_LEN)
  275. return(E_WORDTOOLONG);
  276. /* Copy the word over */
  277. MEMCPY (lpbStemWord, lpbWord, wLength + 2);
  278. /* Don't do any stemming for words <= 3 bytes */
  279. if (wLength <= MIN_LENGTH_FOR_STEM)
  280. return S_OK;
  281. /* Mark the type of each letter to be consonant or vowel */
  282. if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)
  283. {
  284. /* We got some non alphabetic characters. Just return */
  285. return S_OK;
  286. }
  287. /* Traverse all the tables and check for stemming conditions */
  288. for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab;)
  289. {
  290. /* Check for each rule */
  291. for (RuleIndex = 0, lpRule = lpRuleTab;
  292. szInitSuffix = lpRule->szInitSuffix; lpRule++, RuleIndex++)
  293. {
  294. szNewSuffix = lpRule->szNewSuffix;
  295. /* The casting is needed to make wNewSuffixLength signed */
  296. wNewSuffixLength = (char)*szNewSuffix++;
  297. wInitSuffixLength = (char)*szInitSuffix++;
  298. /* Check for condition match */
  299. if (wLength >= wInitSuffixLength)
  300. {
  301. lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;
  302. /* Compare the suffixes */
  303. for (i = wInitSuffixLength;
  304. i > 0 && (*lpbTmp == *szInitSuffix);
  305. i--, lpbTmp++, szInitSuffix++);
  306. /* Restore szInitSuffix */
  307. szInitSuffix = lpRule->szInitSuffix;
  308. if (i != 0) // String comparison fails
  309. continue;
  310. /* Save the word length */
  311. wLengthSaved = wLength;
  312. /* Update word length since we don't include the suffix
  313. * length in our computation
  314. */
  315. wLength -= wInitSuffixLength;
  316. /* Now check the stemming condition */
  317. if (ConditionMet (lpbStemWord, lpbWordType,
  318. lpRule->szCondition, wLength))
  319. {
  320. /* Rule applies, change to the new suffix */
  321. if (wNewSuffixLength > 0)
  322. {
  323. MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,
  324. wNewSuffixLength);
  325. /* Update the word type */
  326. MarkType (szNewSuffix,
  327. lpbWordType + wLength + 2, wNewSuffixLength);
  328. }
  329. /* Update the word length
  330. * The check for wLength is necessary since we don't
  331. * want to strip evething
  332. */
  333. if (wLength + wNewSuffixLength > 0)
  334. *(LPW)lpbStemWord = (wLength += wNewSuffixLength);
  335. if (wLength <= MIN_LENGTH_FOR_STEM)
  336. goto Done;
  337. break;
  338. }
  339. else
  340. {
  341. /* Rule doesn't apply, Restore the word length */
  342. wLength = wLengthSaved;
  343. }
  344. }
  345. }
  346. /* Go to the next table */
  347. lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];
  348. }
  349. Done:
  350. lpbStemWord[*((LPW)lpbStemWord)+2] = 0;
  351. return S_OK;
  352. }
  353. /*************************************************************************
  354. *
  355. * @doc INTERNAL
  356. *
  357. * @func int PASCAL NEAR | MeasureCalc |
  358. * Calculate the measure of a word. The measure is defined as
  359. * the pair (VC), where V is the vowels, and C consonants. A word
  360. * is described as [C](VC)m[V], where the first C and the last V are
  361. * optional. m is the measure of the word (or part of word without
  362. * the suffix). Example:
  363. * architect: m = 3 (arch, it, ect)
  364. * convention: m = 3 (onv, ent, ion)
  365. * lie: m = 0, since the first consonant, and the last vowels
  366. * don't count
  367. *
  368. * @parm LPB | lpbWordType |
  369. * Buffer containing word type
  370. *
  371. * @parm int | wLength |
  372. * The length of the word
  373. *
  374. * @rdesc Return the measure of the word
  375. *
  376. *************************************************************************/
  377. int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)
  378. {
  379. register int cMeasure;
  380. #if 0
  381. /* Safety chck
  382. * IFdef out for speed. This is a internal function
  383. */
  384. if (lpbWordType == NULL)
  385. return 0;
  386. #endif
  387. /* Initialize the word measure */
  388. cMeasure = 0;
  389. /* Skip the beginning consonants */
  390. for (;wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++);
  391. /* Get the vowel/consonant pairs */
  392. while (wLength > 0)
  393. {
  394. /* Get all the vowels */
  395. for (; wLength > 0 && *lpbWordType == VOWEL; wLength--, lpbWordType++);
  396. if (wLength > 0)
  397. {
  398. cMeasure ++;
  399. /* Get all the consonants */
  400. for (; wLength > 0 && *lpbWordType == CONSONANT;
  401. wLength--, lpbWordType++);
  402. }
  403. }
  404. return cMeasure;
  405. }
  406. /*************************************************************************
  407. *
  408. * @doc INTERNAL
  409. *
  410. * @func int PASCAL NEAR | ConditionMet |
  411. * This fuction check the condition to be met by a particular
  412. * suffix.
  413. *
  414. * @parm LPB | lpbWord |
  415. * Buffer contains the word to be stemmed> This is a 2-byte prefixed
  416. * pascal string
  417. *
  418. * @parm LPB | lpbWordType |
  419. * Buffer containing the type of each letter of the word. This
  420. * is a parallel buffer
  421. *
  422. * @parm LPB | szCondition |
  423. * Condtion in postfix form
  424. *
  425. * @parm int | wLength |
  426. * Length of the word
  427. *
  428. * @rdesc TRUE, if the condition is met, FALSE otherwise
  429. *
  430. *************************************************************************/
  431. int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
  432. LPB szCondition, int wLength)
  433. {
  434. int StackIndex;
  435. int Stack[4];
  436. int wLengthSaved;
  437. int LastByte;
  438. LPB lpbTmp;
  439. LPB lpbTmpType;
  440. if (szCondition == NULL)
  441. return TRUE;
  442. /* Initialize variables
  443. * Note: The original codes are written for a 1-byte length preceded
  444. * string. The new format is 2-byte preceded string. To minimize the
  445. * change, lpbTmp is used, and points to the 2nd byte
  446. */
  447. StackIndex = -1;
  448. lpbTmp = lpbWord + 1;
  449. lpbTmpType = lpbWordType + 1;
  450. LastByte = lpbTmp[wLength];
  451. while (*szCondition)
  452. {
  453. switch (*szCondition)
  454. {
  455. case '*': // *S in the document
  456. /* Check to see if the stem ends with the next letter */
  457. Stack[++StackIndex] =
  458. (LastByte == *(++szCondition));
  459. break;
  460. case 'd': // *d in the document
  461. /* Check to see if the stem ends with a double consonant */
  462. Stack[++StackIndex] = (wLength > 2 &&
  463. LastByte == lpbTmp[wLength - 1] &&
  464. lpbTmpType[wLength] == CONSONANT);
  465. break;
  466. case 's': // Remove the last consonant
  467. if (Stack[0])
  468. {
  469. lpbTmp[wLength] = 0;
  470. wLength --;
  471. *(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength;
  472. }
  473. break;
  474. case 'v': // *v* in the document
  475. /* Check to see if the word has a vowel */
  476. wLengthSaved = wLength; /* Save the length */
  477. for (; wLength &&
  478. lpbTmpType[wLength] != VOWEL; wLength--);
  479. Stack[++StackIndex] = wLength > 0;
  480. /* Restore the word length */
  481. wLength = wLengthSaved;
  482. break;
  483. case 'o':
  484. /* *o in the document, ie.
  485. - The word ends with the form cvc
  486. - The second c is not W, X, Y
  487. The +2 is for skipping the word length
  488. */
  489. Stack[++StackIndex] = (wLength >= 3) &&
  490. (lpbWordType[wLength + 1] == CONSONANT) &&
  491. (lpbWordType[wLength] == VOWEL) &&
  492. (lpbWordType[wLength - 1] == CONSONANT) &&
  493. (LastByte != 'w' && LastByte != 'x' && LastByte != 'y');
  494. break;
  495. /* The conditions below test Measure. If they fails, then
  496. * the whole condition fails. ie. there is no need to test
  497. * any other conditions. There is no need to save the result
  498. * on the stack
  499. */
  500. case 'p': // Measure > 0
  501. if ((Stack[++StackIndex] =
  502. MeasureCalc (lpbWordType+2, wLength) > 0) == FALSE)
  503. return FALSE;
  504. break;
  505. case '2': // Measure > 1
  506. if ((Stack[++StackIndex] =
  507. MeasureCalc (lpbWordType+2, wLength) > 1) == FALSE)
  508. return FALSE;
  509. break;
  510. case '1': // Measure == 1
  511. if ((Stack[++StackIndex] =
  512. MeasureCalc (lpbWordType+2, wLength) == 1) == FALSE)
  513. return FALSE;
  514. break;
  515. /* The next conditions are operators combination */
  516. case '|':
  517. /* OR the result of the top 2 stack entries */
  518. Stack[StackIndex-1] |= Stack[StackIndex];
  519. StackIndex--;
  520. break;
  521. case '&':
  522. /* AND the result of the top 2 stack entries */
  523. Stack[StackIndex-1] &= Stack[StackIndex];
  524. StackIndex--;
  525. break;
  526. case '!':
  527. /* NOT the result of the top stack entry */
  528. Stack[StackIndex] = !Stack[StackIndex];
  529. break;
  530. default:
  531. return FALSE;
  532. }
  533. szCondition++;
  534. }
  535. return Stack[0];
  536. }
  537. /*************************************************************************
  538. *
  539. * @doc INTERNAL
  540. *
  541. * @func HRESULT PASCAL NEAR | MarkType |
  542. * Marking the type of each letter of the word to be CONSONANT or
  543. * VOWEL
  544. *
  545. * @parm LPB | lpbWord |
  546. * Buffer containing the word
  547. *
  548. * @parm LPB | lpBufType |
  549. * Buffer to contain the type of the letters
  550. *
  551. * @parm int | wLength |
  552. * Length of the word
  553. *
  554. *************************************************************************/
  555. HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)
  556. {
  557. for (; wLength > 0; lpBufType++, lpbWord++, wLength--)
  558. {
  559. /* Consider wildcard characters to be consonnant */
  560. if (*lpbWord == '?' || *lpbWord == '*')
  561. {
  562. *lpBufType = CONSONANT;
  563. continue;
  564. }
  565. if (*lpbWord < 'a' || *lpbWord > 'z')
  566. return E_FAIL;
  567. switch (CharTypeTab [*lpbWord - 'a'])
  568. {
  569. case CONSONANT:
  570. *lpBufType = CONSONANT;
  571. break;
  572. case VOWEL:
  573. *lpBufType = VOWEL;
  574. break;
  575. case MIXED:
  576. if (*(lpBufType - 1) == CONSONANT)
  577. *lpBufType = VOWEL;
  578. else
  579. *lpBufType = CONSONANT;
  580. break;
  581. }
  582. }
  583. return S_OK;
  584. }