/*************************************************************************
 *
 *  STEM.C
 *
 *  Copyright (C) Microsoft Corporation 1990-1994
 *  All Rights reserved.
 *
 *************************************************************************
 *
 *  Module Intent
 *
 *  This module contains the functions to strip off the suffix of a word.
 *  It is based on the research paper of Dr. Porter, published in
 *      An algorithm for suffix stripping
 *      Program, Vol.14, no.3, pp 130-137, July 1980
 *
 *  Description:
 *
 *  The full description of the algorithm can be found in that document.
 *  Basically, the algorithm consists of:
 *     - Matching the suffix against a table of suffixes
 *     - Applying the rule that comes with the suffix
 *     - If the rule matches, changing the suffix to the new one
 *
 *  Comments:
 *
 *  1/ There are some misconceptions about stripping the suffix.
 *     People think in terms of a super-smart algorithm that can
 *     strip a word to its stem. The fact is that it is not necessarily
 *     true. For example, DIED is stripped to DI, but not DIE.
 *
 *  2/ The current code is SLOW, but it is easy to understand in terms
 *     of implementation, since it is straightforward from the algorithm
 *     description. The impact on runtime is nothing. At compile time,
 *     stemming 5,000,000 words will take less than 1 hour, which is
 *     acceptable, since a project that large requires 1-2 days to
 *     compile.
 *
 *     To improve the speed (up to 2 times), we can scan the suffix:
 *     if one letter doesn't match we can jump past all stems that have
 *     this letter.
 *
 *  WARNING: Tab setting is 4 for this file
 *
 *************************************************************************
 *
 *  Current Owner: BinhN
 *
 *************************************************************************/

#include <mvopsys.h>
#include <memory.h>
#include <mvsearch.h>
#include "common.h"
/* Letter classes.  CharTypeTab assigns one of them to each letter 'a'..'z';
 * MarkType resolves MIXED into VOWEL or CONSONANT, depending on the type
 * of the preceding letter, when it fills the per-letter type buffer.
 */
#define VOWEL 0
#define CONSONANT 1
#define MIXED 2
/* Rule table structure */
typedef struct RULE { LPB szInitSuffix; // Initial suffix
LPB szNewSuffix; // New suffix
LPB szCondition; // Stemming condition
short NextTable; // Next table to jump to
/* The conventional letter used for the stemming condition are:
* * '1': Measure == 1 * '2': Measure > 1 * 'd': Double consonant at the end (*d in the document) * 'o': Form cvc , and 2nd c is not W, X or Y (*o in the document) * 'p': Measure > 0 * 's': Remove the last consonant (used with 'd') * 'v': Word contains vowels (*v* in the document) * '*': Terminated with the next letter (*S in the document) * '&': AND operation * '|': OR operation * '!': NOT operation * The rule operation is based on a postfix notation, so "m=1 and *o*" is * described as "1o&" */
RULE RuleTab0[] = { "\4sses", "\2ss", NULL, 1, "\3ies", "\1i", NULL, 1, "\2ss", "\2ss", NULL, 1, "\1s", "\0", NULL, 1, NULL, NULL, NULL, 1, };
RULE RuleTab1[] = { "\3eed", "\2ee", "p", 3, "\2ed", "\0", "v", 2, "\3ing", "\0", "v", 2, NULL, NULL, NULL, 3, };
RULE RuleTab2[] = { "\2at", "\3ate", NULL, 3, "\2bl", "\3ble", NULL, 3, "\2iz", "\3ize", NULL, 3,
/* The following szNewSuffix has a negative \377
* (-1) length. It is to be used to reduce a * double consonant ending to single consonant */
"\0", "\377\0", "*l*s|*z|!d&s", 3, "\0", "\1e", "1o&", 3, NULL, NULL, NULL, 3, };
RULE RuleTab3[] = { "\1y", "\1i", "v", 4, NULL, NULL, NULL, 4, };
RULE RuleTab4[] = { "\7ational", "\3ate", "p", 5, "\6tional", "\4tion", "p", 5, "\4enci", "\4ence", "p", 5, "\4anci", "\4ance", "p", 5, "\4izer", "\3ize", "p", 5, "\4abli", "\4able", "p", 5, "\4alli", "\2al", "p", 5, "\5entli", "\3ent", "p", 5, "\3eli", "\1e", "p", 5, "\5ousli", "\3ous", "p", 5, "\7ization", "\3ize", "p", 5, "\5ation", "\3ate", "p", 5, "\4ator", "\3ate", "p", 5, "\5alism", "\2al", "p", 5, "\7iveness", "\3ive", "p", 5, "\7fulness", "\3ful", "p", 5, "\7ousness", "\3ous", "p", 5, "\5aliti", "\2al", "p", 5, "\5iviti", "\3ive", "p", 5, "\6biliti", "\3ble", "p", 5, NULL, NULL, NULL, 5, };
RULE RuleTab5[] = { "\5icate", "\2ic", "p", 6, "\5ative", "\0", "p", 6, "\5alize", "\2al", "p", 6, "\5iciti", "\2ic", "p", 6, "\4ical", "\2ic", "p", 6, "\3ful", "\0", "p", 6, "\4ness", "\0", "p", 6, NULL, NULL, NULL, 6, };
RULE RuleTab6[] = { "\2al", "\0", "2", 7, "\4ance", "\0", "2", 7, "\4ence", "\0", "2", 7, "\2er", "\0", "p", 7, "\2ic", "\0", "2", 7, "\4able", "\0", "2", 7, "\4ible", "\0", "2", 7, "\3ant", "\0", "2", 7, "\5ement", "\0", "2", 7, "\4ment", "\0", "2", 7, "\3ent", "\0", "2", 7, "\3ion", "\0", "2*s*t|&", 7, "\2ou", "\0", "2", 7, "\3ism", "\0", "2", 7, "\3ate", "\0", "2", 7, "\3iti", "\0", "2", 7, "\3ous", "\0", "2", 7, "\3ive", "\0", "2", 7, "\3ize", "\0", "2", 7, NULL, NULL, NULL, 7, };
RULE RuleTab7[] = { "\1e", "\0", "2", 8, "\1e", "\0", "1o!&", 8, NULL, NULL, NULL, 8, };
RULE RuleTab8[] = { "\2ll", "\1l", "2", 9, "\0", "\377\0", "2*l&d&s", 9, NULL, NULL, NULL, 9, };
char CharTypeTab[] = { VOWEL, //a
VOWEL, //e
VOWEL, //i
VOWEL, //o
VOWEL, //u
MIXED, //y, consonant, but may be vowel if after consonant
/* All the rule tables, indexed by the NextTable field of a rule.
 * FStem starts with entry 0 and stops when it reaches the NULL entry.
 */
LPRULE RuleTables[] =
{
    RuleTab0,
    RuleTab1,
    RuleTab2,
    RuleTab3,
    RuleTab4,
    RuleTab5,
    RuleTab6,
    RuleTab7,
    RuleTab8,
    NULL,
};
/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *  All of them should be declared near
 *************************************************************************/
int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, int wLength);
int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
    LPB szCondition, int wLength);
int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType,
    int wLength);
* * @doc API INDEX RETRIEVAL * * @func HRESULT PASCAL FAR | FStem | * This function will strip the suffix from a word, ie, "stem" it * * @parm LPB | lpbStemWord | * Buffer to contain the stemmed word * * @parm LPB | lpbWord | * Word to be stemmed * * @rdesc S_OK if succeeded, or E_INVALIDARG if the null argument is * passed * * @comm The word passed must have all the letters in lower case for * The function to work with. WARNING: There is no checking about * case, so thing can go wrong if the word contains upper case letter * or non alphabetic letter. * *************************************************************************/
PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord) { register int wLength; // Length of the word
register int i; // Scratch variable
LPRULE lpRuleTab; // Pointer to rule table
LPRULE lpRule; // Pointer to rule
int wLengthSaved; int wNewSuffixLength; // This must be signed!
int wInitSuffixLength; char lpbWordType [CB_MAX_WORD_LEN]; LPB szInitSuffix; LPB szNewSuffix; int TableIndex; // For debugging purpose only
int RuleIndex; // For debugging purpose only
LPB lpbTmp; if (lpbWord == NULL) return E_INVALIDARG; wLength = (*(LPW)lpbWordType = *((LPW)lpbWord)); if (wLength >= CB_MAX_WORD_LEN) return(E_WORDTOOLONG);
/* Copy the word over */ MEMCPY (lpbStemWord, lpbWord, wLength + 2);
/* Don't do any stemming for words <= 3 bytes */ if (wLength <= MIN_LENGTH_FOR_STEM) return S_OK;
/* Mark the type of each letter to be consonant or vowel */ if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK) { /* We got some non alphabetic characters. Just return */ return S_OK; }
/* Traverse all the tables and check for stemming conditions */
for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab;) {
/* Check for each rule */
for (RuleIndex = 0, lpRule = lpRuleTab; szInitSuffix = lpRule->szInitSuffix; lpRule++, RuleIndex++) {
szNewSuffix = lpRule->szNewSuffix;
/* The casting is needed to make wNewSuffixLength signed */ wNewSuffixLength = (char)*szNewSuffix++;
wInitSuffixLength = (char)*szInitSuffix++;
/* Check for condition match */
if (wLength >= wInitSuffixLength) { lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength; /* Compare the suffixes */
for (i = wInitSuffixLength; i > 0 && (*lpbTmp == *szInitSuffix); i--, lpbTmp++, szInitSuffix++);
/* Restore szInitSuffix */ szInitSuffix = lpRule->szInitSuffix;
if (i != 0) // String comparison fails
/* Save the word length */ wLengthSaved = wLength;
/* Update word length since we don't include the suffix
* length in our computation */ wLength -= wInitSuffixLength;
/* Now check the stemming condition */
if (ConditionMet (lpbStemWord, lpbWordType, lpRule->szCondition, wLength)) {
/* Rule applies, change to the new suffix */
if (wNewSuffixLength > 0) {
MEMCPY (&lpbStemWord[wLength+2], szNewSuffix, wNewSuffixLength);
/* Update the word type */
MarkType (szNewSuffix, lpbWordType + wLength + 2, wNewSuffixLength); }
/* Update the word length
* The check for wLength is necessary since we don't * want to strip evething */
if (wLength + wNewSuffixLength > 0) *(LPW)lpbStemWord = (wLength += wNewSuffixLength);
if (wLength <= MIN_LENGTH_FOR_STEM) goto Done;
break; } else {
/* Rule doesn't apply, Restore the word length */ wLength = wLengthSaved; } } }
/* Go to the next table */ lpRuleTab = RuleTables [TableIndex = lpRule->NextTable]; }
Done: lpbStemWord[*((LPW)lpbStemWord)+2] = 0; return S_OK; }
* * @doc INTERNAL * * @func int PASCAL NEAR | MeasureCalc | * Calculate the measure of a word. The measure is defined as * the pair (VC), where V is the vowels, and C consonants. A word * is described as [C](VC)m[V], where the first C and the last V are * optional. m is the measure of the word (or part of word without * the suffix). Example: * architect: m = 3 (arch, it, ect) * convention: m = 3 (onv, ent, ion) * lie: m = 0, since the first consonant, and the last vowels * don't count * * @parm LPB | lpbWordType | * Buffer containing word type * * @parm int | wLength | * The length of the word * * @rdesc Return the measure of the word * *************************************************************************/
int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength) { register int cMeasure;
#if 0
/* Safety chck
* IFdef out for speed. This is a internal function */ if (lpbWordType == NULL) return 0; #endif
/* Initialize the word measure */ cMeasure = 0;
/* Skip the beginning consonants */
for (;wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++);
/* Get the vowel/consonant pairs */ while (wLength > 0) {
/* Get all the vowels */ for (; wLength > 0 && *lpbWordType == VOWEL; wLength--, lpbWordType++);
if (wLength > 0) { cMeasure ++;
/* Get all the consonants */ for (; wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++); } } return cMeasure; }
* * @doc INTERNAL * * @func int PASCAL NEAR | ConditionMet | * This fuction check the condition to be met by a particular * suffix. * * @parm LPB | lpbWord | * Buffer contains the word to be stemmed> This is a 2-byte prefixed * pascal string * * @parm LPB | lpbWordType | * Buffer containing the type of each letter of the word. This * is a parallel buffer * * @parm LPB | szCondition | * Condtion in postfix form * * @parm int | wLength | * Length of the word * * @rdesc TRUE, if the condition is met, FALSE otherwise * *************************************************************************/
int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType, LPB szCondition, int wLength) { int StackIndex; int Stack[4]; int wLengthSaved; int LastByte; LPB lpbTmp; LPB lpbTmpType;
if (szCondition == NULL) return TRUE;
/* Initialize variables
* Note: The original codes are written for a 1-byte length preceded * string. The new format is 2-byte preceded string. To minimize the * change, lpbTmp is used, and points to the 2nd byte */
StackIndex = -1; lpbTmp = lpbWord + 1; lpbTmpType = lpbWordType + 1; LastByte = lpbTmp[wLength];
while (*szCondition) { switch (*szCondition) { case '*': // *S in the document
/* Check to see if the stem ends with the next letter */
Stack[++StackIndex] = (LastByte == *(++szCondition)); break;
case 'd': // *d in the document
/* Check to see if the stem ends with a double consonant */
Stack[++StackIndex] = (wLength > 2 && LastByte == lpbTmp[wLength - 1] && lpbTmpType[wLength] == CONSONANT); break;
case 's': // Remove the last consonant
if (Stack[0]) { lpbTmp[wLength] = 0; wLength --; *(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength; } break;
case 'v': // *v* in the document
/* Check to see if the word has a vowel */
wLengthSaved = wLength; /* Save the length */
for (; wLength && lpbTmpType[wLength] != VOWEL; wLength--);
Stack[++StackIndex] = wLength > 0;
/* Restore the word length */ wLength = wLengthSaved; break;
case 'o': /* *o in the document, ie.
- The word ends with the form cvc - The second c is not W, X, Y The +2 is for skipping the word length */ Stack[++StackIndex] = (wLength >= 3) && (lpbWordType[wLength + 1] == CONSONANT) && (lpbWordType[wLength] == VOWEL) && (lpbWordType[wLength - 1] == CONSONANT) && (LastByte != 'w' && LastByte != 'x' && LastByte != 'y'); break;
/* The conditions below test Measure. If they fails, then
* the whole condition fails. ie. there is no need to test * any other conditions. There is no need to save the result * on the stack */
case 'p': // Measure > 0
if ((Stack[++StackIndex] = MeasureCalc (lpbWordType+2, wLength) > 0) == FALSE) return FALSE; break;
case '2': // Measure > 1
if ((Stack[++StackIndex] = MeasureCalc (lpbWordType+2, wLength) > 1) == FALSE) return FALSE; break;
case '1': // Measure == 1
if ((Stack[++StackIndex] = MeasureCalc (lpbWordType+2, wLength) == 1) == FALSE) return FALSE; break;
/* The next conditions are operators combination */
case '|': /* OR the result of the top 2 stack entries */
Stack[StackIndex-1] |= Stack[StackIndex]; StackIndex--; break;
case '&': /* AND the result of the top 2 stack entries */
Stack[StackIndex-1] &= Stack[StackIndex]; StackIndex--; break;
case '!': /* NOT the result of the top stack entry */
Stack[StackIndex] = !Stack[StackIndex]; break;
default: return FALSE; } szCondition++; }
return Stack[0]; }
* * @doc INTERNAL * * @func HRESULT PASCAL NEAR | MarkType | * Marking the type of each letter of the word to be CONSONANT or * VOWEL * * @parm LPB | lpbWord | * Buffer containing the word * * @parm LPB | lpBufType | * Buffer to contain the type of the letters * * @parm int | wLength | * Length of the word * *************************************************************************/
HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength) { for (; wLength > 0; lpBufType++, lpbWord++, wLength--) {
/* Consider wildcard characters to be consonnant */ if (*lpbWord == '?' || *lpbWord == '*') { *lpBufType = CONSONANT; continue; }
if (*lpbWord < 'a' || *lpbWord > 'z') return E_FAIL;
switch (CharTypeTab [*lpbWord - 'a']) { case CONSONANT: *lpBufType = CONSONANT; break; case VOWEL: *lpBufType = VOWEL; break; case MIXED: if (*(lpBufType - 1) == CONSONANT) *lpBufType = VOWEL; else *lpBufType = CONSONANT; break; } } return S_OK; }