windows-xp/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/stem.c



								/*************************************************************************

								*                                                                        *

								*  STEM.C                                                                *

								*                                                                        *

								*  Copyright (C) Microsoft Corporation 1990-1994                         *

								*  All Rights reserved.                                                  *

								*                                                                        *

								**************************************************************************

								*                                                                        *

								*  Module Intent                                                         *

								*   This module contains the functions to strip off the suffix of a word *

								*   It is based on the research paper of Dr. Porter, published in        *

								*             An algorithm for suffix stripping                          *

								*          Program, Vol.14, no.3,pp 130-137, July 1980                   *

								*                                                                        *

								*  Description:                                                          *

								*                                                                        *

								*   The full description of the algorithm can be found in that document  *

								*   Basically, the algorithm consists of:                                *

								*      - Matching the suffix from a table of suffixes                    *

								*      - Applies the rule that comes with the suffix                     *

								*      - If the rule matches, then change the suffix to the new one      *

								*                                                                        *

								*  Comments:                                                             *

								*                                                                        *

								*     1/ There are some misconceptions about stripping the suffix        *

								*     People are thinking in term of super-smart algorithm that can      *

								*     strip a word to its stem. The fact is that it is not necessarily   *

								*     true. For example, DIED is stripped to DI, but not DIE.            *

								*                                                                        *

								*     2/ The current code is SLOW, but it is easy to understand in terms *

								*     of implementation, since it is straightforward from the algorithm  *

								*     description. The impact on runtime is nothing. At compile time,    *

								*     stemming 5,000,000 words will take less than 1 hour, which is      *

								*     acceptable, since a project that large requires 1-2 days to        *

								*     compile.                                                           *

								*                                                                        *

								*     To improve the speed (up to 2 times), we can scan the suffix       *

								*     if one letter doesn't match we can jump past all stems that have   *

								*     this letter                                                        *

								*   WARNING: Tab setting is 4 for this file                              *

								*                                                                        *

								**************************************************************************

								*                                                                        *

								*  Current Owner: BinhN                                                  *

								*                                                                        *

								**************************************************************************/

								#include <mvopsys.h>

								#include <memory.h>

								#include <mvsearch.h>

								#include "common.h"


								/* Letter classes. CharTypeTab holds one of these per letter 'a'..'z';
								 * MarkType resolves MIXED into VOWEL or CONSONANT, so the per-letter
								 * type buffer it fills only ever contains VOWEL or CONSONANT.
								 */

								#define VOWEL       0

								#define CONSONANT   1

								#define MIXED       2


								/* FStem leaves words of this length or shorter untouched, and stops
								 * applying rules as soon as the word has shrunk to this length.
								 */

								#define	MIN_LENGTH_FOR_STEM	3


								/* Rule table structure */


								/* One row of a stemming rule table. The tables below use positional
								 * initializers, so the order of the fields must not change. A row whose
								 * szInitSuffix is NULL terminates its table.
								 */

								typedef struct RULE

								{

								    LPB szInitSuffix;           // Suffix to match: one length byte, then the letters

								    LPB szNewSuffix;            // Replacement: one signed length byte ("\377" == -1), then the letters

								    LPB szCondition;            // Postfix stemming condition, NULL means "always"

								    short NextTable;            // Index in RuleTables of the table FStem moves to next

								} RULE, FAR *LPRULE;


								/* The conventional letters used for the stemming conditions are:

								 *

								 *  '1':    Measure == 1

								 *  '2':    Measure > 1

								 *  'd':    Double consonant at the end (*d in the document)

								 *  'o':    Form cvc , and 2nd c is not W, X or Y (*o in the document)

								 *  'p':    Measure > 0

								 *  's':    Remove the last consonant (used with 'd')

								 *  'v':    Word contains vowels (*v* in the document)

								 *  '*':    Terminated with the next letter (*S in the document)

								 *  '&':    AND operation

								 *  '|':    OR operation

								 *  '!':    NOT operation

								 * The rule operation is based on a postfix notation, so "m=1 and *o*" is

								 * described as "1o&"

								 */


								RULE RuleTab0[] =

								{

								    "\4sses",       "\2ss",     NULL,   1,

								    "\3ies",        "\1i",      NULL,   1,

								    "\2ss",         "\2ss",     NULL,   1,

								    "\1s",          "\0",       NULL,   1,

								    NULL,           NULL,       NULL,   1,

								};


								RULE RuleTab1[] =

								{

								    "\3eed",        "\2ee",     "p",    3,

								    "\2ed",         "\0",       "v",    2,

								    "\3ing",        "\0",       "v",    2,

								    NULL,           NULL,       NULL,   3,

								};


								RULE RuleTab2[] =

								{

								    "\2at",         "\3ate",    NULL,   3,

								    "\2bl",         "\3ble",    NULL,   3,

								    "\2iz",         "\3ize",    NULL,   3,


								    /* The following szNewSuffix has a negative \377

								     * (-1) length. It is to be used to reduce a

								     * double consonant ending to single consonant

								     */


								    "\0",           "\377\0",   "*l*s|*z|!d&s", 3,

								    "\0",           "\1e",      "1o&",  3,

								    NULL,           NULL,       NULL,   3,

								};


								RULE RuleTab3[] =

								{

								    "\1y",          "\1i",      "v",    4,

								    NULL,           NULL,       NULL,   4,

								};


								RULE RuleTab4[] =

								{

								    "\7ational",    "\3ate",    "p",    5,

								    "\6tional",     "\4tion",   "p",    5,

								    "\4enci",       "\4ence",   "p",    5,

								    "\4anci",       "\4ance",   "p",    5,

								    "\4izer",       "\3ize",    "p",    5,

								    "\4abli",       "\4able",   "p",    5,

								    "\4alli",       "\2al",     "p",    5,

								    "\5entli",      "\3ent",    "p",    5,

								    "\3eli",        "\1e",      "p",    5,

								    "\5ousli",      "\3ous",    "p",    5,

								    "\7ization",    "\3ize",    "p",    5,

								    "\5ation",      "\3ate",    "p",    5,

								    "\4ator",       "\3ate",    "p",    5,

								    "\5alism",      "\2al",     "p",    5,

								    "\7iveness",    "\3ive",    "p",    5,

								    "\7fulness",    "\3ful",    "p",    5,

								    "\7ousness",    "\3ous",    "p",    5,

								    "\5aliti",      "\2al",     "p",    5,

								    "\5iviti",      "\3ive",    "p",    5,

								    "\6biliti",     "\3ble",    "p",    5,

								    NULL,           NULL,       NULL,   5,

								};


								RULE RuleTab5[] =

								{

								    "\5icate",      "\2ic",     "p",    6,

								    "\5ative",      "\0",       "p",    6,

								    "\5alize",      "\2al",     "p",    6,

								    "\5iciti",      "\2ic",     "p",    6,

								    "\4ical",       "\2ic",     "p",    6,

								    "\3ful",        "\0",       "p",    6,

								    "\4ness",       "\0",       "p",    6,

								    NULL,           NULL,       NULL,   6,

								};


								RULE RuleTab6[] =

								{

								    "\2al",         "\0",       "2",    7,

								    "\4ance",       "\0",       "2",    7,

								    "\4ence",       "\0",       "2",    7,

								    "\2er",         "\0",       "p",    7,

								    "\2ic",         "\0",       "2",    7,

								    "\4able",       "\0",       "2",    7,

								    "\4ible",       "\0",       "2",    7,

								    "\3ant",        "\0",       "2",    7,

								    "\5ement",      "\0",       "2",    7,

								    "\4ment",       "\0",       "2",    7,

								    "\3ent",        "\0",       "2",    7,

								    "\3ion",        "\0",       "2*s*t|&",      7,

								    "\2ou",         "\0",       "2",    7,

								    "\3ism",        "\0",       "2",    7,

								    "\3ate",        "\0",       "2",    7,

								    "\3iti",        "\0",       "2",    7,

								    "\3ous",        "\0",       "2",    7,

								    "\3ive",        "\0",       "2",    7,

								    "\3ize",        "\0",       "2",    7,

								    NULL,           NULL,       NULL,   7,

								};


								RULE RuleTab7[] =

								{

								    "\1e",          "\0",       "2",    8,

								    "\1e",          "\0",       "1o!&", 8,

								    NULL,           NULL,       NULL,   8,

								};


								RULE RuleTab8[] =

								{

								    "\2ll",         "\1l",      "2",    9,

								    "\0",           "\377\0",   "2*l&d&s",  9,

								    NULL,           NULL,       NULL,   9,

								};


								/* Class of each lower case letter, indexed by (letter - 'a').
								 * 'y' is MIXED: MarkType stores it as a vowel when the type recorded
								 * for the preceding position is CONSONANT, otherwise as a consonant.
								 */
								char CharTypeTab[] =
								{
								    /* a b c d */   VOWEL, CONSONANT, CONSONANT, CONSONANT,
								    /* e f g h */   VOWEL, CONSONANT, CONSONANT, CONSONANT,
								    /* i j k l */   VOWEL, CONSONANT, CONSONANT, CONSONANT,
								    /* m n     */   CONSONANT, CONSONANT,
								    /* o p q r */   VOWEL, CONSONANT, CONSONANT, CONSONANT,
								    /* s t     */   CONSONANT, CONSONANT,
								    /* u v w x */   VOWEL, CONSONANT, CONSONANT, CONSONANT,
								    /* y z     */   MIXED, CONSONANT,
								};


								/* Rule tables indexed by RULE.NextTable. FStem starts with entry 0
								 * and stops when it is sent to the NULL entry (index 9).
								 */
								LPRULE RuleTables[] =
								{
								    RuleTab0, RuleTab1, RuleTab2,       /* 0 - 2 */
								    RuleTab3, RuleTab4, RuleTab5,       /* 3 - 5 */
								    RuleTab6, RuleTab7, RuleTab8,       /* 6 - 8 */
								    NULL,                               /* 9: end of stemming */
								};


								/*************************************************************************

								 *

								 *                    INTERNAL PRIVATE FUNCTIONS

								 *  All of them should be declared near

								 *************************************************************************/

								/* Forward declarations of the helpers used by FStem.
								 * NOTE(review): SuffixMatch is declared here, but no definition or
								 * call is visible in this file -- confirm whether it is still needed.
								 */
								int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, int wLength);
								int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
								    LPB szCondition, int wLength);
								int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
								HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType,
								    int wLength);


								/*************************************************************************
								 *
								 *  @doc    API INDEX RETRIEVAL
								 *
								 *  @func   HRESULT PASCAL FAR | FStem |
								 *      Strips the suffix from a word, ie. "stems" it, by walking the
								 *      rule tables starting with RuleTables[0]. In each table the first
								 *      rule whose suffix matches and whose condition holds is applied;
								 *      the applied rule (or the table terminator when none applies)
								 *      names the next table.
								 *
								 *  @parm   LPB | lpbStemWord |
								 *      Buffer receiving the stemmed word, as a string preceded by a
								 *      2-byte length. It must have room for the length, a copy of the
								 *      input word and one terminating 0 byte.
								 *
								 *  @parm   LPB | lpbWord |
								 *      Word to be stemmed, as a string preceded by a 2-byte length
								 *
								 *  @rdesc  S_OK if succeeded, E_INVALIDARG if either argument is NULL,
								 *      E_WORDTOOLONG if the word length is >= CB_MAX_WORD_LEN
								 *
								 *  @comm   The word must be all lower case letters ('?' and '*' are
								 *      also accepted and treated as consonants). A word containing
								 *      any other byte, or a word of MIN_LENGTH_FOR_STEM bytes or less,
								 *      is copied unchanged and S_OK is returned; in those two cases no
								 *      terminating 0 is appended to the copy.
								 *
								 *************************************************************************/

								PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)
								{
								    register int wLength;   // Current length of the word
								    register int i;         // Suffix bytes still to be compared
								    LPRULE lpRuleTab;       // Pointer to rule table
								    LPRULE lpRule;          // Pointer to rule
								    int wLengthSaved;
								    int wNewSuffixLength;   // This must be signed!
								    int wInitSuffixLength;

								    /* Parallel buffer holding the type of each letter. It mirrors the
								     * layout of the word (2-byte length, then one entry per letter),
								     * so it needs 2 bytes more than the longest accepted word.
								     */
								    char lpbWordType [CB_MAX_WORD_LEN + 2];
								    LPB szInitSuffix;
								    LPB szNewSuffix;
								    int TableIndex;         // For debugging purpose only
								    int RuleIndex;          // For debugging purpose only
								    LPB lpbTmp;

								    if (lpbWord == NULL || lpbStemWord == NULL)
								        return E_INVALIDARG;

								    /* The length is also stored in front of the type buffer: MarkType
								     * looks at the entry preceding a 'y', which for the first letter
								     * is the high byte of this length.
								     */
								    wLength = (*(LPW)lpbWordType = *((LPW)lpbWord));
								    if (wLength >= CB_MAX_WORD_LEN)
								        return(E_WORDTOOLONG);

								    /* Copy the word over, length included */
								    MEMCPY (lpbStemWord, lpbWord, wLength + 2);

								    /* Don't do any stemming for words <= 3 bytes */
								    if (wLength <= MIN_LENGTH_FOR_STEM)
								        return S_OK;

								    /* Mark the type of each letter to be consonant or vowel */
								    if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)
								    {
								        /* We got some non alphabetic characters. Just return */
								        return S_OK;
								    }

								    /* Traverse all the tables and check for stemming conditions */
								    for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab != NULL;)
								    {
								        /* Try each rule until one applies or the terminator is hit */
								        for (RuleIndex = 0, lpRule = lpRuleTab;
								            (szInitSuffix = lpRule->szInitSuffix) != NULL;
								            lpRule++, RuleIndex++)
								        {
								            szNewSuffix = lpRule->szNewSuffix;

								            /* The first byte of each suffix is its length. "\377"
								             * stands for -1, so the byte has to be read as a signed
								             * char explicitly: plain char is unsigned on some
								             * compilers, which would turn it into 255.
								             */
								            wNewSuffixLength = (signed char)*szNewSuffix++;
								            wInitSuffixLength = (signed char)*szInitSuffix++;

								            /* The word is too short to end with this suffix */
								            if (wLength < wInitSuffixLength)
								                continue;

								            /* Compare the end of the word with the suffix */
								            lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;
								            for (i = wInitSuffixLength;
								                i > 0 && (*lpbTmp == *szInitSuffix);
								                i--, lpbTmp++, szInitSuffix++)
								                ;

								            if (i != 0) // String comparison fails
								                continue;

								            /* The condition is evaluated on the stem only, ie. on
								             * the word without the matched suffix
								             */
								            wLengthSaved = wLength;
								            wLength -= wInitSuffixLength;

								            if (!ConditionMet (lpbStemWord, lpbWordType,
								                lpRule->szCondition, wLength))
								            {
								                /* Rule doesn't apply, restore the word length */
								                wLength = wLengthSaved;
								                continue;
								            }

								            /* Rule applies, change to the new suffix */
								            if (wNewSuffixLength > 0)
								            {
								                MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,
								                    wNewSuffixLength);

								                /* Update the word type */
								                MarkType (szNewSuffix,
								                    lpbWordType + wLength + 2, wNewSuffixLength);
								            }

								            /* Update the word length. The check is necessary since
								             * we don't want to strip everything
								             */
								            if (wLength + wNewSuffixLength > 0)
								            {
								                wLength += wNewSuffixLength;
								                *(LPW)lpbStemWord = (WORD) wLength;
								            }

								            /* Short enough, stop stemming altogether */
								            if (wLength <= MIN_LENGTH_FOR_STEM)
								                goto Done;

								            break;
								        }

								        /* Go to the next table. lpRule is either the rule that was
								         * applied or the terminator of the current table
								         */
								        lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];
								    }

								Done:
								    /* Terminate the stemmed word with a 0 byte */
								    lpbStemWord[*((LPW)lpbStemWord)+2] = 0;
								    return S_OK;
								}


								/*************************************************************************
								 *
								 *  @doc    INTERNAL
								 *
								 *  @func   int PASCAL NEAR | MeasureCalc |
								 *      Compute the measure of a word from its letter types. A word
								 *      has the shape [C](VC)m[V], where C is a run of consonants,
								 *      V a run of vowels, and the leading C and trailing V are
								 *      optional; m, the number of VC pairs, is the measure. Example:
								 *          architect: m = 3 (arch, it, ect)
								 *          convention: m = 3 (onv, ent, ion)
								 *          lie: m = 0, the leading consonant and the trailing
								 *              vowels don't count
								 *
								 *  @parm   LPB | lpbWordType |
								 *      Type (VOWEL or CONSONANT) of each letter, first letter at
								 *      index 0. Must not be NULL, no check is made.
								 *
								 *  @parm   int | wLength |
								 *      Number of letters to look at
								 *
								 *  @rdesc  Return the measure of the word
								 *
								 *************************************************************************/

								int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)
								{
								    register int iPos = 0;      // Next letter to look at
								    register int cPairs = 0;    // Vowel/consonant pairs found so far

								    /* The leading run of consonants does not count */
								    while (iPos < wLength && lpbWordType[iPos] == CONSONANT)
								        iPos++;

								    while (iPos < wLength)
								    {
								        /* Consume one run of vowels */
								        while (iPos < wLength && lpbWordType[iPos] == VOWEL)
								            iPos++;

								        /* A run of vowels ending the word does not make a pair */
								        if (iPos >= wLength)
								            break;

								        /* Something follows the vowels: one more pair. Consume the
								         * run of consonants that closes it
								         */
								        cPairs++;
								        while (iPos < wLength && lpbWordType[iPos] == CONSONANT)
								            iPos++;
								    }

								    return cPairs;
								}


								/*************************************************************************
								 *
								 *  @doc    INTERNAL
								 *
								 *  @func   int PASCAL NEAR | ConditionMet |
								 *      Evaluate the postfix condition attached to a rule against the
								 *      stem, ie. the first wLength letters of the word.
								 *
								 *  @parm   LPB | lpbWord |
								 *      Buffer containing the word to be stemmed. This is a pascal
								 *      string preceded by a 2-byte length
								 *
								 *  @parm   LPB | lpbWordType |
								 *      Buffer containing the type of each letter of the word. It is
								 *      parallel to lpbWord, 2-byte length included
								 *
								 *  @parm   LPB | szCondition |
								 *      Condition in postfix form. NULL or an empty string means
								 *      that there is no condition
								 *
								 *  @parm   int | wLength |
								 *      Length of the stem
								 *
								 *  @rdesc  TRUE if the condition is met, FALSE otherwise. FALSE is
								 *      also returned for a malformed condition: unknown letter, '*'
								 *      without a letter after it, operator without enough operands,
								 *      or more pending operands than the stack can hold
								 *
								 *  @comm   The 's' letter has a side effect: when the bottom of the
								 *      stack is TRUE, the last letter of the stem is overwritten with
								 *      0 and the lengths stored in front of lpbWord and lpbWordType
								 *      are both set to wLength - 1.
								 *
								 *************************************************************************/

								int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
								    LPB szCondition, int wLength)
								{
								    int Stack[4];       // Operand stack of the postfix expression
								    int StackIndex;     // Index of the top entry, -1 when empty
								    int fOperand;       // Value produced by the current letter
								    int fPush;          // TRUE when fOperand has to be pushed
								    int iScan;          // Scratch index for the vowel search
								    int LastByte;       // Last letter of the stem
								    LPB lpbTmp;
								    LPB lpbTmpType;

								    /* No condition at all: the rule always applies */
								    if (szCondition == NULL || *szCondition == 0)
								        return TRUE;

								    /* Initialize variables
								     * Note: The original code was written for a string preceded by a
								     * 1-byte length. The new format is preceded by a 2-byte length.
								     * To minimize the change, lpbTmp and lpbTmpType point to the 2nd
								     * byte, so that index n is the n-th letter (counting from 1)
								     */
								    StackIndex = -1;
								    Stack[0] = FALSE;
								    lpbTmp = lpbWord + 1;
								    lpbTmpType = lpbWordType + 1;
								    LastByte = lpbTmp[wLength];

								    for (; *szCondition; szCondition++)
								    {
								        fPush = TRUE;
								        fOperand = FALSE;

								        switch (*szCondition)
								        {
								            case '*':   // *S in the document

								                /* The letter to test follows the '*'. Don't step
								                 * over the end of the string if it is missing
								                 */
								                if (szCondition[1] == 0)
								                    return FALSE;

								                /* Check to see if the stem ends with that letter */
								                fOperand = (LastByte == *(++szCondition));
								                break;

								            case 'd':   // *d in the document

								                /* Check to see if the stem ends with a double consonant */
								                fOperand = (wLength > 2 &&
								                    LastByte == lpbTmp[wLength - 1] &&
								                    lpbTmpType[wLength] == CONSONANT);
								                break;

								            case 's':   // Remove the last consonant

								                /* Not an operand, acts on the result computed so far */
								                fPush = FALSE;
								                if (StackIndex < 0)
								                    return FALSE;

								                if (Stack[0])
								                {
								                    lpbTmp[wLength] = 0;
								                    wLength --;
								                    *(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength;
								                }
								                break;

								            case 'v':   // *v* in the document

								                /* Check to see if the stem has a vowel, looking from
								                 * the last letter back to the first one
								                 */
								                for (iScan = wLength;
								                    iScan && lpbTmpType[iScan] != VOWEL; iScan--)
								                    ;

								                fOperand = iScan > 0;
								                break;

								            case 'o':

								                /* *o in the document, ie.
								                 *  - The stem ends with the form cvc
								                 *  - The second c is not W, X, Y
								                 * lpbWordType is indexed directly here, so the last
								                 * letter is at wLength + 1
								                 */
								                fOperand = (wLength >= 3) &&
								                    (lpbWordType[wLength + 1] == CONSONANT) &&
								                    (lpbWordType[wLength] == VOWEL) &&
								                    (lpbWordType[wLength - 1] == CONSONANT) &&
								                    (LastByte != 'w' && LastByte != 'x' && LastByte != 'y');
								                break;

								            /* The conditions below test the measure. If one of them
								             * fails, then the whole condition fails, ie. there is
								             * no need to test anything else. Otherwise TRUE is pushed
								             */

								            case 'p':   // Measure > 0
								                if (!(MeasureCalc (lpbWordType+2, wLength) > 0))
								                    return FALSE;
								                fOperand = TRUE;
								                break;

								            case '2':   // Measure > 1
								                if (!(MeasureCalc (lpbWordType+2, wLength) > 1))
								                    return FALSE;
								                fOperand = TRUE;
								                break;

								            case '1':   // Measure == 1
								                if (!(MeasureCalc (lpbWordType+2, wLength) == 1))
								                    return FALSE;
								                fOperand = TRUE;
								                break;

								            /* The next letters are operators */

								            case '|':

								                /* OR the top 2 stack entries */
								                fPush = FALSE;
								                if (StackIndex < 1)
								                    return FALSE;
								                Stack[StackIndex-1] |= Stack[StackIndex];
								                StackIndex--;
								                break;

								            case '&':

								                /* AND the top 2 stack entries */
								                fPush = FALSE;
								                if (StackIndex < 1)
								                    return FALSE;
								                Stack[StackIndex-1] &= Stack[StackIndex];
								                StackIndex--;
								                break;

								            case '!':

								                /* NOT the top stack entry */
								                fPush = FALSE;
								                if (StackIndex < 0)
								                    return FALSE;
								                Stack[StackIndex] = !Stack[StackIndex];
								                break;

								            default:
								                return FALSE;
								        }

								        if (fPush)
								        {
								            /* Refuse to write past the end of the stack */
								            if (StackIndex + 1 >=
								                (int)(sizeof (Stack) / sizeof (Stack[0])))
								                return FALSE;
								            Stack[++StackIndex] = fOperand;
								        }
								    }

								    /* A well formed expression leaves its result at the bottom */
								    return Stack[0];
								}


								/*************************************************************************
								 *
								 *  @doc    INTERNAL
								 *
								 *  @func   HRESULT PASCAL NEAR | MarkType |
								 *      Record, for each letter of the word, whether it is a CONSONANT
								 *      or a VOWEL
								 *
								 *  @parm   LPB | lpbWord |
								 *      Buffer containing the letters (no length in front)
								 *
								 *  @parm   LPB | lpBufType |
								 *      Buffer to contain the type of the letters. The entry just
								 *      before lpBufType[0] is read when the first letter is 'y', so
								 *      it must be readable
								 *
								 *  @parm   int | wLength |
								 *      Number of letters to mark
								 *
								 *  @rdesc  S_OK, or E_FAIL as soon as a byte is met that is neither
								 *      'a'..'z' nor a wildcard ('?', '*'). The entries marked before
								 *      that byte are left in place
								 *
								 *************************************************************************/

								HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)
								{
								    int iChar;      // Index of the letter being marked
								    int bLetter;    // The letter itself
								    int bClass;     // Its class according to CharTypeTab

								    for (iChar = 0; iChar < wLength; iChar++)
								    {
								        bLetter = lpbWord[iChar];

								        if (bLetter == '?' || bLetter == '*')
								        {
								            /* Wildcard characters count as consonants */
								            lpBufType[iChar] = CONSONANT;
								        }
								        else if (bLetter < 'a' || bLetter > 'z')
								        {
								            return E_FAIL;
								        }
								        else
								        {
								            bClass = CharTypeTab [bLetter - 'a'];

								            if (bClass == MIXED)
								            {
								                /* 'y' is a vowel after a consonant, a consonant
								                 * otherwise
								                 */
								                if (lpBufType[iChar - 1] == CONSONANT)
								                    lpBufType[iChar] = VOWEL;
								                else
								                    lpBufType[iChar] = CONSONANT;
								            }
								            else if (bClass == VOWEL)
								            {
								                lpBufType[iChar] = VOWEL;
								            }
								            else if (bClass == CONSONANT)
								            {
								                lpBufType[iChar] = CONSONANT;
								            }
								        }
								    }

								    return S_OK;
								}