windows-server-2003/inetsrv/query/apps/lrsample/porter.hxx

//+-------------------------------------------------------------------------
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright 2001 - 2001 Microsoft Corporation.  All Rights Reserved.
//
// File:     porter.hxx
//
// PURPOSE:  Simple implementation of the Porter stemming algorithm.
//
// PLATFORM: Windows 2000 and later
//
//--------------------------------------------------------------------------

#pragma once

// Capacity, in WCHARs including the terminating null, of the fixed-size
// scratch buffers declared by the stemming routines in this file.

const ULONG cwcMaxPorterWord = 128;

//+-------------------------------------------------------------------------
//
//  Function:   has_suffix
//
//  Synopsis:   Tests whether a word ends with a suffix and, if it does,
//              hands back the part of the word that precedes the suffix.
//
//  Arguments:  [word]   -- null-terminated word to examine
//              [suffix] -- null-terminated suffix to look for
//              [stem]   -- receives the word minus the suffix on success.
//                          Every caller in this file passes a buffer of
//                          cwcMaxPorterWord characters; that capacity is
//                          assumed here.
//
//  Returns:    TRUE if the word is strictly longer than the suffix, ends
//              with it, and the stem fits in cwcMaxPorterWord characters.
//              FALSE otherwise, in which case [stem] is not written.
//
//--------------------------------------------------------------------------

inline BOOL has_suffix( WCHAR *word, WCHAR const *suffix, WCHAR *stem )
{
    size_t cwcWord = wcslen( word );
    size_t cwcSuffix = wcslen( suffix );

    // A word that is nothing but the suffix (or shorter) never matches,
    // so a successful match always leaves a non-empty stem.

    if ( cwcWord <= cwcSuffix )
        return FALSE;

    size_t cwcStem = cwcWord - cwcSuffix;

    // Refuse stems that would overrun the caller's buffer instead of
    // copying past its end.

    if ( cwcStem >= cwcMaxPorterWord )
        return FALSE;

    // Compare the tail of the word in place; no temporary copy of the
    // whole word is needed.

    if ( 0 != wcscmp( word + cwcStem, suffix ) )
        return FALSE;

    stem[0] = 0;
    wcsncat( stem, word, cwcStem );

    return TRUE;
} //has_suffix

//+-------------------------------------------------------------------------
//
//  Function:   vowel
//
//  Synopsis:   Classifies a lower-case letter as vowel or consonant using
//              the Porter definition: a, e, i, o and u are vowels, and y
//              is a vowel only when the letter before it is a consonant.
//
//  Arguments:  [ch]   -- the letter to classify
//              [prev] -- the letter preceding [ch].  Callers in this file
//                        pass L'a' for the first letter of a word, which
//                        makes a leading 'y' a consonant.
//
//  Returns:    TRUE if [ch] is a vowel, FALSE otherwise.
//
//--------------------------------------------------------------------------

inline int vowel( WCHAR ch, WCHAR prev )
{
    switch ( ch )
    {
        case L'a':
        case L'e':
        case L'i':
        case L'o':
        case L'u': return TRUE;

        // 'y' takes the opposite class of its predecessor: "toy" ends in
        // a consonant, "sky" ends in a vowel.  The placeholder L'?' stops
        // the recursion after looking one letter back.

        case L'y': return ! vowel( prev, L'?' );

        default  : return FALSE;
    }
} //vowel

//+-------------------------------------------------------------------------
//
//  Function:   cvc
//
//  Synopsis:   Tests whether a string ends consonant-vowel-consonant where
//              the final consonant is not w, x or y.
//
//  Arguments:  [string] -- null-terminated string to examine
//
//  Returns:    Non-zero if the last three letters have that shape, FALSE
//              otherwise (including any string shorter than 3 letters).
//
//--------------------------------------------------------------------------

inline int cvc( WCHAR *string )
{
    int length = (int) wcslen( string );
    if ( length < 3 )
        return FALSE;

    WCHAR wcLast   = string[length-1];
    WCHAR wcMiddle = string[length-2];
    WCHAR wcFirst  = string[length-3];

    // The letter before the three-letter window is only needed to classify
    // a 'y'.  A three-letter string has no such letter, so use the same
    // L'a' stand-in the other routines use for the start of a word rather
    // than reading in front of the buffer.

    WCHAR wcBefore = ( length > 3 ) ? string[length-4] : L'a';

    return ( ( !vowel( wcLast, wcMiddle ) ) &&
             ( wcLast != L'w' ) &&
             ( wcLast != L'x' ) &&
             ( wcLast != L'y' ) &&
             ( vowel( wcMiddle, wcFirst ) ) &&
             ( !vowel( wcFirst, wcBefore ) ) );
} //cvc

//+-------------------------------------------------------------------------
//
//  Function:   measure
//
//  Synopsis:   Counts how many times a run of vowels is followed by a
//              consonant while scanning the stem from left to right.
//
//  Arguments:  [stem] -- null-terminated string to measure
//
//  Returns:    The number of vowel-run / consonant transitions found.
//
//--------------------------------------------------------------------------

inline int measure( WCHAR *stem )
{
    int cwc = wcslen( stem );
    int cTransitions = 0;
    int pos = 0;

    while ( pos < cwc )
    {
        // Advance to the next vowel.  The first letter of the stem has no
        // predecessor, so L'a' is supplied in its place.

        while ( ( pos < cwc ) &&
                ! vowel( stem[pos], ( pos > 0 ) ? stem[pos-1] : L'a' ) )
            pos++;

        // Step over that vowel and any vowels that follow it.  pos is at
        // least 1 from here on, so stem[pos-1] is always valid.

        pos++;
        while ( ( pos < cwc ) && vowel( stem[pos], stem[pos-1] ) )
            pos++;

        // Stopping short of the end means a consonant closed the run.

        if ( pos < cwc )
        {
            cTransitions++;
            pos++;
        }
    }

    return cTransitions;
} //measure

//+-------------------------------------------------------------------------
//
//  Function:   contains_vowel
//
//  Synopsis:   Reports whether any letter of a word is classified as a
//              vowel by vowel().
//
//  Arguments:  [word] -- null-terminated word to scan
//
//  Returns:    TRUE if at least one vowel is found, FALSE otherwise.
//
//--------------------------------------------------------------------------

inline BOOL contains_vowel( WCHAR *word )
{
    // The first letter has no predecessor; L'a' stands in for it.

    WCHAR wcPrev = L'a';

    for ( WCHAR const * pwcCur = word; 0 != *pwcCur; pwcCur++ )
    {
        if ( vowel( *pwcCur, wcPrev ) )
            return TRUE;

        wcPrev = *pwcCur;
    }

    return FALSE;
} //contains_vowel

//+-------------------------------------------------------------------------
//
//  Function:   PorterStep1
//
//  Synopsis:   First step of the stemmer: strips plural endings, strips
//              -eed / -ed / -ing (tidying up the stem that is left), and
//              turns a final y into i when the stem contains a vowel.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, modified in
//                       place.  The word never ends up longer than it
//                       came in: an 'e' is only appended after at least
//                       two characters have been removed.
//
//--------------------------------------------------------------------------

inline void PorterStep1( WCHAR * pwc )
{
    WCHAR stem[ cwcMaxPorterWord ];

    size_t cwc = wcslen( pwc );

    // Nothing to do for an empty word, and indexing the last character
    // below would read in front of the buffer.

    if ( 0 == cwc )
        return;

    // Plurals: sses -> ss, ies -> i, ss -> ss, s -> (nothing)

    if ( pwc[cwc - 1] == L's' )
    {
        if ( has_suffix( pwc, L"sses", stem )  ||
             has_suffix( pwc, L"ies", stem ) )
            pwc[cwc - 2] = L'\0';
        else if ( ( cwc < 2 ) || ( pwc[cwc - 2] != L's' ) )
            pwc[cwc - 1] = L'\0';
    }

    if ( has_suffix( pwc, L"eed", stem ) )
    {
        // eed -> ee, but only when the stem has a non-zero measure

        if ( measure( stem ) > 0 )
            pwc[wcslen( pwc ) - 1] = L'\0';
    }
    else if ( ( has_suffix( pwc, L"ed", stem ) ||
                has_suffix( pwc, L"ing", stem ) ) &&
              ( contains_vowel( stem ) ) )
    {
        // Drop the -ed / -ing, then repair the stem that remains.

        pwc[wcslen( stem )] = L'\0';

        if ( ( has_suffix( pwc, L"at", stem ) ) ||
             ( has_suffix( pwc, L"bl", stem ) ) ||
             ( has_suffix( pwc, L"iz", stem ) ) )
        {
            // at -> ate, bl -> ble, iz -> ize

            size_t cwcNow = wcslen( pwc );
            pwc[cwcNow] = L'e';
            pwc[cwcNow + 1] = L'\0';
        }
        else
        {
            size_t length = wcslen( pwc );

            // A doubled final consonant other than l, s or z loses one
            // letter.  The length test keeps pwc[length-2] inside the
            // buffer, and the vowel test keeps doubled vowels intact
            // (e.g. "see" left over from "seeing").

            if ( ( length >= 2 ) &&
                 ( pwc[length-1] == pwc[length-2] ) &&
                 ( !vowel( pwc[length-1], pwc[length-2] ) ) &&
                 ( pwc[length-1] != L'l' ) &&
                 ( pwc[length-1] != L's' ) &&
                 ( pwc[length-1] != L'z' ) )
                pwc[length-1] = L'\0';
            else if ( ( measure( pwc ) == 1 ) && cvc( pwc ) )
            {
                // Short consonant-vowel-consonant stems get an 'e' back.

                pwc[length] = L'e';
                pwc[length + 1] = L'\0';
            }
        }
    }

    // y -> i when the rest of the word contains a vowel.  has_suffix only
    // succeeds on words longer than the suffix, so the index is valid.

    if ( ( has_suffix( pwc, L"y", stem ) ) &&
         ( contains_vowel( stem ) ) )
        pwc[wcslen( pwc ) - 1] = L'i';
} //PorterStep1

//+-------------------------------------------------------------------------
//
//  Function:   PorterStep2
//
//  Synopsis:   Second step of the stemmer: maps a longer derivational
//              suffix onto a shorter one (e.g. "ational" -> "ate") when
//              the stem in front of it has a measure greater than zero.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, modified in
//                       place.
//
//  Notes:      The table is scanned in order and the first entry whose
//              suffix matches AND whose stem passes the measure test is
//              applied; an entry that matches but fails the measure test
//              does not stop the scan.
//
//--------------------------------------------------------------------------

inline void PorterStep2( WCHAR * pwc )
{
    // { suffix to find, text to put in its place }.  No replacement is
    // longer than the suffix it replaces, so the word never grows.

    static const WCHAR * const suffixes[][2] =
    {
        { L"ational", L"ate" },
        { L"tional",  L"tion" },
        { L"enci",    L"ence" },
        { L"anci",    L"ance" },
        { L"izer",    L"ize" },
        { L"iser",    L"ize" },
        { L"abli",    L"able" },
        { L"alli",    L"al" },
        { L"entli",   L"ent" },
        { L"eli",     L"e" },
        { L"ousli",   L"ous" },
        { L"ization", L"ize" },
        { L"isation", L"ize" },
        { L"ation",   L"ate" },
        { L"ator",    L"ate" },
        { L"alism",   L"al" },
        { L"iveness", L"ive" },
        { L"fulness", L"ful" },
        { L"ousness", L"ous" },
        { L"aliti",   L"al" },
        { L"iviti",   L"ive" },
        { L"biliti",  L"ble" },
        { 0,          0 }
    };

    WCHAR stem[cwcMaxPorterWord];

    for ( int index = 0 ; suffixes[index][0] != 0 ; index++ )
    {
        if ( has_suffix( pwc, suffixes[index][0], stem ) &&
             ( measure( stem ) > 0 ) )
        {
            // The stem is already the leading part of pwc, so only the
            // tail has to be overwritten with the replacement.

            wcscpy( pwc + wcslen( stem ), suffixes[index][1] );
            return;
        }
    }
} //PorterStep2

//+-------------------------------------------------------------------------
//
//  Function:   PorterStep3
//
//  Synopsis:   Third step of the stemmer: shortens or removes a further
//              set of suffixes (e.g. "icate" -> "ic", "ness" -> "") when
//              the stem in front of the suffix has a measure greater than
//              zero.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, modified in
//                       place.
//
//--------------------------------------------------------------------------

inline void PorterStep3( WCHAR * pwc )
{
    // { suffix to find, text to put in its place }.  No replacement is
    // longer than the suffix it replaces, so the word never grows.

    static const WCHAR * const suffixes[][2] =
    {
        { L"icate", L"ic" },
        { L"ative", L"" },
        { L"alize", L"al" },
        { L"alise", L"al" },
        { L"iciti", L"ic" },
        { L"ical",  L"ic" },
        { L"ful",   L"" },
        { L"ness",  L"" },
        { 0,       0 }
    };

    WCHAR stem[cwcMaxPorterWord];

    for ( int index = 0 ; suffixes[index][0] != 0 ; index++ )
    {
        if ( has_suffix( pwc, suffixes[index][0], stem ) &&
             ( measure( stem ) > 0 ) )
        {
            // The stem is already the leading part of pwc, so only the
            // tail has to be overwritten with the replacement.

            wcscpy( pwc + wcslen( stem ), suffixes[index][1] );
            return;
        }
    }
} //PorterStep3

//+-------------------------------------------------------------------------
//
//  Function:   PorterStep4
//
//  Synopsis:   Fourth step of the stemmer: removes a final suffix
//              outright when the stem in front of it has a measure
//              greater than one.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, modified in
//                       place.
//
//  Notes:      Only the first suffix in the table that matches the word
//              is considered.  Where one entry is the tail of another
//              ("ement" / "ment" / "ent") the longer one is listed first,
//              so a word such as "agreement" is judged on "ement" alone
//              and is left intact rather than being cut down by "ent".
//
//              "ion" is removed only when the letter in front of it is
//              s or t, and that letter stays in the word ("adoption"
//              becomes "adopt").
//
//--------------------------------------------------------------------------

inline void PorterStep4( WCHAR * pwc )
{
    static const WCHAR * const suffixes[] =
    {
        L"al", L"ance", L"ence", L"er", L"ic", L"able",
        L"ible", L"ant", L"ement", L"ment", L"ent", L"ion",
        L"ou", L"ism", L"ate", L"iti", L"ous",
        L"ive", L"ize", L"ise", 0
    };

    WCHAR stem[cwcMaxPorterWord];

    for ( int index = 0 ; suffixes[index] != 0 ; index++ )
    {
        if ( !has_suffix( pwc, suffixes[index], stem ) )
            continue;

        BOOL fRemove = ( measure( stem ) > 1 );

        if ( fRemove && ( 0 == wcscmp( suffixes[index], L"ion" ) ) )
        {
            // has_suffix only succeeds when the word is longer than the
            // suffix, so the stem has at least one character.

            WCHAR wcLast = stem[wcslen( stem ) - 1];
            fRemove = ( ( L's' == wcLast ) || ( L't' == wcLast ) );
        }

        if ( fRemove )
            wcscpy( pwc, stem );

        // Matched: whether or not the suffix was removed, no shorter
        // suffix gets a turn.

        return;
    }
} //PorterStep4

//+-------------------------------------------------------------------------
//
//  Function:   PorterStep5
//
//  Synopsis:   Final step of the stemmer: drops a trailing 'e' and
//              reduces a trailing "ll" to "l" when the measure of the
//              word allows it.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, modified in
//                       place.  Empty and one-letter words are handled
//                       without indexing in front of the buffer.
//
//--------------------------------------------------------------------------

inline void PorterStep5( WCHAR *pwc )
{
    size_t cwc = wcslen( pwc );

    if ( ( cwc > 0 ) && ( pwc[cwc-1] == L'e' ) )
    {
        // The measure is taken over the whole word, final 'e' included:
        // a trailing vowel never adds a vowel-consonant transition.

        int m = measure( pwc );

        if ( m > 1 )
        {
            pwc[--cwc] = L'\0';
        }
        else if ( m == 1 )
        {
            // Cut the 'e' off in place so cvc() can look at the stem
            // without copying it; put the 'e' back if the stem ends
            // consonant-vowel-consonant.

            pwc[cwc-1] = L'\0';

            if ( cvc( pwc ) )
                pwc[cwc-1] = L'e';
            else
                cwc--;
        }
    }

    // ll -> l for words with a measure above one

    if ( ( cwc >= 2 ) &&
         ( pwc[cwc-1] == L'l' ) &&
         ( pwc[cwc-2] == L'l' ) &&
         ( measure( pwc ) > 1 ) )
        pwc[cwc-1] = L'\0';
} //PorterStep5

//+-------------------------------------------------------------------------
//
//  Function:   GetPorterStemForm
//
//  Synopsis:   Reduces a word to its stem by running the five stemming
//              steps over it in order.
//
//  Arguments:  [pwc] -- null-terminated lower-case word, replaced in
//                       place by its stem.
//
//  Notes:      A null pointer, an empty word, or a word of
//              cwcMaxPorterWord characters or more is left untouched:
//              the steps index the last characters of the word and copy
//              it into buffers of cwcMaxPorterWord characters.
//
//--------------------------------------------------------------------------

inline void GetPorterStemForm( WCHAR * pwc )
{
    if ( ( 0 == pwc ) || ( 0 == pwc[0] ) )
        return;

    if ( wcslen( pwc ) >= cwcMaxPorterWord )
        return;

    PorterStep1( pwc );
    PorterStep2( pwc );
    PorterStep3( pwc );
    PorterStep4( pwc );
    PorterStep5( pwc );
} //GetPorterStemForm