Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

318 lines
8.3 KiB

//+-------------------------------------------------------------------------
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
//
// File: porter.hxx
//
// PURPOSE: Simple implementation of the Porter stemming algorithm.
//
// PLATFORM: Windows 2000 and later
//
//--------------------------------------------------------------------------
#pragma once
// Element count (in WCHARs, terminator included) of the fixed-size scratch
// buffers the stemming routines in this file declare on the stack.
const ULONG cwcMaxPorterWord = 128;
//+-------------------------------------------------------------------------
//
//  Function:   has_suffix
//
//  Synopsis:   Tests whether [word] ends with [suffix] and, if so, hands
//              back the part of the word in front of the suffix.
//
//  Arguments:  [word]   -- null-terminated word to examine (not modified)
//              [suffix] -- null-terminated suffix to look for
//              [stem]   -- receives the null-terminated leading part of
//                          [word] when the function returns TRUE.  Must
//                          have room for cwcMaxPorterWord characters.  Its
//                          contents are left alone when FALSE is returned.
//
//  Returns:    TRUE if [word] is strictly longer than [suffix], ends with
//              it, and the remaining stem fits in cwcMaxPorterWord
//              characters including the terminator; FALSE otherwise.
//
//--------------------------------------------------------------------------
inline BOOL has_suffix( WCHAR *word, WCHAR const *suffix, WCHAR *stem )
{
    size_t const cwcWord = wcslen( word );
    size_t const cwcSuffix = wcslen( suffix );

    // A word that is nothing but the suffix (or shorter) has no stem.
    if ( cwcWord <= cwcSuffix )
        return FALSE;

    size_t const cwcStem = cwcWord - cwcSuffix;

    // Every caller visible in this file passes a cwcMaxPorterWord-sized
    // buffer for the stem; refuse anything that would not fit together
    // with its terminator rather than overrun that buffer.
    if ( cwcStem >= cwcMaxPorterWord )
        return FALSE;

    // Compare the tail of the word directly; no scratch copy is needed.
    if ( 0 != wcscmp( word + cwcStem, suffix ) )
        return FALSE;

    stem[0] = 0;
    wcsncat( stem, word, cwcStem );

    return TRUE;
} //has_suffix
//+-------------------------------------------------------------------------
//
//  Function:   vowel
//
//  Synopsis:   Classifies a lowercase letter as vowel or consonant using
//              Porter's definition: a, e, i, o and u are vowels, and 'y'
//              is a vowel only when the letter before it is a consonant.
//
//  Arguments:  [ch]   -- the letter to classify
//              [prev] -- the letter preceding [ch].  Only consulted when
//                        [ch] is 'y'.  The callers in this file pass L'a'
//                        for the first letter of a word, which makes a
//                        leading 'y' a consonant.
//
//  Returns:    TRUE (non-zero) if [ch] is a vowel, FALSE otherwise.
//
//  Notes:      [prev] is classified without its own context (L'?' is
//              passed as its predecessor), so a preceding 'y' counts as a
//              vowel here.
//
//--------------------------------------------------------------------------
inline int vowel( WCHAR ch, WCHAR prev )
{
    switch ( ch )
    {
        case L'a':
        case L'e':
        case L'i':
        case L'o':
        case L'u':
            return TRUE;

        case L'y':
            // 'y' after a consonant is a vowel ("cry"); 'y' after a vowel
            // is a consonant ("toy").
            return !vowel( prev, L'?' );

        default:
            return FALSE;
    }
} //vowel
//+-------------------------------------------------------------------------
//
//  Function:   cvc
//
//  Synopsis:   Tests whether a string ends in consonant-vowel-consonant
//              where the final consonant is not 'w', 'x' or 'y'.
//
//  Arguments:  [string] -- null-terminated string to examine
//
//  Returns:    TRUE (non-zero) if the last three letters match the
//              pattern, FALSE otherwise.  Strings shorter than three
//              characters never match.
//
//--------------------------------------------------------------------------
inline int cvc( WCHAR *string )
{
    size_t const length = wcslen( string );

    if ( length < 3 )
        return FALSE;

    WCHAR const wcLast  = string[length - 1];
    WCHAR const wcMid   = string[length - 2];
    WCHAR const wcFirst = string[length - 3];

    // Letter in front of the three-letter window.  When the window starts
    // the string there is no such letter, so use the same L'a' stand-in
    // the other routines in this file use for the start of a word instead
    // of reading in front of the buffer.
    WCHAR const wcBefore = ( length > 3 ) ? string[length - 4] : L'a';

    if ( vowel( wcLast, wcMid ) )
        return FALSE;

    if ( L'w' == wcLast || L'x' == wcLast || L'y' == wcLast )
        return FALSE;

    if ( !vowel( wcMid, wcFirst ) )
        return FALSE;

    return !vowel( wcFirst, wcBefore );
} //cvc
//+-------------------------------------------------------------------------
//
//  Function:   measure
//
//  Synopsis:   Counts how many times a run of vowels is immediately
//              followed by a consonant in [stem].
//
//  Arguments:  [stem] -- null-terminated string to measure
//
//  Returns:    The number of vowel-to-consonant transitions.
//
//  Notes:      Letters are classified with vowel(); the first letter is
//              classified as if it were preceded by L'a'.
//
//--------------------------------------------------------------------------
inline int measure( WCHAR *stem )
{
    int cTransitions = 0;
    BOOL fPrevWasVowel = FALSE;
    WCHAR wcPrev = L'a';

    for ( WCHAR const * pwcCur = stem; 0 != *pwcCur; pwcCur++ )
    {
        BOOL const fIsVowel = vowel( *pwcCur, wcPrev );

        // A consonant right after a vowel closes one vowel-consonant pair.
        if ( fPrevWasVowel && !fIsVowel )
            cTransitions++;

        fPrevWasVowel = fIsVowel;
        wcPrev = *pwcCur;
    }

    return cTransitions;
} //measure
//+-------------------------------------------------------------------------
//
//  Function:   contains_vowel
//
//  Synopsis:   Reports whether any letter of [word] is classified as a
//              vowel by vowel().
//
//  Arguments:  [word] -- null-terminated string to scan
//
//  Returns:    TRUE if at least one vowel is found, FALSE otherwise
//              (including for an empty string).
//
//  Notes:      The first letter is classified as if preceded by L'a'.
//
//--------------------------------------------------------------------------
inline BOOL contains_vowel( WCHAR *word )
{
    WCHAR wcPrev = L'a';

    for ( WCHAR const * pwcCur = word; 0 != *pwcCur; pwcCur++ )
    {
        if ( vowel( *pwcCur, wcPrev ) )
            return TRUE;

        wcPrev = *pwcCur;
    }

    return FALSE;
} //contains_vowel
//+-------------------------------------------------------------------------
//
//  Function:   PorterStep1
//
//  Synopsis:   Step 1 of the stemmer: strips plural endings (1a), strips
//              "ed"/"ing" and tidies what is left (1b), and turns a
//              final 'y' into 'i' when the stem has a vowel (1c).
//
//  Arguments:  [pwc] -- null-terminated word, rewritten in place.  The
//                       result is never longer than the input.
//
//  Notes:      An empty word is returned untouched.  A word consisting of
//              the single letter 's' is reduced to an empty string.
//
//--------------------------------------------------------------------------
inline void PorterStep1( WCHAR * pwc )
{
    WCHAR stem[ cwcMaxPorterWord ];
    size_t const cwc = wcslen( pwc );

    // Nothing to do, and the code below indexes the last character.
    if ( 0 == cwc )
        return;

    //
    // Step 1a:  sses -> ss,  ies -> i,  ss -> ss,  s -> (nothing)
    //

    if ( L's' == pwc[cwc - 1] )
    {
        if ( has_suffix( pwc, L"sses", stem ) ||
             has_suffix( pwc, L"ies", stem ) )
            pwc[cwc - 2] = L'\0';
        else if ( cwc < 2 || L's' != pwc[cwc - 2] )
            pwc[cwc - 1] = L'\0';
    }

    //
    // Step 1b:  eed -> ee when the stem has measure > 0; otherwise strip
    //           "ed" / "ing" when the stem contains a vowel.
    //

    if ( has_suffix( pwc, L"eed", stem ) )
    {
        if ( measure( stem ) > 0 )
            pwc[wcslen( pwc ) - 1] = L'\0';
    }
    else if ( ( has_suffix( pwc, L"ed", stem ) ||
                has_suffix( pwc, L"ing", stem ) ) &&
              contains_vowel( stem ) )
    {
        // The stem contains a vowel, so it is at least one letter long.
        size_t const cwcStem = wcslen( stem );

        pwc[cwcStem] = L'\0';

        if ( has_suffix( pwc, L"at", stem ) ||
             has_suffix( pwc, L"bl", stem ) ||
             has_suffix( pwc, L"iz", stem ) )
        {
            // at -> ate, bl -> ble, iz -> ize.  At least two letters were
            // just removed, so there is room for the extra 'e'.
            pwc[cwcStem + 1] = L'\0';
            pwc[cwcStem] = L'e';
        }
        else if ( cwcStem >= 2 &&
                  pwc[cwcStem - 1] == pwc[cwcStem - 2] &&
                  !vowel( pwc[cwcStem - 1], pwc[cwcStem - 2] ) &&
                  L'l' != pwc[cwcStem - 1] &&
                  L's' != pwc[cwcStem - 1] &&
                  L'z' != pwc[cwcStem - 1] )
        {
            // Double consonant other than ll/ss/zz: drop one of the pair.
            // A doubled vowel ("see" from "seeing") is left alone.
            pwc[cwcStem - 1] = L'\0';
        }
        else if ( 1 == measure( pwc ) && cvc( pwc ) )
        {
            // Short stem ending consonant-vowel-consonant: restore the 'e'.
            pwc[cwcStem + 1] = L'\0';
            pwc[cwcStem] = L'e';
        }
    }

    //
    // Step 1c:  y -> i when the rest of the word contains a vowel.
    //

    if ( has_suffix( pwc, L"y", stem ) && contains_vowel( stem ) )
        pwc[wcslen( pwc ) - 1] = L'i';
} //PorterStep1
//+-------------------------------------------------------------------------
//
//  Function:   PorterStep2
//
//  Synopsis:   Step 2 of the stemmer: maps a compound suffix onto a
//              shorter one (e.g. "ational" -> "ate") when the stem in
//              front of it has measure > 0.
//
//  Arguments:  [pwc] -- null-terminated word, rewritten in place.  No
//                       replacement in the table is longer than the
//                       suffix it replaces, so the word never grows.
//
//  Notes:      The table lists longer suffixes ahead of any shorter
//              suffix they end with.  Only the first suffix that matches
//              the word is considered: if its stem fails the measure
//              test the word is left unchanged rather than being tried
//              against a shorter suffix.
//
//--------------------------------------------------------------------------
inline void PorterStep2( WCHAR * pwc )
{
    // { suffix to find, replacement }, terminated by a null pair.
    static const WCHAR * const suffixes[][2] =
    {
        { L"ational", L"ate" },
        { L"tional", L"tion" },
        { L"enci", L"ence" },
        { L"anci", L"ance" },
        { L"izer", L"ize" },
        { L"iser", L"ize" },
        { L"abli", L"able" },
        { L"alli", L"al" },
        { L"entli", L"ent" },
        { L"eli", L"e" },
        { L"ousli", L"ous" },
        { L"ization", L"ize" },
        { L"isation", L"ize" },
        { L"ation", L"ate" },
        { L"ator", L"ate" },
        { L"alism", L"al" },
        { L"iveness", L"ive" },
        { L"fulness", L"ful" },
        { L"ousness", L"ous" },
        { L"aliti", L"al" },
        { L"iviti", L"ive" },
        { L"biliti", L"ble" },
        { 0, 0 }
    };

    WCHAR stem[cwcMaxPorterWord];

    for ( int index = 0; 0 != suffixes[index][0]; index++ )
    {
        if ( !has_suffix( pwc, suffixes[index][0], stem ) )
            continue;

        // The stem is the leading part of pwc, so the replacement can be
        // written straight over the old suffix.
        if ( measure( stem ) > 0 )
            wcscpy( pwc + wcslen( stem ), suffixes[index][1] );

        return;
    }
} //PorterStep2
//+-------------------------------------------------------------------------
//
//  Function:   PorterStep3
//
//  Synopsis:   Step 3 of the stemmer: shortens or removes one of a small
//              set of suffixes (e.g. "icate" -> "ic", "ness" -> "") when
//              the stem in front of it has measure > 0.
//
//  Arguments:  [pwc] -- null-terminated word, rewritten in place
//
//  Notes:      Rules are tried in table order; the first rule whose
//              suffix matches and whose stem passes the measure test is
//              applied and the function returns.
//
//--------------------------------------------------------------------------
inline void PorterStep3( WCHAR * pwc )
{
    // { suffix to find, replacement }, terminated by a null pair.
    const WCHAR *rules[][2] =
    {
        { L"icate", L"ic" },
        { L"ative", L"" },
        { L"alize", L"al" },
        { L"alise", L"al" },
        { L"iciti", L"ic" },
        { L"ical", L"ic" },
        { L"ful", L"" },
        { L"ness", L"" },
        { 0, 0 }
    };

    WCHAR wszStem[cwcMaxPorterWord];
    int iRule = 0;

    while ( 0 != rules[iRule][0] )
    {
        const WCHAR * const pwcFind = rules[iRule][0];
        const WCHAR * const pwcReplace = rules[iRule][1];
        iRule++;

        if ( !has_suffix( pwc, pwcFind, wszStem ) )
            continue;

        if ( measure( wszStem ) <= 0 )
            continue;

        wsprintf( pwc, L"%ws%ws", wszStem, pwcReplace );
        return;
    }
} //PorterStep3
//+-------------------------------------------------------------------------
//
//  Function:   PorterStep4
//
//  Synopsis:   Step 4 of the stemmer: removes one residual suffix when
//              the stem in front of it has measure > 1.
//
//  Arguments:  [pwc] -- null-terminated word, truncated in place
//
//  Notes:      The table lists longer suffixes ahead of any shorter
//              suffix they end with ("ement", "ment", "ent").  Only the
//              first suffix that matches the word is considered; if its
//              stem fails the measure test the word is left unchanged.
//
//              "ion" is removed only when the letter in front of it is
//              's' or 't', and that letter stays in the result
//              ("adoption" -> "adopt").
//
//--------------------------------------------------------------------------
inline void PorterStep4( WCHAR * pwc )
{
    static const WCHAR * const suffixes[] =
    {
        L"al", L"ance", L"ence", L"er", L"ic", L"able",
        L"ible", L"ant", L"ement", L"ment", L"ent", L"ion",
        L"ou", L"ism", L"ate", L"iti", L"ous",
        L"ive", L"ize", L"ise", 0
    };

    WCHAR stem[cwcMaxPorterWord];

    for ( int index = 0; 0 != suffixes[index]; index++ )
    {
        if ( !has_suffix( pwc, suffixes[index], stem ) )
            continue;

        // has_suffix only succeeds when the word is longer than the
        // suffix, so the stem has at least one letter.
        size_t const cwcStem = wcslen( stem );

        if ( 0 == wcscmp( suffixes[index], L"ion" ) )
        {
            WCHAR const wcLast = stem[cwcStem - 1];

            if ( L's' != wcLast && L't' != wcLast )
                return;
        }

        // The stem is the leading part of pwc; cut the word off there.
        if ( measure( stem ) > 1 )
            pwc[cwcStem] = L'\0';

        return;
    }
} //PorterStep4
//+-------------------------------------------------------------------------
//
//  Function:   PorterStep5
//
//  Synopsis:   Step 5 of the stemmer: drops a final 'e' (always when the
//              measure is > 1; when the measure is 1 only if the stem
//              does not end consonant-vowel-consonant), then reduces a
//              final "ll" to "l" when the measure is > 1.
//
//  Arguments:  [pwc] -- null-terminated word, truncated in place
//
//  Notes:      Empty and one-letter words are handled without reading in
//              front of the buffer.
//
//--------------------------------------------------------------------------
inline void PorterStep5( WCHAR *pwc )
{
    size_t cwc = wcslen( pwc );

    if ( cwc > 0 && L'e' == pwc[cwc - 1] )
    {
        // A trailing vowel does not add to the measure, so measuring the
        // whole word gives the same answer as measuring the stem.
        int const m = measure( pwc );

        if ( m > 1 )
        {
            pwc[--cwc] = L'\0';
        }
        else if ( 1 == m )
        {
            // Test the stem in place: cut the 'e', and put it back if the
            // stem ends consonant-vowel-consonant.
            pwc[cwc - 1] = L'\0';

            if ( cvc( pwc ) )
                pwc[cwc - 1] = L'e';
            else
                cwc--;
        }
    }

    if ( cwc >= 2 &&
         L'l' == pwc[cwc - 1] &&
         L'l' == pwc[cwc - 2] &&
         measure( pwc ) > 1 )
        pwc[cwc - 1] = L'\0';
} //PorterStep5
//+-------------------------------------------------------------------------
//
//  Function:   GetPorterStemForm
//
//  Synopsis:   Reduces a word to its stem by running stemming steps 1
//              through 5 in order.
//
//  Arguments:  [pwc] -- null-terminated word, rewritten in place.  The
//                       letter tests in this file compare against
//                       lowercase literals only.
//
//  Notes:      A null pointer, an empty string, or a word of
//              cwcMaxPorterWord characters or more is left untouched:
//              the steps copy the word into cwcMaxPorterWord-sized stack
//              buffers and index its last character.
//
//--------------------------------------------------------------------------
inline void GetPorterStemForm( WCHAR * pwc )
{
    if ( 0 == pwc || 0 == pwc[0] )
        return;

    if ( wcslen( pwc ) >= cwcMaxPorterWord )
        return;

    PorterStep1( pwc );

    // Step 1 can strip a one-letter word down to nothing; there is
    // nothing left for the remaining steps to work on.
    if ( 0 == pwc[0] )
        return;

    PorterStep2( pwc );
    PorterStep3( pwc );
    PorterStep4( pwc );
    PorterStep5( pwc );
} //GetPorterStemForm