Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

318 lines
8.3 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
  4. // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  5. // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  6. // PARTICULAR PURPOSE.
  7. //
  8. // Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
  9. //
  10. // File: porter.hxx
  11. //
  12. // PURPOSE: Simple implementation of the Porter stemming algorithm.
  13. //
  14. // PLATFORM: Windows 2000 and later
  15. //
  16. //--------------------------------------------------------------------------
  17. #pragma once
  // Capacity, in WCHARs (terminating NUL included), of the fixed-size scratch
  // arrays that the routines in this file declare to hold a word or its stem.
  18. const ULONG cwcMaxPorterWord = 128;
  19. inline BOOL has_suffix( WCHAR *word, WCHAR const *suffix, WCHAR *stem )
  20. {
  21. WCHAR tmp[cwcMaxPorterWord];
  22. ULONG cwcWord = wcslen( word );
  23. ULONG cwcSuffix = wcslen( suffix );
  24. if ( cwcWord <= cwcSuffix )
  25. return FALSE;
  26. if ( ( cwcSuffix > 1 ) &&
  27. ( word[cwcWord - 2] != suffix[ cwcSuffix - 2] ) )
  28. return FALSE;
  29. stem[0] = 0;
  30. wcsncat( stem, word, cwcWord - cwcSuffix );
  31. wcscpy( tmp, stem );
  32. wcscat( tmp, suffix );
  33. return ( wcscmp ( tmp, word ) == 0 );
  34. } //has_suffix
  35. inline int vowel( WCHAR ch, WCHAR prev )
  36. {
  37. switch ( ch )
  38. {
  39. case 'a':
  40. case 'e':
  41. case 'i':
  42. case 'o':
  43. case 'u': return TRUE;
  44. case 'y': return vowel( prev, L'?' );
  45. default : return FALSE;
  46. }
  47. } //vowel
  48. inline int cvc( WCHAR *string )
  49. {
  50. int length = wcslen( string );
  51. if ( length < 3 )
  52. return FALSE;
  53. return ( ( !vowel( string[length-1], string[length-2] ) ) &&
  54. ( string[length-1] != 'w') &&
  55. ( string[length-1] != 'x') &&
  56. ( string[length-1] != 'y') &&
  57. ( vowel(string[length-2],string[length-3])) &&
  58. ( ( ( length == 3 ) && ( !vowel( string[0], L'a' ) ) ) ||
  59. !vowel( string[length-3], string[length-4] ) ) );
  60. } //cvc
  61. inline int measure( WCHAR *stem )
  62. {
  63. int i=0, count = 0;
  64. int length = wcslen( stem );
  65. while ( i < length )
  66. {
  67. for ( ; i < length ; i++ )
  68. {
  69. if ( i > 0 )
  70. {
  71. if ( vowel( stem[i], stem[i-1] ) )
  72. break;
  73. }
  74. else
  75. {
  76. if ( vowel( stem[i], L'a' ) )
  77. break;
  78. }
  79. }
  80. for ( i++ ; i < length ; i++ )
  81. {
  82. if ( i > 0 )
  83. {
  84. if ( ! vowel( stem[i], stem[i-1] ) )
  85. break;
  86. }
  87. else
  88. {
  89. if ( ! vowel( stem[i], L'?' ) )
  90. break;
  91. }
  92. }
  93. if ( i < length )
  94. {
  95. count++;
  96. i++;
  97. }
  98. }
  99. return count;
  100. } //measure
  101. inline BOOL contains_vowel( WCHAR *word )
  102. {
  103. int i;
  104. int cwc = wcslen( word );
  105. for ( i=0 ; i < cwc; i++ )
  106. {
  107. if ( i > 0 )
  108. {
  109. if ( vowel( word[i], word[i-1] ) )
  110. return TRUE;
  111. }
  112. else
  113. {
  114. if ( vowel( word[0], L'a' ) )
  115. return TRUE;
  116. }
  117. }
  118. return FALSE;
  119. } //contains_vowel
  120. inline void PorterStep1( WCHAR * pwc )
  121. {
  122. WCHAR stem[ cwcMaxPorterWord ];
  123. if ( pwc[wcslen( pwc ) - 1] == L's' )
  124. {
  125. if ( has_suffix( pwc, L"sses", stem ) ||
  126. has_suffix( pwc, L"ies", stem ) )
  127. pwc[wcslen( pwc ) - 2] = '\0';
  128. else if ( pwc[wcslen( pwc ) - 2] != 's' )
  129. pwc[wcslen( pwc ) - 1] = '\0';
  130. }
  131. if ( has_suffix( pwc, L"eed", stem ) )
  132. {
  133. if ( measure(stem) > 0 )
  134. pwc[wcslen(pwc)-1] = '\0';
  135. }
  136. else if ( ( has_suffix( pwc, L"ed", stem ) ||
  137. has_suffix( pwc, L"ing", stem ) ) &&
  138. ( contains_vowel( stem ) ) )
  139. {
  140. pwc[wcslen( stem )] = '\0';
  141. if ( ( has_suffix( pwc, L"at", stem ) ) ||
  142. ( has_suffix( pwc, L"bl", stem ) ) ||
  143. ( has_suffix( pwc, L"iz", stem ) ) )
  144. {
  145. pwc[wcslen( pwc ) + 1] = '\0';
  146. pwc[wcslen( pwc )] = 'e';
  147. }
  148. else
  149. {
  150. int length = wcslen( pwc );
  151. if ( (pwc[length-1] == pwc[length-2]) &&
  152. (pwc[length-1] != 'l') &&
  153. (pwc[length-1] != 's') &&
  154. (pwc[length-1] != 'z') )
  155. pwc[length-1] = '\0';
  156. else if ( measure( pwc ) == 1 )
  157. {
  158. if ( cvc( pwc ) )
  159. {
  160. pwc[wcslen(pwc)+1] = '\0';
  161. pwc[wcslen(pwc)] = 'e';
  162. }
  163. }
  164. }
  165. }
  166. if ( ( has_suffix( pwc, L"y", stem ) ) &&
  167. ( contains_vowel( stem ) ) )
  168. pwc[wcslen( pwc ) - 1] = L'i';
  169. } //PorterStep1
  170. inline void PorterStep2( WCHAR * pwc )
  171. {
  172. const WCHAR *suffixes[][2] =
  173. {
  174. { L"ational", L"ate" },
  175. { L"tional", L"tion" },
  176. { L"enci", L"ence" },
  177. { L"anci", L"ance" },
  178. { L"izer", L"ize" },
  179. { L"iser", L"ize" },
  180. { L"abli", L"able" },
  181. { L"alli", L"al" },
  182. { L"entli", L"ent" },
  183. { L"eli", L"e" },
  184. { L"ousli", L"ous" },
  185. { L"ization", L"ize" },
  186. { L"isation", L"ize" },
  187. { L"ation", L"ate" },
  188. { L"ator", L"ate" },
  189. { L"alism", L"al" },
  190. { L"iveness", L"ive" },
  191. { L"fulness", L"ful" },
  192. { L"ousness", L"ous" },
  193. { L"aliti", L"al" },
  194. { L"iviti", L"ive" },
  195. { L"biliti", L"ble" },
  196. { 0, 0 }
  197. };
  198. WCHAR stem[cwcMaxPorterWord];
  199. int index;
  200. for ( index = 0 ; suffixes[index][0] != 0 ; index++ )
  201. {
  202. if ( has_suffix ( pwc, suffixes[index][0], stem ) )
  203. {
  204. if ( measure ( stem ) > 0 )
  205. {
  206. wsprintf ( pwc, L"%ws%ws", stem, suffixes[index][1] );
  207. return;
  208. }
  209. }
  210. }
  211. } //PorterStep2
  212. inline void PorterStep3( WCHAR * pwc )
  213. {
  214. const WCHAR *suffixes[][2] =
  215. {
  216. { L"icate", L"ic" },
  217. { L"ative", L"" },
  218. { L"alize", L"al" },
  219. { L"alise", L"al" },
  220. { L"iciti", L"ic" },
  221. { L"ical", L"ic" },
  222. { L"ful", L"" },
  223. { L"ness", L"" },
  224. { 0, 0 }
  225. };
  226. WCHAR stem[cwcMaxPorterWord];
  227. int index;
  228. for ( index = 0 ; suffixes[index][0] != 0 ; index++ )
  229. {
  230. if ( has_suffix ( pwc, suffixes[index][0], stem ) )
  231. if ( measure ( stem ) > 0 )
  232. {
  233. wsprintf ( pwc, L"%ws%ws", stem, suffixes[index][1] );
  234. return;
  235. }
  236. }
  237. } //PorterStep3
  238. inline void PorterStep4( WCHAR * pwc )
  239. {
  240. const WCHAR *suffixes[] =
  241. {
  242. L"al", L"ance", L"ence", L"er", L"ic", L"able",
  243. L"ible", L"ant", L"ement", L"ment", L"ent", L"sion",
  244. L"tion", L"ou", L"ism", L"ate", L"iti", L"ous",
  245. L"ive", L"ize", L"ise", 0
  246. };
  247. WCHAR stem[cwcMaxPorterWord];
  248. int index;
  249. for ( index = 0 ; suffixes[index] != 0 ; index++ )
  250. {
  251. if ( ( has_suffix ( pwc, suffixes[index], stem ) ) &&
  252. ( measure ( stem ) > 1 ) )
  253. {
  254. wcscpy( pwc, stem );
  255. return;
  256. }
  257. }
  258. } //PorterStep4
  259. inline void PorterStep5( WCHAR *pwc )
  260. {
  261. if ( pwc[wcslen(pwc)-1] == L'e' )
  262. {
  263. if ( measure(pwc) > 1 )
  264. {
  265. // measure(pwc)==measure(stem) if ends in vowel
  266. pwc[wcslen(pwc)-1] = '\0';
  267. }
  268. else if ( measure(pwc) == 1 )
  269. {
  270. WCHAR stem[cwcMaxPorterWord];
  271. wcscpy(stem,L"");
  272. wcsncat( stem, pwc, wcslen(pwc)-1 );
  273. if ( cvc(stem) == FALSE )
  274. pwc[wcslen(pwc)-1] = '\0';
  275. }
  276. }
  277. if ( (pwc[wcslen(pwc)-1] == L'l') &&
  278. (pwc[wcslen(pwc)-2] == L'l') &&
  279. (measure(pwc) > 1) )
  280. pwc[wcslen(pwc)-1] = L'\0';
  281. } //PorterStep5
  282. inline void GetPorterStemForm( WCHAR * pwc )
  283. {
  284. PorterStep1( pwc );
  285. PorterStep2( pwc );
  286. PorterStep3( pwc );
  287. PorterStep4( pwc );
  288. PorterStep5( pwc );
  289. } //GetPorterStemForm