Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

565 lines
16 KiB

  1. //+-------------------------------------------------------------------------
  2. //
  3. // THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
  4. // ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
  5. // THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
  6. // PARTICULAR PURPOSE.
  7. //
  8. // Copyright 2001 - 2001 Microsoft Corporation. All Rights Reserved.
  9. //
  10. // File: stem.hxx
  11. //
  12. // PURPOSE: Classes to read a binary file of stem expansions
  13. //
  14. // PLATFORM: Windows 2000 and later
  15. //
  16. //--------------------------------------------------------------------------
  17. #pragma once
  //
  // Suffix opcodes stored after the root word of a stem set.  Each opcode
  // tells the reader how to turn the root into one inflected form.
  //

  // Single-byte opcodes: append a fixed suffix to the root.

  #define SFX_ADD_D               0xff
  #define SFX_ADD_DED             0xfe
  #define SFX_ADD_DING            0xfd
  #define SFX_ADD_E               0xfc
  #define SFX_ADD_ED              0xfb
  #define SFX_ADD_EN              0xfa
  #define SFX_ADD_ER              0xf9
  #define SFX_ADD_ES              0xf8
  #define SFX_ADD_EST             0xf7
  #define SFX_ADD_ING             0xf6
  #define SFX_ADD_KED             0xf5
  #define SFX_ADD_KING            0xf4
  #define SFX_ADD_LED             0xf3
  #define SFX_ADD_LING            0xf2
  #define SFX_ADD_N               0xf1
  #define SFX_ADD_NER             0xf0
  #define SFX_ADD_R               0xef
  #define SFX_ADD_S               0xee
  #define SFX_ADD_SES             0xed
  #define SFX_ADD_ST              0xec
  #define SFX_ADD_T               0xeb
  #define SFX_ADD_TA              0xea

  // Single-byte opcodes: drop trailing characters of the root, then append.

  #define SFX_DROP_EY_ADD_IER     0xe9
  #define SFX_DROP_EY_ADD_IEST    0xe8
  #define SFX_DROP_E_ADD_ING      0xe7
  #define SFX_DROP_LAST_ADD_T     0xe6
  #define SFX_DROP_ON_ADD_A       0xe5
  #define SFX_DROP_O_ADD_I        0xe4
  #define SFX_DROP_UM_ADD_A       0xe3
  #define SFX_DROP_US_ADD_I       0xe2
  #define SFX_DROP_Y_ADD_IED      0xe1
  #define SFX_DROP_Y_ADD_IER      0xe0
  #define SFX_DROP_Y_ADD_IES      0xdf
  #define SFX_DROP_Y_ADD_IEST     0xde

  // Single-byte opcodes: double the last character of the root, then append.

  #define SFX_REPEATLAST_ADD_ED   0xdd
  #define SFX_REPEATLAST_ADD_ER   0xdc
  #define SFX_REPEATLAST_ADD_EST  0xdb
  #define SFX_REPEATLAST_ADD_ING  0xda

  // Threshold, not an opcode: any value at or above this occupies one byte.

  #define SFX_SINGLE_BYTE         0xd0

  // Multi-byte opcodes:
  //   SFX_SWAP_PENULTIMATE  opcode, replacement character
  //   SFX_PREFIX            opcode, prefix length, suffix length, suffix bytes
  //   SFX_NOPREFIX          opcode, length, literal bytes of the whole form

  #define SFX_SWAP_PENULTIMATE    0xcf
  #define SFX_PREFIX              0xce
  #define SFX_NOPREFIX            0xcd
  60. __inline BOOL IsHighBitSet( BYTE b ) { return ( 0 != ( b & 0x80 ) ); }
  // Size, in chars, of the fixed buffers that hold a root or an expanded form.
  const unsigned int cbMaxStem = 50;

  // Sentinel meaning "no bookmark yet" / "no valid offset".
  const unsigned int stemInvalid = 0xffffffffu;
  //+-------------------------------------------------------------------------
  //
  //  Class:      CDirectoryEntry
  //
  //  Purpose:    One directory slot, packed into a single unsigned value:
  //              the low 24 bits hold a byte offset and the high 8 bits hold
  //              an entry index.
  //
  //  Notes:      The class must stay exactly one unsigned wide with no other
  //              data members; MakeStemObject reads an array of these straight
  //              from the data file.
  //
  //--------------------------------------------------------------------------
  class CDirectoryEntry
  {
  public:
      // Packs an offset and an entry index into the slot.  Bits of 'off'
      // above the 24-bit field are discarded so they cannot corrupt the
      // entry index; bits of 'entry' above 8 are shifted out.
      void Set( unsigned off, unsigned entry )
      {
          value = ( ( entry << cOffsetBits ) | ( off & offsetMask ) );
      }

      // Returns the 24-bit offset.
      unsigned Offset() const
      {
          return ( value & offsetMask );
      }

      // Returns the 8-bit entry index.
      unsigned Entry() const
      {
          return ( ( value >> cOffsetBits ) & 0xff );
      }

  private:
      enum
      {
          cOffsetBits = 24,
          offsetMask  = 0x00ffffff
      };

      unsigned value;
  };
  81. class CStemSet
  82. {
  83. public:
  84. CStemSet( BYTE * pb, unsigned oSet ) : _pb( pb + oSet )
  85. {
  86. _ccRoot = 0;
  87. while ( ( 0 != _pb[_ccRoot] ) &&
  88. ( !IsHighBitSet( _pb[_ccRoot] ) ) )
  89. {
  90. _acRoot[_ccRoot] = _pb[_ccRoot];
  91. _ccRoot++;
  92. }
  93. _acRoot[ _ccRoot ] = 0;
  94. }
  95. BOOL IsGreaterThan( unsigned iEntry, char const * pcKey )
  96. {
  97. char ac[ cbMaxStem ];
  98. unsigned o = stemInvalid;
  99. GetNth( ac, iEntry, o );
  100. return ( strcmp( ac, pcKey ) > 0 );
  101. }
  102. BOOL GetForm( char * pcOut, unsigned & iBmk )
  103. {
  104. return GetNth( pcOut, 0, iBmk );
  105. }
  106. BOOL GetNth( char * pcOut, unsigned iEntry, unsigned & iBmk )
  107. {
  108. BYTE * pbNext = _pb + _ccRoot;
  109. if ( stemInvalid == iBmk )
  110. {
  111. if ( 0 == iEntry )
  112. {
  113. strcpy( pcOut, _acRoot );
  114. iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
  115. return TRUE;
  116. }
  117. unsigned iCurrentEntry = 1;
  118. while ( iCurrentEntry != iEntry )
  119. {
  120. if ( 0 == *pbNext )
  121. break;
  122. if ( *pbNext >= SFX_SINGLE_BYTE )
  123. pbNext++;
  124. else if ( *pbNext == SFX_SWAP_PENULTIMATE )
  125. pbNext += 2;
  126. else if ( *pbNext == SFX_PREFIX )
  127. {
  128. pbNext++;
  129. pbNext++; // prefix
  130. unsigned cb = *pbNext++;
  131. pbNext += cb;
  132. }
  133. else if ( *pbNext == SFX_NOPREFIX )
  134. {
  135. pbNext++;
  136. unsigned cb = *pbNext++;
  137. pbNext += cb;
  138. }
  139. iCurrentEntry++;
  140. }
  141. }
  142. else
  143. {
  144. pbNext = iBmk + _pb;
  145. }
  146. if ( 0 == *pbNext )
  147. {
  148. pbNext++;
  149. iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
  150. return FALSE;
  151. }
  152. strcpy( pcOut, _acRoot );
  153. BYTE bSuffix = *pbNext++;
  154. switch ( bSuffix )
  155. {
  156. case SFX_ADD_S:
  157. strcpy( pcOut + _ccRoot, "s" );
  158. break;
  159. case SFX_ADD_ED:
  160. strcpy( pcOut + _ccRoot, "ed" );
  161. break;
  162. case SFX_ADD_ING:
  163. strcpy( pcOut + _ccRoot, "ing" );
  164. break;
  165. case SFX_ADD_ES:
  166. strcpy( pcOut + _ccRoot, "es" );
  167. break;
  168. case SFX_ADD_D:
  169. strcpy( pcOut + _ccRoot, "d" );
  170. break;
  171. case SFX_ADD_ER:
  172. strcpy( pcOut + _ccRoot, "er" );
  173. break;
  174. case SFX_ADD_N:
  175. strcpy( pcOut + _ccRoot, "n" );
  176. break;
  177. case SFX_ADD_EST:
  178. strcpy( pcOut + _ccRoot, "est" );
  179. break;
  180. case SFX_DROP_E_ADD_ING:
  181. strcpy( pcOut + _ccRoot - 1, "ing" );
  182. break;
  183. case SFX_DROP_Y_ADD_IER:
  184. strcpy( pcOut + _ccRoot - 1, "ier" );
  185. break;
  186. case SFX_DROP_Y_ADD_IES:
  187. strcpy( pcOut + _ccRoot - 1, "ies" );
  188. break;
  189. case SFX_DROP_Y_ADD_IED:
  190. strcpy( pcOut + _ccRoot - 1, "ied" );
  191. break;
  192. case SFX_ADD_SES:
  193. strcpy( pcOut + _ccRoot, "ses" );
  194. break;
  195. case SFX_ADD_E:
  196. strcpy( pcOut + _ccRoot, "e" );
  197. break;
  198. case SFX_ADD_LED:
  199. strcpy( pcOut + _ccRoot, "led" );
  200. break;
  201. case SFX_ADD_NER:
  202. strcpy( pcOut + _ccRoot, "ner" );
  203. break;
  204. case SFX_ADD_DED:
  205. strcpy( pcOut + _ccRoot, "ded" );
  206. break;
  207. case SFX_DROP_Y_ADD_IEST:
  208. strcpy( pcOut + _ccRoot - 1, "iest" );
  209. break;
  210. case SFX_ADD_LING:
  211. strcpy( pcOut + _ccRoot, "ling" );
  212. break;
  213. case SFX_ADD_DING:
  214. strcpy( pcOut + _ccRoot, "ding" );
  215. break;
  216. case SFX_REPEATLAST_ADD_ER:
  217. pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
  218. strcpy( pcOut + _ccRoot + 1, "er" );
  219. break;
  220. case SFX_REPEATLAST_ADD_EST:
  221. pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
  222. strcpy( pcOut + _ccRoot + 1, "est" );
  223. break;
  224. case SFX_REPEATLAST_ADD_ED:
  225. pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
  226. strcpy( pcOut + _ccRoot + 1, "ed" );
  227. break;
  228. case SFX_REPEATLAST_ADD_ING:
  229. pcOut[ _ccRoot ] = pcOut[ _ccRoot - 1 ];
  230. strcpy( pcOut + _ccRoot + 1, "ing" );
  231. break;
  232. case SFX_ADD_R:
  233. strcpy( pcOut + _ccRoot, "r" );
  234. break;
  235. case SFX_ADD_ST:
  236. strcpy( pcOut + _ccRoot, "st" );
  237. break;
  238. case SFX_DROP_O_ADD_I:
  239. break;
  240. case SFX_ADD_KED:
  241. strcpy( pcOut + _ccRoot, "ked" );
  242. break;
  243. case SFX_ADD_KING:
  244. strcpy( pcOut + _ccRoot, "king" );
  245. break;
  246. case SFX_ADD_TA:
  247. strcpy( pcOut + _ccRoot, "ta" );
  248. break;
  249. case SFX_DROP_EY_ADD_IER:
  250. strcpy( pcOut + _ccRoot - 2, "ier" );
  251. break;
  252. case SFX_DROP_EY_ADD_IEST:
  253. strcpy( pcOut + _ccRoot - 2, "iest" );
  254. break;
  255. case SFX_DROP_US_ADD_I:
  256. strcpy( pcOut + _ccRoot - 2, "i" );
  257. break;
  258. case SFX_DROP_UM_ADD_A:
  259. strcpy( pcOut + _ccRoot - 2, "a" );
  260. break;
  261. case SFX_ADD_T:
  262. strcpy( pcOut + _ccRoot, "t" );
  263. break;
  264. case SFX_ADD_EN:
  265. strcpy( pcOut + _ccRoot, "en" );
  266. break;
  267. case SFX_DROP_ON_ADD_A:
  268. break;
  269. case SFX_DROP_LAST_ADD_T:
  270. strcpy( pcOut + _ccRoot - 1, "t" );
  271. break;
  272. case SFX_SWAP_PENULTIMATE:
  273. pcOut[ _ccRoot - 2 ] = *pbNext;
  274. pbNext++;
  275. break;
  276. case SFX_PREFIX:
  277. {
  278. unsigned ccPrefix = *pbNext++;
  279. unsigned ccSuffix = *pbNext++;
  280. CopyMemory( pcOut + ccPrefix, pbNext, ccSuffix );
  281. pcOut[ ccPrefix + ccSuffix ] = 0;
  282. pbNext += ccSuffix;
  283. break;
  284. }
  285. case SFX_NOPREFIX:
  286. {
  287. unsigned cc = *pbNext++;
  288. for ( unsigned i = 0; i < cc; i++ )
  289. pcOut[i] = *pbNext++;
  290. pcOut[i] = 0;
  291. break;
  292. }
  293. }
  294. iBmk = (unsigned) (ULONG_PTR) ( pbNext - _pb );
  295. return TRUE;
  296. }
  297. private:
  298. BYTE * _pb;
  299. unsigned _ccRoot;
  300. char _acRoot[ cbMaxStem ];
  301. };
  302. class CStem
  303. {
  304. public:
  305. CStem( unsigned cDirectory,
  306. CDirectoryEntry * pDirectory,
  307. unsigned cbKeys,
  308. BYTE * pbKeys ) :
  309. _pbKeys( pbKeys ),
  310. _cbKeys( cbKeys ),
  311. _cDirectory( cDirectory ),
  312. _pDirectory( pDirectory )
  313. {
  314. }
  315. ~CStem()
  316. {
  317. delete [] _pDirectory;
  318. delete [] _pbKeys;
  319. }
  320. BOOL FindStemSet( char const * pcKey,
  321. unsigned & iBmk,
  322. unsigned & iStemSet )
  323. {
  324. unsigned oNext = stemInvalid;
  325. char ac[ cbMaxStem ];
  326. if ( stemInvalid == iBmk )
  327. {
  328. // Find a match using the directory
  329. iBmk = FirstList( pcKey );
  330. // Backup until the first match is found
  331. while ( iBmk > 0 )
  332. {
  333. unsigned o = _pDirectory[ iBmk-1 ].Offset();
  334. unsigned e = _pDirectory[ iBmk-1 ].Entry();
  335. CStemSet set( _pbKeys, o );
  336. set.GetNth( ac, e, oNext );
  337. if ( !strcmp( ac, pcKey ) )
  338. iBmk--;
  339. else
  340. break;
  341. }
  342. }
  343. else
  344. {
  345. iBmk++;
  346. }
  347. // Return the list if an entry is found that maches
  348. unsigned o = _pDirectory[ iBmk ].Offset();
  349. unsigned e = _pDirectory[ iBmk ].Entry();
  350. CStemSet set( _pbKeys, o );
  351. oNext = stemInvalid;
  352. set.GetNth( ac, e, oNext );
  353. if ( !strcmp( ac, pcKey ) )
  354. {
  355. iStemSet = o;
  356. return TRUE;
  357. }
  358. return FALSE;
  359. }
  360. unsigned SkipList( unsigned oList )
  361. {
  362. CStemSet set( _pbKeys, oList );
  363. char ac[ cbMaxStem ];
  364. unsigned i = 1;
  365. unsigned o = stemInvalid;
  366. while ( set.GetNth( ac, i, o ) )
  367. i++;
  368. o += oList;
  369. if ( o >= _cbKeys )
  370. return stemInvalid;
  371. return o;
  372. }
  373. unsigned GetNth( char * pcOut, unsigned oList, unsigned iEntry )
  374. {
  375. CStemSet set( _pbKeys, oList );
  376. unsigned o = stemInvalid;
  377. return set.GetNth( pcOut, iEntry, o );
  378. }
  379. BYTE * GetStemSetRoot() { return _pbKeys; }
  380. unsigned GetDirectoryCount() { return _cDirectory; }
  381. CDirectoryEntry * GetDirectory() { return _pDirectory; }
  382. private:
  383. unsigned FirstList( char const * pcKey )
  384. {
  385. unsigned iHi = _cDirectory - 1;
  386. unsigned iLo = 0;
  387. unsigned cKeys = _cDirectory;
  388. // do a binary search looking for the key
  389. do
  390. {
  391. unsigned cHalf = cKeys / 2;
  392. if ( 0 != cHalf )
  393. {
  394. unsigned cTmp = cHalf - 1 + ( cKeys & 1 );
  395. unsigned iMid = iLo + cTmp;
  396. CStemSet set( _pbKeys, _pDirectory[ iMid ].Offset() );
  397. if ( set.IsGreaterThan( _pDirectory[ iMid ].Entry(),
  398. pcKey ) )
  399. {
  400. iHi = iMid - 1;
  401. cKeys = cTmp;
  402. }
  403. else
  404. {
  405. CStemSet set( _pbKeys, _pDirectory[ iMid + 1 ].Offset() );
  406. if ( ! set.IsGreaterThan( _pDirectory[ iMid + 1 ].Entry(),
  407. pcKey ) )
  408. {
  409. iLo = iMid + 1;
  410. cKeys = cHalf;
  411. }
  412. else
  413. return iMid;
  414. }
  415. }
  416. else if ( cKeys > 1 )
  417. {
  418. CStemSet set( _pbKeys, _pDirectory[ iLo + 1 ].Offset() );
  419. if ( set.IsGreaterThan( _pDirectory[ iLo + 1 ].Entry(),
  420. pcKey ) )
  421. return iLo;
  422. return iLo + 1;
  423. }
  424. else
  425. return iLo;
  426. }
  427. while ( TRUE );
  428. return 0;
  429. }
  430. unsigned _cDirectory;
  431. unsigned _cbKeys;
  432. CDirectoryEntry * _pDirectory;
  433. BYTE * _pbKeys;
  434. };
  435. __inline CStem * MakeStemObject( HMODULE hMod )
  436. {
  437. // Get the path of the data file
  438. WCHAR awcPath[ MAX_PATH ];
  439. DWORD cwcCopied = GetModuleFileName( hMod,
  440. awcPath,
  441. ArraySize( awcPath ) );
  442. if ( 0 == cwcCopied )
  443. return 0;
  444. WCHAR *pwcSlash = wcsrchr( awcPath, '\\' );
  445. if ( 0 == pwcSlash )
  446. return 0;
  447. wcscpy( pwcSlash + 1, L"en-stem.dat" );
  448. // Open the data file
  449. FILE *fp = _wfopen( awcPath, L"rb" );
  450. if ( 0 == fp )
  451. return 0;
  452. // Check how big it is
  453. fseek( fp, 0, SEEK_END );
  454. unsigned cb = ftell( fp );
  455. fseek( fp, 0, SEEK_SET );
  456. // Read the directory count and the directory
  457. unsigned cDirectory;
  458. fread( &cDirectory, 1, sizeof( unsigned ), fp );
  459. CDirectoryEntry * aDir = new CDirectoryEntry[ cDirectory ];
  460. if ( 0 == aDir )
  461. {
  462. fclose( fp );
  463. return 0;
  464. }
  465. fread( aDir, cDirectory, sizeof( unsigned ), fp );
  466. // Read the key data
  467. unsigned cbKeys = cb - ( sizeof( unsigned ) * ( cDirectory + 1 ) );
  468. BYTE * pbKeys = new BYTE[ cbKeys ];
  469. if ( 0 == pbKeys )
  470. {
  471. delete [] aDir;
  472. fclose( fp );
  473. return 0;
  474. }
  475. fread( pbKeys, cbKeys, 1, fp );
  476. fclose( fp );
  477. // Make the stemmer object with the buffers
  478. CStem * pStem = new CStem( cDirectory, aDir, cbKeys, pbKeys );
  479. if ( 0 == pStem )
  480. {
  481. delete [] aDir;
  482. delete [] pbKeys;
  483. }
  484. return pStem;
  485. } //MakeStemObject