Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

399 lines
18 KiB

  1. /*******************************************************************************
  2. * morph.h *
  3. *---------*
  4. * Description:
  5. * This is the header file for the CSMorph implementation. This class
  6. * attempts to find pronunciations for morphological variants (which do not
  7. * occur in the lexicon) of root words (which do occur in the lexicon).
  8. *-------------------------------------------------------------------------------
  9. * Created By: AH Date: 08/16/99
  10. * Copyright (C) 1999 Microsoft Corporation
  11. * All Rights Reserved
  12. *******************************************************************************/
  13. #ifndef Morph_h
  14. #define Morph_h
  15. #ifndef __spttseng_h__
  16. #include "spttseng.h"
  17. #endif
  18. // Additional includes...
  19. #include "stdafx.h"
  20. #include "commonlx.h"
  //== CONSTANTS ================================================================

  // Capacity of the SUFFIXPRON_INFO::Conversions array, i.e. the largest number
  // of From/To part-of-speech pairs a single suffix entry can hold.
  #define MAX_POSCONVERSIONS      4

  // NOTE(review): presumably the number of part-of-speech categories handled by
  // the morphology code; nothing in this header uses it - confirm in the .cpp.
  #define NUM_POS                 5
  /*** SUFFIX_TYPE **************************************************************
  *   One enumerator for every suffix which can be matched and accounted for by
  *   the CSMorph class, plus NO_MATCH (-1).
  *
  *   The numeric values are written out explicitly.  The rows of
  *   g_SuffixInfoTable are listed in this same order, so the values appear to
  *   double as row indices into that table (NOTE(review): confirm in the .cpp);
  *   do not renumber or reorder without updating the table.
  *
  *   The declaration deliberately carries no storage-class or cv qualifier:
  *   "static const" in front of an enum declaration that declares no object has
  *   no meaning and is diagnosed by compilers.
  */
  enum SUFFIX_TYPE
  {
      NO_MATCH            = -1,

      S_SUFFIX            = 0,
      ED_SUFFIX           = 1,
      ING_SUFFIX          = 2,
      APOSTROPHES_SUFFIX  = 3,
      APOSTROPHE_SUFFIX   = 4,
      ER_SUFFIX           = 5,
      EST_SUFFIX          = 6,
      OR_SUFFIX           = 7,
      MENT_SUFFIX         = 8,
      AGE_SUFFIX          = 9,
      LESS_SUFFIX         = 10,
      Y_SUFFIX            = 11,
      EDLY_SUFFIX         = 12,
      LY_SUFFIX           = 13,
      ABLE_SUFFIX         = 14,
      NESS_SUFFIX         = 15,
      ISM_SUFFIX          = 16,
      IZE_SUFFIX          = 17,
      IZ_SUFFIX           = 18,
      HOOD_SUFFIX         = 19,
      FUL_SUFFIX          = 20,
      LIKE_SUFFIX         = 21,
      WISE_SUFFIX         = 22,
      ISH_SUFFIX          = 23,
      ABLY_SUFFIX         = 24,
      SHIP_SUFFIX         = 25,
      ICALLY_SUFFIX       = 26,
      SOME_SUFFIX         = 27,
      ILY_SUFFIX          = 28,
      ICISM_SUFFIX        = 29,
      ICIZE_SUFFIX        = 30
  };
  63. /* SUFFIX_INFO, g_SuffixTable[] ***********************************************
  64. * This table is used to map the orthographic forms of suffixes to their suffix
  65. * types. Each suffix is stored in reverse order for easier comparison with
  66. * the ends of strings...
  67. */
  68. struct SUFFIX_INFO
  69. {
  70. WCHAR Orth[10];
  71. SUFFIX_TYPE Type;
  72. };
  73. static const SUFFIX_INFO g_SuffixTable[] =
  74. {
  75. { L"RE", ER_SUFFIX },
  76. { L"TSE", EST_SUFFIX },
  77. { L"GNI", ING_SUFFIX },
  78. { L"ELBA", ABLE_SUFFIX },
  79. { L"ELBI", ABLE_SUFFIX },
  80. { L"YLDE", EDLY_SUFFIX },
  81. { L"YLBA", ABLY_SUFFIX },
  82. { L"YLBI", ABLY_SUFFIX },
  83. { L"YLLACI", ICALLY_SUFFIX },
  84. { L"YLI", ILY_SUFFIX },
  85. { L"YL", LY_SUFFIX },
  86. { L"Y", Y_SUFFIX },
  87. { L"TNEM", MENT_SUFFIX },
  88. { L"RO", OR_SUFFIX },
  89. { L"SSEN", NESS_SUFFIX },
  90. { L"SSEL", LESS_SUFFIX },
  91. { L"EZICI", ICIZE_SUFFIX },
  92. { L"EZI", IZE_SUFFIX },
  93. { L"ZI", IZ_SUFFIX },
  94. { L"MSICI", ICISM_SUFFIX },
  95. { L"MSI", ISM_SUFFIX },
  96. { L"DE", ED_SUFFIX },
  97. { L"S'", APOSTROPHES_SUFFIX },
  98. { L"S", S_SUFFIX },
  99. { L"'", APOSTROPHE_SUFFIX },
  100. { L"EGA", AGE_SUFFIX },
  101. { L"DOOH", HOOD_SUFFIX },
  102. { L"LUF", FUL_SUFFIX },
  103. { L"EKIL", LIKE_SUFFIX },
  104. { L"ESIW", WISE_SUFFIX },
  105. { L"HSI", ISH_SUFFIX },
  106. { L"PIHS", SHIP_SUFFIX },
  107. { L"EMOS", SOME_SUFFIX },
  108. };
  /*** PHONTYPE *****************************************************************
  *   Bit flags describing the features of a phone that are relevant here.  The
  *   flags occupy distinct bits and are combined per phone in g_PhonTable[].
  *
  *   The declaration carries no storage-class or cv qualifier: "static const"
  *   in front of an enum declaration that declares no object has no meaning and
  *   is diagnosed by compilers.
  */
  enum PHONTYPE
  {
      eCONSONANTF = (1<<0),   // phone is treated as a consonant
      eVOICEDF    = (1<<1),   // phone is voiced
      ePALATALF   = (1<<2)    // phone is palatal (set for CH, JH, SH, ZH in g_PhonTable)
  };
  119. /*** g_PhonTable[], g_PhonS, g_PhonZ *******************************************
  120. * This table is used to map the internal values of phones to their types, which
  121. * are just clusters of features relevant to the necessary phonological rules.
  122. * g_PhonS, g_PhonZ, g_PhonD, g_PhonT are just used to make the code a bit more
  123. * readable.
  124. */
  125. static const long g_PhonTable[] =
  126. {
  127. eCONSONANTF, // Default value - 0 is not a valid phone
  128. eCONSONANTF, // 1 is a syllable boundary - shouldn't ever occur at the end of a word
  129. eCONSONANTF, // 2 is an exclamation point - shouldn't ever occur at the end of a word
  130. eCONSONANTF, // 3 is a word boundary - treated as a consonant
  131. eCONSONANTF, // 4 is a comma - shouldn't ever occur at the end of a word
  132. eCONSONANTF, // 5 is a period - shouldn't ever occur at the end of a word
  133. eCONSONANTF, // 6 is a question mark - shouldn't ever occur at the end of a word
  134. eCONSONANTF, // 7 is a silence - shouldn't ever occur at the end of a word
  135. eVOICEDF, // 8 is primary stress - treat as a vowel since it should always be attached to a vowel nucleus
  136. eVOICEDF, // 9 is secondatry stress - see primary stress
  137. eVOICEDF, // 10 -> AA
  138. eVOICEDF, // 11 -> AE
  139. eVOICEDF, // 12 -> AH
  140. eVOICEDF, // 13 -> AO
  141. eVOICEDF, // 14 -> AW
  142. eVOICEDF, // 15 -> AX
  143. eVOICEDF, // 16 -> AY
  144. eCONSONANTF + eVOICEDF, // 17 -> b
  145. eCONSONANTF + ePALATALF, // 18 -> CH
  146. eCONSONANTF + eVOICEDF, // 19 -> d
  147. eCONSONANTF + eVOICEDF, // 20 -> DH
  148. eVOICEDF, // 21 -> EH
  149. eVOICEDF, // 22 -> ER
  150. eVOICEDF, // 23 -> EY
  151. eCONSONANTF, // 24 -> f
  152. eCONSONANTF + eVOICEDF, // 25 -> g
  153. eCONSONANTF, // 26 -> h
  154. eVOICEDF, // 27 -> IH
  155. eVOICEDF, // 28 -> IY
  156. eCONSONANTF + eVOICEDF + ePALATALF, // 29 -> JH
  157. eCONSONANTF, // 30 -> k
  158. eCONSONANTF + eVOICEDF, // 31 -> l
  159. eCONSONANTF + eVOICEDF, // 32 -> m
  160. eCONSONANTF + eVOICEDF, // 33 -> n
  161. eCONSONANTF + eVOICEDF, // 34 -> NG
  162. eVOICEDF, // 35 -> OW
  163. eVOICEDF, // 36 -> OY
  164. eCONSONANTF, // 37 -> p
  165. eCONSONANTF + eVOICEDF, // 38 -> r
  166. eCONSONANTF, // 39 -> s
  167. eCONSONANTF + ePALATALF, // 40 -> SH
  168. eCONSONANTF, // 41 -> t
  169. eCONSONANTF, // 42 -> TH
  170. eVOICEDF, // 43 -> UH
  171. eVOICEDF, // 44 -> UW
  172. eCONSONANTF + eVOICEDF, // 45 -> v
  173. eCONSONANTF + eVOICEDF, // 46 -> w
  174. eCONSONANTF + eVOICEDF, // 47 -> y
  175. eCONSONANTF + eVOICEDF, // 48 -> z
  176. eCONSONANTF + eVOICEDF + ePALATALF, // 49 -> ZH
  177. };
  178. static WCHAR g_phonAXl[] = L" AX l";
  179. static WCHAR g_phonAXz[] = L" AX z";
  180. static WCHAR g_phonS[] = L" s";
  181. static WCHAR g_phonZ[] = L" z";
  182. static WCHAR g_phonD[] = L" d";
  183. static WCHAR g_phonAXd[] = L" AX d";
  184. static WCHAR g_phonT[] = L" t";
  185. static WCHAR g_phonIY[] = L" IY";
  186. static WCHAR g_phonL[] = L" l";
  187. /*** struct POS_CONVERT *******************************************************
  188. * This struct stores the From and To parts of speech for a suffix...
  189. */
  190. struct POS_CONVERT
  191. {
  192. ENGPARTOFSPEECH FromPos;
  193. ENGPARTOFSPEECH ToPos;
  194. };
  /*** MorphSpecialCaseFlags ****************************************************
  *   Bit masks that let DoSuffixMorph be nearly completely table driven.  Every
  *   row of g_SuffixInfoTable stores a combination of these masks, telling
  *   DoSuffixMorph which special case functions (check for missing E, etc.) have
  *   to be called when the initial lex lookup fails.
  */
  typedef enum MorphSpecialCaseFlags
  {
      eCheckForMissingE       = 0x01L,
      eCheckYtoIMutation      = 0x02L,
      eCheckDoubledMutation   = 0x04L,
      eCheckForMissingY       = 0x08L,
      eCheckForMissingL       = 0x10L,
  } MorphSpecialCaseFlags;
  209. /*** struct SUFFIXPRON_INFO ***************************************************
  210. * This struct stores the pronunciation of a suffix, as well as the POS
  211. * categories it takes as input and output.
  212. */
  213. struct SUFFIXPRON_INFO
  214. {
  215. WCHAR SuffixString[SP_MAX_PRON_LENGTH];
  216. POS_CONVERT Conversions[MAX_POSCONVERSIONS];
  217. short NumConversions;
  218. DWORD dwMorphSpecialCaseFlags;
  219. };
  /*** bool SuffixInfoTableInitialized *******************************************
  *   Tells a thread whether it is the first one to use g_SuffixInfoTable, and
  *   therefore whether it still has to initialize that table.  Starts out false.
  *
  *   NOTE(review): being "static" in a header, every translation unit that
  *   includes this file gets its own copy of the flag - confirm that only one
  *   .cpp relies on it.
  */
  static bool SuffixInfoTableInitialized = false;
  225. /*** SUFFIXPRON_INFO g_SuffixInfoTable *****************************************
  226. * This table drives the DoSuffixMorph function, by storing the pronunciation,
  227. * conversions, number of conversions, and special case flags for each suffix...
  228. */
  229. static SUFFIXPRON_INFO g_SuffixInfoTable [] =
  230. {
  231. /********************************************************************************************************/
  232. /* Pronunciation * Conversions * NumConversions * Special Case Flags * SuffixType */
  233. /********************************************************************************************************/
  234. { L" s", { {MS_Verb, MS_Verb},
  235. {MS_Noun, MS_Noun} }, 2, 0 }, // S_SUFFIX
  236. { L" d", { {MS_Verb, MS_Verb},
  237. {MS_Verb, MS_Adj} }, 2, eCheckForMissingE +
  238. eCheckYtoIMutation +
  239. eCheckDoubledMutation }, // ED_SUFFIX
  240. { L" IH NG", { {MS_Verb, MS_Verb},
  241. {MS_Verb, MS_Adj},
  242. {MS_Verb, MS_Noun} }, 3, eCheckForMissingE +
  243. eCheckDoubledMutation }, // ING_SUFFIX
  244. { L" s", { {MS_Noun, MS_Noun} }, 1, 0 }, // APOSTROPHES_SUFFIX
  245. { L" s", { {MS_Noun, MS_Noun} }, 1, 0 }, // APOSTROPHE_SUFFIX
  246. { L" ER", { {MS_Verb, MS_Noun},
  247. {MS_Adj, MS_Adj},
  248. {MS_Adv, MS_Adv},
  249. {MS_Adj, MS_Adv} }, 4, eCheckForMissingE +
  250. eCheckYtoIMutation +
  251. eCheckDoubledMutation }, // ER_SUFFIX
  252. { L" AX s t", { {MS_Adj, MS_Adj},
  253. {MS_Adv, MS_Adv},
  254. {MS_Adj, MS_Adv} }, 3, eCheckForMissingE +
  255. eCheckYtoIMutation +
  256. eCheckDoubledMutation }, // EST_SUFFIX
  257. { L" ER", { {MS_Verb, MS_Noun} }, 1, eCheckForMissingE +
  258. eCheckDoubledMutation }, // OR_SUFFIX
  259. { L" m AX n t", { {MS_Verb, MS_Noun} }, 1, eCheckYtoIMutation }, // MENT_SUFFIX
  260. { L" IH JH", { {MS_Verb, MS_Noun} }, 1, eCheckForMissingE +
  261. eCheckDoubledMutation }, // AGE_SUFFIX
  262. { L" l IH s", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // LESS_SUFFIX
  263. { L" IY", { {MS_Noun, MS_Adj},
  264. {MS_Adj, MS_Adv} }, 2, eCheckForMissingE +
  265. eCheckDoubledMutation }, // Y_SUFFIX
  266. { L" AX d l IY", { {MS_Verb, MS_Adj},
  267. {MS_Verb, MS_Adv} }, 2, eCheckForMissingE +
  268. eCheckYtoIMutation +
  269. eCheckDoubledMutation }, // EDLY_SUFFIX
  270. { L" l IY", { {MS_Noun, MS_Adj},
  271. {MS_Adj, MS_Adv} }, 2, eCheckForMissingL }, // LY_XUFFIX
  272. { L" AX - b AX l", { {MS_Verb, MS_Adj},
  273. {MS_Noun, MS_Adj} }, 2, eCheckForMissingE +
  274. eCheckYtoIMutation +
  275. eCheckDoubledMutation }, // ABLE_SUFFIX
  276. { L" n IH s", { {MS_Adj, MS_Noun} }, 1, eCheckYtoIMutation }, // NESS_SUFFIX
  277. { L" IH z AX m", { {MS_Adj, MS_Noun},
  278. {MS_Noun, MS_Noun} }, 2, eCheckForMissingE }, // ISM_SUFFIX
  279. { L" AY z", { {MS_Noun, MS_Verb},
  280. {MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // IZE_SUFFIX
  281. { L" AY z", { {MS_Noun, MS_Verb},
  282. {MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // IZ_SUFFIX
  283. { L" h UH d", { {MS_Noun, MS_Noun} }, 1, 0 }, // HOOD_SUFFIX
  284. { L" f AX l", { {MS_Noun, MS_Adj},
  285. {MS_Verb, MS_Adj} }, 2, 0 } , // FUL_SUFFIX
  286. { L" l AY k", { {MS_Noun, MS_Adj} }, 1, 0 }, // LIKE_SUFFIX
  287. { L" w AY z", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // WISE_SUFFIX
  288. { L" IH SH", { {MS_Noun, MS_Adj} }, 1, eCheckForMissingE +
  289. eCheckDoubledMutation }, // ISH_SUFFIX
  290. { L" AX - b l IY", { {MS_Verb, MS_Adv},
  291. {MS_Noun, MS_Adv} }, 2, eCheckForMissingE +
  292. eCheckYtoIMutation +
  293. eCheckDoubledMutation }, // ABLY_SUFFIX
  294. { L" SH IH 2 p", { {MS_Noun, MS_Noun} }, 1, 0 }, // SHIP_SUFFIX
  295. { L" L IY", { {MS_Adj, MS_Adv} }, 1, 0 }, // ICALLY_SUFFIX
  296. { L" S AX M", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // SOME_SUFFIX
  297. { L" AX L IY", { {MS_Noun, MS_Adv} }, 1, eCheckDoubledMutation +
  298. eCheckForMissingY }, // ILY_SUFFIX
  299. { L" IH z AX m", { {MS_Adj, MS_Noun},
  300. {MS_Noun, MS_Noun} }, 2, eCheckForMissingE }, // ICISM_SUFFIX
  301. { L" AY z", { {MS_Noun, MS_Verb},
  302. {MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // ICIZE_SUFFIX
  303. };
  304. /*** CSuffixList **************************************************************
  305. * This typedef just makes the code a little easier to read. A CSuffixList is
  306. * used to keep track of each of the suffixes which has been stripped from a
  307. * word, so that their pronunciations can be concatenated with that of the root.
  308. */
  309. typedef CSPList<SUFFIXPRON_INFO*, SUFFIXPRON_INFO*> CSuffixList;
  310. /*** CComAutoCriticalSection g_SuffixInfoTableCritSec *************************
  311. * This critical section is used to make sure the SuffixInfoTable only gets
  312. * initialized once.
  313. */
  314. static CComAutoCriticalSection g_SuffixInfoTableCritSec;
  315. /*** CSMorph ******************************************************************
  316. * This is the definition of the CSMorph class.
  317. */
  318. class CSMorph
  319. {
  320. public:
  321. /*=== PUBLIC METHODS =====*/
  322. CSMorph( ISpLexicon *pMasterLex=0, HRESULT *hr=0 );
  323. /*=== INTERFACE METHOD =====*/
  324. HRESULT DoSuffixMorph( const WCHAR *pwWord, WCHAR *pwRoot, LANGID LangID, DWORD dwFlags,
  325. SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  326. private:
  327. /*=== PRIVATE METHODS =====*/
  328. SUFFIX_TYPE MatchSuffix( WCHAR *TargWord, long *RootLen );
  329. HRESULT LexLookup( const WCHAR *pOrth, long length, DWORD dwFlags,
  330. SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  331. HRESULT LTSLookup( const WCHAR *pOrth, long length,
  332. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  333. HRESULT AccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  334. HRESULT AccumulateSuffixes_LTS( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  335. HRESULT DefaultAccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  336. HRESULT CheckForMissingE( WCHAR *pOrth, long length, DWORD dwFlags,
  337. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  338. HRESULT CheckForMissingY( WCHAR *pOrth, long length, DWORD dwFlags,
  339. SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  340. HRESULT CheckForMissingL( WCHAR *pOrth, long length, DWORD dwFlags,
  341. SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
  342. HRESULT CheckYtoIMutation( WCHAR *pOrth, long length, DWORD dwFlags,
  343. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  344. HRESULT CheckDoubledMutation( WCHAR *pOrth, long length, DWORD dwFlags,
  345. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  346. HRESULT CheckYtoIEMutation( WCHAR *pOrth, long length, DWORD dwFlags,
  347. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  348. HRESULT CheckAbleMutation( WCHAR *pOrth, long length, DWORD dwFlags,
  349. SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
  350. HRESULT Phon_SorZ( WCHAR *pPronunciation, long length );
  351. HRESULT Phon_DorED( WCHAR *pPronunciation, long length );
  352. /*=== MEMBER DATA =====*/
  353. // Pointer to the Master Lexicon...
  354. ISpLexicon *m_pMasterLex;
  355. };
  356. inline BOOL SearchPosSet( ENGPARTOFSPEECH Pos, const ENGPARTOFSPEECH *Set, ULONG Count )
  357. {
  358. for( ULONG i = 0; i < Count; ++i )
  359. {
  360. if( Pos == Set[i] )
  361. {
  362. return true;
  363. }
  364. }
  365. return false;
  366. }
  367. #endif //--- End of File -------------------------------------------------------------