|
|
/*******************************************************************************
* morph.h * *---------* * Description: * This is the header file for the CSMorph implementation. This class * attempts to find pronunciations for morphological variants (which do not * occur in the lexicon) of root words (which do occur in the lexicon). *------------------------------------------------------------------------------- * Created By: AH Date: 08/16/99 * Copyright (C) 1999 Microsoft Corporation * All Rights Reserved *******************************************************************************/ #ifndef Morph_h
#define Morph_h
#ifndef __spttseng_h__
#include "spttseng.h"
#endif
// Additional includes...
#include "stdafx.h"
#include "commonlx.h"
//== CONSTANTS ================================================================
// Capacity of the SUFFIXPRON_INFO::Conversions array, i.e. the largest number
// of part-of-speech conversions a single suffix entry can list.
#define MAX_POSCONVERSIONS 4
// NOTE(review): not referenced in the visible part of this header; presumably
// the number of part-of-speech categories the morphology code distinguishes --
// confirm against the implementation file.
#define NUM_POS 5
/*** SUFFIX_TYPE **************************************************************
* This enumeration contains values for all of the suffixes which can be matched
* and accounted for by the CSMorph class.
*
* The non-negative enumerators are consecutive, starting at zero, and the rows
* of g_SuffixInfoTable are labelled in this same order, so a new suffix must be
* added to both places at the same position.  NO_MATCH (-1) is the only value
* that does not name a suffix.
*
* Note: the definition deliberately carries no storage-class specifier or
* cv-qualifier.  "static const enum X { ... };" declares no object, so the
* specifiers are meaningless; conforming C++ compilers reject them outright.
*/
enum SUFFIX_TYPE
{
    S_SUFFIX = 0,
    ED_SUFFIX,
    ING_SUFFIX,
    APOSTROPHES_SUFFIX,
    APOSTROPHE_SUFFIX,
    ER_SUFFIX,
    EST_SUFFIX,
    OR_SUFFIX,
    MENT_SUFFIX,
    AGE_SUFFIX,
    LESS_SUFFIX,
    Y_SUFFIX,
    EDLY_SUFFIX,
    LY_SUFFIX,
    ABLE_SUFFIX,
    NESS_SUFFIX,
    ISM_SUFFIX,
    IZE_SUFFIX,
    IZ_SUFFIX,
    HOOD_SUFFIX,
    FUL_SUFFIX,
    LIKE_SUFFIX,
    WISE_SUFFIX,
    ISH_SUFFIX,
    ABLY_SUFFIX,
    SHIP_SUFFIX,
    ICALLY_SUFFIX,
    SOME_SUFFIX,
    ILY_SUFFIX,
    ICISM_SUFFIX,
    ICIZE_SUFFIX,
    NO_MATCH = -1
};
/* SUFFIX_INFO, g_SuffixTable[] ***********************************************
* g_SuffixTable maps the orthographic forms of suffixes to their suffix types.
* Each suffix is stored in reverse order for easier comparison with the ends
* of strings...
*
* SUFFIX_INFO is the row type of that table:
*   Orth - the suffix spelling, reversed and NUL-terminated (at most 9
*          characters plus the terminator).
*   Type - the SUFFIX_TYPE the spelling stands for.
*/
struct SUFFIX_INFO
{
    WCHAR       Orth[10];
    SUFFIX_TYPE Type;
};
// Reversed suffix spellings and the suffix type each one denotes.  The
// trailing comment on each row gives the spelling in reading order.
// NOTE(review): longer spellings are listed ahead of the shorter spellings
// they end with (e.g. "YLI" before "YL" before "Y", "S'" before "S");
// presumably the table is scanned front to back and the first hit wins, so
// keep this ordering when adding rows -- confirm against MatchSuffix.
static const SUFFIX_INFO g_SuffixTable[] =
{
    { L"RE",     ER_SUFFIX          },  // -er
    { L"TSE",    EST_SUFFIX         },  // -est
    { L"GNI",    ING_SUFFIX         },  // -ing
    { L"ELBA",   ABLE_SUFFIX        },  // -able
    { L"ELBI",   ABLE_SUFFIX        },  // -ible
    { L"YLDE",   EDLY_SUFFIX        },  // -edly
    { L"YLBA",   ABLY_SUFFIX        },  // -ably
    { L"YLBI",   ABLY_SUFFIX        },  // -ibly
    { L"YLLACI", ICALLY_SUFFIX      },  // -ically
    { L"YLI",    ILY_SUFFIX         },  // -ily
    { L"YL",     LY_SUFFIX          },  // -ly
    { L"Y",      Y_SUFFIX           },  // -y
    { L"TNEM",   MENT_SUFFIX        },  // -ment
    { L"RO",     OR_SUFFIX          },  // -or
    { L"SSEN",   NESS_SUFFIX        },  // -ness
    { L"SSEL",   LESS_SUFFIX        },  // -less
    { L"EZICI",  ICIZE_SUFFIX       },  // -icize
    { L"EZI",    IZE_SUFFIX         },  // -ize
    { L"ZI",     IZ_SUFFIX          },  // -iz
    { L"MSICI",  ICISM_SUFFIX       },  // -icism
    { L"MSI",    ISM_SUFFIX         },  // -ism
    { L"DE",     ED_SUFFIX          },  // -ed
    { L"S'",     APOSTROPHES_SUFFIX },  // -'s
    { L"S",      S_SUFFIX           },  // -s
    { L"'",      APOSTROPHE_SUFFIX  },  // -'
    { L"EGA",    AGE_SUFFIX         },  // -age
    { L"DOOH",   HOOD_SUFFIX        },  // -hood
    { L"LUF",    FUL_SUFFIX         },  // -ful
    { L"EKIL",   LIKE_SUFFIX        },  // -like
    { L"ESIW",   WISE_SUFFIX        },  // -wise
    { L"HSI",    ISH_SUFFIX         },  // -ish
    { L"PIHS",   SHIP_SUFFIX        },  // -ship
    { L"EMOS",   SOME_SUFFIX        },  // -some
};
/*** PHONTYPE *****************************************************************
* This enumeration creates flags which can be used to determine the relevant
* features of each phone.  Each enumerator is a distinct single bit, so the
* flags can be combined and tested with bitwise operators.
*
* Note: the definition deliberately carries no storage-class specifier or
* cv-qualifier.  "static const enum X { ... };" declares no object, so the
* specifiers are meaningless; conforming C++ compilers reject them outright.
*/
enum PHONTYPE
{
    eCONSONANTF = (1<<0),   // phone is a consonant
    eVOICEDF    = (1<<1),   // phone is voiced
    ePALATALF   = (1<<2)    // phone is palatal
};
/*** g_PhonTable[], g_phonS, g_phonZ *******************************************
* This table is used to map the internal values of phones to their types, which
* are just clusters of features relevant to the necessary phonological rules.
* The table is indexed by phone value; the comment on each row gives the value
* and the phone it stands for.  Rows without eCONSONANTF are treated as vowels.
* g_phonS, g_phonZ, g_phonD, g_phonT (defined after the table) are just used to
* make the code a bit more readable.
*/
static const long g_PhonTable[] =
{
    eCONSONANTF,                            // Default value - 0 is not a valid phone
    eCONSONANTF,                            // 1 is a syllable boundary - shouldn't ever occur at the end of a word
    eCONSONANTF,                            // 2 is an exclamation point - shouldn't ever occur at the end of a word
    eCONSONANTF,                            // 3 is a word boundary - treated as a consonant
    eCONSONANTF,                            // 4 is a comma - shouldn't ever occur at the end of a word
    eCONSONANTF,                            // 5 is a period - shouldn't ever occur at the end of a word
    eCONSONANTF,                            // 6 is a question mark - shouldn't ever occur at the end of a word
    eCONSONANTF,                            // 7 is a silence - shouldn't ever occur at the end of a word
    eVOICEDF,                               // 8 is primary stress - treat as a vowel since it should always be attached to a vowel nucleus
    eVOICEDF,                               // 9 is secondary stress - see primary stress
    eVOICEDF,                               // 10 -> AA
    eVOICEDF,                               // 11 -> AE
    eVOICEDF,                               // 12 -> AH
    eVOICEDF,                               // 13 -> AO
    eVOICEDF,                               // 14 -> AW
    eVOICEDF,                               // 15 -> AX
    eVOICEDF,                               // 16 -> AY
    eCONSONANTF | eVOICEDF,                 // 17 -> b
    eCONSONANTF | ePALATALF,                // 18 -> CH
    eCONSONANTF | eVOICEDF,                 // 19 -> d
    eCONSONANTF | eVOICEDF,                 // 20 -> DH
    eVOICEDF,                               // 21 -> EH
    eVOICEDF,                               // 22 -> ER
    eVOICEDF,                               // 23 -> EY
    eCONSONANTF,                            // 24 -> f
    eCONSONANTF | eVOICEDF,                 // 25 -> g
    eCONSONANTF,                            // 26 -> h
    eVOICEDF,                               // 27 -> IH
    eVOICEDF,                               // 28 -> IY
    eCONSONANTF | eVOICEDF | ePALATALF,     // 29 -> JH
    eCONSONANTF,                            // 30 -> k
    eCONSONANTF | eVOICEDF,                 // 31 -> l
    eCONSONANTF | eVOICEDF,                 // 32 -> m
    eCONSONANTF | eVOICEDF,                 // 33 -> n
    eCONSONANTF | eVOICEDF,                 // 34 -> NG
    eVOICEDF,                               // 35 -> OW
    eVOICEDF,                               // 36 -> OY
    eCONSONANTF,                            // 37 -> p
    eCONSONANTF | eVOICEDF,                 // 38 -> r
    eCONSONANTF,                            // 39 -> s
    eCONSONANTF | ePALATALF,                // 40 -> SH
    eCONSONANTF,                            // 41 -> t
    eCONSONANTF,                            // 42 -> TH
    eVOICEDF,                               // 43 -> UH
    eVOICEDF,                               // 44 -> UW
    eCONSONANTF | eVOICEDF,                 // 45 -> v
    eCONSONANTF | eVOICEDF,                 // 46 -> w
    eCONSONANTF | eVOICEDF,                 // 47 -> y
    eCONSONANTF | eVOICEDF,                 // 48 -> z
    eCONSONANTF | eVOICEDF | ePALATALF,     // 49 -> ZH
};
// Named phone strings, so the code does not have to spell out literals.
// Every string starts with a single space followed by space-separated phone
// symbols.
// NOTE(review): presumably the leading space lets these be appended directly
// to an existing pronunciation string -- confirm against the callers.
static WCHAR g_phonAXl[] = L" AX l";
static WCHAR g_phonAXz[] = L" AX z";
static WCHAR g_phonS[]   = L" s";
static WCHAR g_phonZ[]   = L" z";
static WCHAR g_phonD[]   = L" d";
static WCHAR g_phonAXd[] = L" AX d";
static WCHAR g_phonT[]   = L" t";
static WCHAR g_phonIY[]  = L" IY";
static WCHAR g_phonL[]   = L" l";
/*** struct POS_CONVERT *******************************************************
* This struct stores the From and To parts of speech for a suffix...
*   FromPos - part of speech the suffix takes as input.
*   ToPos   - part of speech the suffix produces as output.
*/
struct POS_CONVERT
{
    ENGPARTOFSPEECH FromPos;
    ENGPARTOFSPEECH ToPos;
};
/*** MorphSpecialCaseFlags ****************************************************
* This enum allows DoSuffixMorph to be nearly completely table driven.  Each
* suffix has a MorphSpecialCaseFlags entry in the SuffixInfoTable which tells
* DoSuffixMorph which special case functions (check for missing E, etc.) need
* to be called if the initial lex lookup fails.
*
* Every enumerator is a distinct single bit, so several checks can be combined
* in one DWORD.
*/
typedef enum MorphSpecialCaseFlags
{
    eCheckForMissingE     = 0x01,
    eCheckYtoIMutation    = 0x02,
    eCheckDoubledMutation = 0x04,
    eCheckForMissingY     = 0x08,
    eCheckForMissingL     = 0x10,
} MorphSpecialCaseFlags;
/*** struct SUFFIXPRON_INFO ***************************************************
* This struct stores the pronunciation of a suffix, as well as the POS
* categories it takes as input and output.
*   SuffixString            - pronunciation of the suffix.
*   Conversions             - From/To part-of-speech pairs; room for
*                             MAX_POSCONVERSIONS of them.
*   NumConversions          - number of entries of Conversions that are filled.
*   dwMorphSpecialCaseFlags - combination of MorphSpecialCaseFlags bits.
*/
struct SUFFIXPRON_INFO
{
    WCHAR       SuffixString[SP_MAX_PRON_LENGTH];
    POS_CONVERT Conversions[MAX_POSCONVERSIONS];
    short       NumConversions;
    DWORD       dwMorphSpecialCaseFlags;
};
/*** bool SuffixInfoTableInitialized *******************************************
* This bool just lets threads know whether they are the first to use the
* following table, and thus whether they need to initialize it or not.
* Starts out false.  Being a static defined in a header, every source file
* that includes this header gets its own copy.
*/
static bool SuffixInfoTableInitialized = false;
/*** SUFFIXPRON_INFO g_SuffixInfoTable *****************************************
* This table drives the DoSuffixMorph function, by storing the pronunciation,
* conversions, number of conversions, and special case flags for each suffix...
*
* Rows are labelled with their suffix type and appear in the same order as the
* SUFFIX_TYPE enumerators (S_SUFFIX first, ICIZE_SUFFIX last); keep the two in
* step when adding a suffix.
*
* NOTE(review): the table is not const; presumably it is rewritten in place by
* the one-time initialization guarded by SuffixInfoTableInitialized -- confirm.
* NOTE(review): the ICALLY, SOME and ILY rows spell consonants in upper case
* (" L IY", " S AX M", " AX L IY") while every other row uses lower case
* consonants; confirm the phone lookup is case-insensitive.
*/
static SUFFIXPRON_INFO g_SuffixInfoTable [] =
{
    /* Each row: { Pronunciation, { Conversions }, NumConversions, Special Case Flags },  // SuffixType */

    // S_SUFFIX
    { L" s",          { {MS_Verb, MS_Verb}, {MS_Noun, MS_Noun} },                                         2, 0 },
    // ED_SUFFIX
    { L" d",          { {MS_Verb, MS_Verb}, {MS_Verb, MS_Adj} },                                          2, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // ING_SUFFIX
    { L" IH NG",      { {MS_Verb, MS_Verb}, {MS_Verb, MS_Adj}, {MS_Verb, MS_Noun} },                      3, eCheckForMissingE | eCheckDoubledMutation },
    // APOSTROPHES_SUFFIX
    { L" s",          { {MS_Noun, MS_Noun} },                                                             1, 0 },
    // APOSTROPHE_SUFFIX
    { L" s",          { {MS_Noun, MS_Noun} },                                                             1, 0 },
    // ER_SUFFIX
    { L" ER",         { {MS_Verb, MS_Noun}, {MS_Adj, MS_Adj}, {MS_Adv, MS_Adv}, {MS_Adj, MS_Adv} },       4, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // EST_SUFFIX
    { L" AX s t",     { {MS_Adj, MS_Adj}, {MS_Adv, MS_Adv}, {MS_Adj, MS_Adv} },                           3, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // OR_SUFFIX
    { L" ER",         { {MS_Verb, MS_Noun} },                                                             1, eCheckForMissingE | eCheckDoubledMutation },
    // MENT_SUFFIX
    { L" m AX n t",   { {MS_Verb, MS_Noun} },                                                             1, eCheckYtoIMutation },
    // AGE_SUFFIX
    { L" IH JH",      { {MS_Verb, MS_Noun} },                                                             1, eCheckForMissingE | eCheckDoubledMutation },
    // LESS_SUFFIX
    { L" l IH s",     { {MS_Noun, MS_Adj} },                                                              1, eCheckYtoIMutation },
    // Y_SUFFIX
    { L" IY",         { {MS_Noun, MS_Adj}, {MS_Adj, MS_Adv} },                                            2, eCheckForMissingE | eCheckDoubledMutation },
    // EDLY_SUFFIX
    { L" AX d l IY",  { {MS_Verb, MS_Adj}, {MS_Verb, MS_Adv} },                                           2, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // LY_SUFFIX
    { L" l IY",       { {MS_Noun, MS_Adj}, {MS_Adj, MS_Adv} },                                            2, eCheckForMissingL },
    // ABLE_SUFFIX
    { L" AX - b AX l", { {MS_Verb, MS_Adj}, {MS_Noun, MS_Adj} },                                          2, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // NESS_SUFFIX
    { L" n IH s",     { {MS_Adj, MS_Noun} },                                                              1, eCheckYtoIMutation },
    // ISM_SUFFIX
    { L" IH z AX m",  { {MS_Adj, MS_Noun}, {MS_Noun, MS_Noun} },                                          2, eCheckForMissingE },
    // IZE_SUFFIX
    { L" AY z",       { {MS_Noun, MS_Verb}, {MS_Adj, MS_Verb} },                                          2, eCheckForMissingE },
    // IZ_SUFFIX
    { L" AY z",       { {MS_Noun, MS_Verb}, {MS_Adj, MS_Verb} },                                          2, eCheckForMissingE },
    // HOOD_SUFFIX
    { L" h UH d",     { {MS_Noun, MS_Noun} },                                                             1, 0 },
    // FUL_SUFFIX
    { L" f AX l",     { {MS_Noun, MS_Adj}, {MS_Verb, MS_Adj} },                                           2, 0 },
    // LIKE_SUFFIX
    { L" l AY k",     { {MS_Noun, MS_Adj} },                                                              1, 0 },
    // WISE_SUFFIX
    { L" w AY z",     { {MS_Noun, MS_Adj} },                                                              1, eCheckYtoIMutation },
    // ISH_SUFFIX
    { L" IH SH",      { {MS_Noun, MS_Adj} },                                                              1, eCheckForMissingE | eCheckDoubledMutation },
    // ABLY_SUFFIX
    { L" AX - b l IY", { {MS_Verb, MS_Adv}, {MS_Noun, MS_Adv} },                                          2, eCheckForMissingE | eCheckYtoIMutation | eCheckDoubledMutation },
    // SHIP_SUFFIX
    { L" SH IH 2 p",  { {MS_Noun, MS_Noun} },                                                             1, 0 },
    // ICALLY_SUFFIX
    { L" L IY",       { {MS_Adj, MS_Adv} },                                                               1, 0 },
    // SOME_SUFFIX
    { L" S AX M",     { {MS_Noun, MS_Adj} },                                                              1, eCheckYtoIMutation },
    // ILY_SUFFIX
    { L" AX L IY",    { {MS_Noun, MS_Adv} },                                                              1, eCheckDoubledMutation | eCheckForMissingY },
    // ICISM_SUFFIX
    { L" IH z AX m",  { {MS_Adj, MS_Noun}, {MS_Noun, MS_Noun} },                                          2, eCheckForMissingE },
    // ICIZE_SUFFIX
    { L" AY z",       { {MS_Noun, MS_Verb}, {MS_Adj, MS_Verb} },                                          2, eCheckForMissingE },
};
/*** CSuffixList **************************************************************
* This typedef just makes the code a little easier to read.  A CSuffixList is
* used to keep track of each of the suffixes which has been stripped from a
* word, so that their pronunciations can be concatenated with that of the root.
* The list holds pointers to SUFFIXPRON_INFO entries, not copies of them.
*/
typedef CSPList<SUFFIXPRON_INFO*, SUFFIXPRON_INFO*> CSuffixList;
/*** CComAutoCriticalSection g_SuffixInfoTableCritSec *************************
* This critical section is used to make sure the SuffixInfoTable only gets
* initialized once.  Being a static defined in a header, every source file
* that includes this header gets its own instance.
*/
static CComAutoCriticalSection g_SuffixInfoTableCritSec;
/*** CSMorph ******************************************************************
* This is the definition of the CSMorph class. */ class CSMorph { public:
/*=== PUBLIC METHODS =====*/ CSMorph( ISpLexicon *pMasterLex=0, HRESULT *hr=0 );
/*=== INTERFACE METHOD =====*/ HRESULT DoSuffixMorph( const WCHAR *pwWord, WCHAR *pwRoot, LANGID LangID, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
private:
/*=== PRIVATE METHODS =====*/ SUFFIX_TYPE MatchSuffix( WCHAR *TargWord, long *RootLen ); HRESULT LexLookup( const WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList ); HRESULT LTSLookup( const WCHAR *pOrth, long length, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT AccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList ); HRESULT AccumulateSuffixes_LTS( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList ); HRESULT DefaultAccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT CheckForMissingE( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT CheckForMissingY( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList ); HRESULT CheckForMissingL( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList ); HRESULT CheckYtoIMutation( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT CheckDoubledMutation( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT CheckYtoIEMutation( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT CheckAbleMutation( WCHAR *pOrth, long length, DWORD dwFlags, SPWORDPRONUNCIATIONLIST *pWordPronunciationList); HRESULT Phon_SorZ( WCHAR *pPronunciation, long length ); HRESULT Phon_DorED( WCHAR *pPronunciation, long length );
/*=== MEMBER DATA =====*/
// Pointer to the Master Lexicon...
ISpLexicon *m_pMasterLex; };
/*** SearchPosSet *************************************************************
* Reports whether Pos occurs among the first Count entries of Set.
*
*   Pos   - part of speech to look for.
*   Set   - array of parts of speech to search; not accessed when Count is 0.
*   Count - number of entries of Set to examine.
*
* Returns true if some examined entry equals Pos, false otherwise (including
* when Count is 0).  Entries are examined front to back and the scan stops at
* the first match.
*/
inline BOOL SearchPosSet( ENGPARTOFSPEECH Pos, const ENGPARTOFSPEECH *Set, ULONG Count )
{
    BOOL fFound = false;
    ULONG ulIndex = 0;

    while( !fFound && ulIndex < Count )
    {
        fFound = ( Set[ulIndex] == Pos );
        ++ulIndex;
    }

    return fFound;
}
#endif //--- End of File -------------------------------------------------------------
|