Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

399 lines
18 KiB

/*******************************************************************************
* morph.h *
*---------*
* Description:
* This is the header file for the CSMorph implementation. This class
* attempts to find pronunciations for morphological variants (which do not
* occur in the lexicon) of root words (which do occur in the lexicon).
*-------------------------------------------------------------------------------
* Created By: AH Date: 08/16/99
* Copyright (C) 1999 Microsoft Corporation
* All Rights Reserved
*******************************************************************************/
#ifndef Morph_h
#define Morph_h
#ifndef __spttseng_h__
#include "spttseng.h"
#endif
// Additional includes...
#include "stdafx.h"
#include "commonlx.h"
//== CONSTANTS ================================================================
// Capacity of the SUFFIXPRON_INFO::Conversions array, i.e. the largest number
// of part-of-speech conversions a single suffix entry can list.
#define MAX_POSCONVERSIONS 4
// NOTE(review): not referenced anywhere in this header; presumably the number
// of part-of-speech categories the morphology code distinguishes - confirm
// against the .cpp before changing.
#define NUM_POS 5
/*** SUFFIX_TYPE **************************************************************
* This enumeration contains values for all of the suffixes which can be matched
* and accounted for by the CSMorph class.
*
* The enumerators S_SUFFIX..ICIZE_SUFFIX are numbered consecutively from 0 and
* appear in the same order as the rows of g_SuffixInfoTable (see the trailing
* "// xxx_SUFFIX" comment on each row there), so any suffix added here needs a
* matching row at the same position in that table.
*
* NO_MATCH (-1) is the "no suffix recognized" sentinel and is deliberately
* outside the 0-based range.
*
* Note: this is a plain type declaration. The former "static const" prefix
* declared no object, was ignored by the compiler (MSVC warning C4091) and is
* rejected by conforming compilers, so it has been dropped.
*/
enum SUFFIX_TYPE
{
    S_SUFFIX = 0,
    ED_SUFFIX,
    ING_SUFFIX,
    APOSTROPHES_SUFFIX,
    APOSTROPHE_SUFFIX,
    ER_SUFFIX,
    EST_SUFFIX,
    OR_SUFFIX,
    MENT_SUFFIX,
    AGE_SUFFIX,
    LESS_SUFFIX,
    Y_SUFFIX,
    EDLY_SUFFIX,
    LY_SUFFIX,
    ABLE_SUFFIX,
    NESS_SUFFIX,
    ISM_SUFFIX,
    IZE_SUFFIX,
    IZ_SUFFIX,
    HOOD_SUFFIX,
    FUL_SUFFIX,
    LIKE_SUFFIX,
    WISE_SUFFIX,
    ISH_SUFFIX,
    ABLY_SUFFIX,
    SHIP_SUFFIX,
    ICALLY_SUFFIX,
    SOME_SUFFIX,
    ILY_SUFFIX,
    ICISM_SUFFIX,
    ICIZE_SUFFIX,
    NO_MATCH = -1
};
/* SUFFIX_INFO, g_SuffixTable[] ***********************************************
* This table is used to map the orthographic forms of suffixes to their suffix
* types. Each suffix is stored in reverse order for easier comparison with
* the ends of strings...
*/
struct SUFFIX_INFO
{
// Upper-case suffix spelling, reversed (e.g. L"GNI" for "-ing"), NUL-terminated.
// Fixed capacity of 10 WCHARs including the terminator.
WCHAR Orth[10];
// Suffix type this spelling maps to.
SUFFIX_TYPE Type;
};
// Reversed-spelling -> suffix-type lookup table.
// Within this table a longer suffix always appears before any shorter suffix
// it contains (L"YLI" before L"YL" before L"Y", L"EZICI" before L"EZI",
// L"MSICI" before L"MSI", L"S'" before L"S").
// NOTE(review): presumably the matcher takes the first hit, which would make
// this ordering significant - confirm in MatchSuffix before reordering rows.
// Several spellings share one type: "-able"/"-ible" -> ABLE_SUFFIX and
// "-ably"/"-ibly" -> ABLY_SUFFIX.
static const SUFFIX_INFO g_SuffixTable[] =
{
{ L"RE", ER_SUFFIX },
{ L"TSE", EST_SUFFIX },
{ L"GNI", ING_SUFFIX },
{ L"ELBA", ABLE_SUFFIX },
{ L"ELBI", ABLE_SUFFIX },
{ L"YLDE", EDLY_SUFFIX },
{ L"YLBA", ABLY_SUFFIX },
{ L"YLBI", ABLY_SUFFIX },
{ L"YLLACI", ICALLY_SUFFIX },
{ L"YLI", ILY_SUFFIX },
{ L"YL", LY_SUFFIX },
{ L"Y", Y_SUFFIX },
{ L"TNEM", MENT_SUFFIX },
{ L"RO", OR_SUFFIX },
{ L"SSEN", NESS_SUFFIX },
{ L"SSEL", LESS_SUFFIX },
{ L"EZICI", ICIZE_SUFFIX },
{ L"EZI", IZE_SUFFIX },
{ L"ZI", IZ_SUFFIX },
{ L"MSICI", ICISM_SUFFIX },
{ L"MSI", ISM_SUFFIX },
{ L"DE", ED_SUFFIX },
{ L"S'", APOSTROPHES_SUFFIX },
{ L"S", S_SUFFIX },
{ L"'", APOSTROPHE_SUFFIX },
{ L"EGA", AGE_SUFFIX },
{ L"DOOH", HOOD_SUFFIX },
{ L"LUF", FUL_SUFFIX },
{ L"EKIL", LIKE_SUFFIX },
{ L"ESIW", WISE_SUFFIX },
{ L"HSI", ISH_SUFFIX },
{ L"PIHS", SHIP_SUFFIX },
{ L"EMOS", SOME_SUFFIX },
};
/*** PHONTYPE *****************************************************************
* This enumeration creates flags which can be used to determine the relevant
* features of each phone. Each enumerator occupies its own bit, so the flags
* can be combined into a single mask (as the entries of g_PhonTable do).
*
* Note: this is a plain type declaration. The former "static const" prefix
* declared no object, was ignored by the compiler (MSVC warning C4091) and is
* rejected by conforming compilers, so it has been dropped.
*/
enum PHONTYPE
{
    eCONSONANTF = (1<<0),   // phone is treated as a consonant
    eVOICEDF    = (1<<1),   // phone is voiced
    ePALATALF   = (1<<2)    // phone is palatal (CH, JH, SH, ZH in g_PhonTable)
};
/*** g_PhonTable[] *************************************************************
* Feature lookup indexed by internal phone value (0..49). Each entry is a mask
* of PHONTYPE flags - just the cluster of features the phonological rules need.
* Entries without eCONSONANTF are vowels (or stress marks, which are treated
* like vowels). The g_phon* strings declared after this table exist only to
* make the code a bit more readable.
*/
static const long g_PhonTable[] =
{
    eCONSONANTF,                            //  0 -> default value; 0 is not a valid phone
    eCONSONANTF,                            //  1 -> syllable boundary - shouldn't ever occur at the end of a word
    eCONSONANTF,                            //  2 -> exclamation point - shouldn't ever occur at the end of a word
    eCONSONANTF,                            //  3 -> word boundary - treated as a consonant
    eCONSONANTF,                            //  4 -> comma - shouldn't ever occur at the end of a word
    eCONSONANTF,                            //  5 -> period - shouldn't ever occur at the end of a word
    eCONSONANTF,                            //  6 -> question mark - shouldn't ever occur at the end of a word
    eCONSONANTF,                            //  7 -> silence - shouldn't ever occur at the end of a word
    eVOICEDF,                               //  8 -> primary stress - treated as a vowel, since it should always be attached to a vowel nucleus
    eVOICEDF,                               //  9 -> secondary stress - see primary stress
    eVOICEDF,                               // 10 -> AA
    eVOICEDF,                               // 11 -> AE
    eVOICEDF,                               // 12 -> AH
    eVOICEDF,                               // 13 -> AO
    eVOICEDF,                               // 14 -> AW
    eVOICEDF,                               // 15 -> AX
    eVOICEDF,                               // 16 -> AY
    eCONSONANTF | eVOICEDF,                 // 17 -> b
    eCONSONANTF | ePALATALF,                // 18 -> CH
    eCONSONANTF | eVOICEDF,                 // 19 -> d
    eCONSONANTF | eVOICEDF,                 // 20 -> DH
    eVOICEDF,                               // 21 -> EH
    eVOICEDF,                               // 22 -> ER
    eVOICEDF,                               // 23 -> EY
    eCONSONANTF,                            // 24 -> f
    eCONSONANTF | eVOICEDF,                 // 25 -> g
    eCONSONANTF,                            // 26 -> h
    eVOICEDF,                               // 27 -> IH
    eVOICEDF,                               // 28 -> IY
    eCONSONANTF | eVOICEDF | ePALATALF,     // 29 -> JH
    eCONSONANTF,                            // 30 -> k
    eCONSONANTF | eVOICEDF,                 // 31 -> l
    eCONSONANTF | eVOICEDF,                 // 32 -> m
    eCONSONANTF | eVOICEDF,                 // 33 -> n
    eCONSONANTF | eVOICEDF,                 // 34 -> NG
    eVOICEDF,                               // 35 -> OW
    eVOICEDF,                               // 36 -> OY
    eCONSONANTF,                            // 37 -> p
    eCONSONANTF | eVOICEDF,                 // 38 -> r
    eCONSONANTF,                            // 39 -> s
    eCONSONANTF | ePALATALF,                // 40 -> SH
    eCONSONANTF,                            // 41 -> t
    eCONSONANTF,                            // 42 -> TH
    eVOICEDF,                               // 43 -> UH
    eVOICEDF,                               // 44 -> UW
    eCONSONANTF | eVOICEDF,                 // 45 -> v
    eCONSONANTF | eVOICEDF,                 // 46 -> w
    eCONSONANTF | eVOICEDF,                 // 47 -> y
    eCONSONANTF | eVOICEDF,                 // 48 -> z
    eCONSONANTF | eVOICEDF | ePALATALF,     // 49 -> ZH
};
// Named phone-string fragments, each written with a leading space followed by
// space-separated phone symbols (the same notation used for the SuffixString
// entries of g_SuffixInfoTable). They exist to keep the code that uses them
// readable.
// NOTE(review): these are non-const, internal-linkage arrays defined in a
// header, so every translation unit that includes it gets its own writable
// copy - confirm whether callers need them mutable before making them const.
static WCHAR g_phonAXl[] = L" AX l";
static WCHAR g_phonAXz[] = L" AX z";
static WCHAR g_phonS[] = L" s";
static WCHAR g_phonZ[] = L" z";
static WCHAR g_phonD[] = L" d";
static WCHAR g_phonAXd[] = L" AX d";
static WCHAR g_phonT[] = L" t";
static WCHAR g_phonIY[] = L" IY";
static WCHAR g_phonL[] = L" l";
/*** struct POS_CONVERT *******************************************************
* This struct stores the From and To parts of speech for a suffix...
* One POS_CONVERT describes a single conversion, e.g. {MS_Verb, MS_Noun} in
* the ER_SUFFIX row of g_SuffixInfoTable.
*/
struct POS_CONVERT
{
// Part of speech of the form the suffix attaches to.
ENGPARTOFSPEECH FromPos;
// Part of speech of the resulting suffixed form.
ENGPARTOFSPEECH ToPos;
};
/*** MorphSpecialCaseFlags ****************************************************
* Bit flags that keep DoSuffixMorph (nearly) table driven. Every row of the
* SuffixInfoTable carries a mask built from these values; the mask tells
* DoSuffixMorph which special case functions (check for missing E, etc.) have
* to be tried when the initial lex lookup fails. Each flag owns exactly one
* bit, so they may be combined freely.
*/
typedef enum MorphSpecialCaseFlags
{
    eCheckForMissingE     = 0x0001L,    // bit 0
    eCheckYtoIMutation    = 0x0002L,    // bit 1
    eCheckDoubledMutation = 0x0004L,    // bit 2
    eCheckForMissingY     = 0x0008L,    // bit 3
    eCheckForMissingL     = 0x0010L,    // bit 4
} MorphSpecialCaseFlags;
/*** struct SUFFIXPRON_INFO ***************************************************
* This struct stores the pronunciation of a suffix, as well as the POS
* categories it takes as input and output.
*/
struct SUFFIXPRON_INFO
{
// Pronunciation of the suffix; fixed buffer of SP_MAX_PRON_LENGTH WCHARs.
WCHAR SuffixString[SP_MAX_PRON_LENGTH];
// From/To part-of-speech pairs this suffix supports (capacity MAX_POSCONVERSIONS).
POS_CONVERT Conversions[MAX_POSCONVERSIONS];
// Number of entries of Conversions that are actually filled in.
short NumConversions;
// Bitwise combination of MorphSpecialCaseFlags values (0 for none).
DWORD dwMorphSpecialCaseFlags;
};
/*** bool SuffixInfoTableInitialized *******************************************
* This bool just lets threads know whether they are the first to use the
* following table, and thus whether they need to initialize it or not.
*
* NOTE(review): because this is a "static" variable defined in a header, every
* translation unit that includes morph.h gets its own independent copy of the
* flag. The "initialize once" intent therefore only holds per translation
* unit - confirm this header is included from a single .cpp.
*/
static bool SuffixInfoTableInitialized = false;
/*** SUFFIXPRON_INFO g_SuffixInfoTable *****************************************
* This table drives the DoSuffixMorph function, by storing the pronunciation,
* conversions, number of conversions, and special case flags for each suffix...
*
* The rows are listed in the same order as the SUFFIX_TYPE enumerators
* (S_SUFFIX first, ICIZE_SUFFIX last); the trailing comment on each row names
* the suffix type it belongs to. Keep the two in sync when adding a suffix.
*
* NOTE(review): the table is non-const and is paired with
* SuffixInfoTableInitialized, so the SuffixString fields are presumably
* rewritten in place on first use - confirm in the .cpp.
* NOTE(review): the ICALLY_SUFFIX, SOME_SUFFIX and ILY_SUFFIX rows spell their
* consonants in upper case (" L IY", " S AX M", " AX L IY") whereas every other
* row uses lower-case consonant symbols - verify that the phone conversion is
* case-insensitive.
*/
static SUFFIXPRON_INFO g_SuffixInfoTable [] =
{
/********************************************************************************************************/
/* Pronunciation * Conversions * NumConversions * Special Case Flags * SuffixType */
/********************************************************************************************************/
{ L" s", { {MS_Verb, MS_Verb},
{MS_Noun, MS_Noun} }, 2, 0 }, // S_SUFFIX
{ L" d", { {MS_Verb, MS_Verb},
{MS_Verb, MS_Adj} }, 2, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // ED_SUFFIX
{ L" IH NG", { {MS_Verb, MS_Verb},
{MS_Verb, MS_Adj},
{MS_Verb, MS_Noun} }, 3, eCheckForMissingE +
eCheckDoubledMutation }, // ING_SUFFIX
{ L" s", { {MS_Noun, MS_Noun} }, 1, 0 }, // APOSTROPHES_SUFFIX
{ L" s", { {MS_Noun, MS_Noun} }, 1, 0 }, // APOSTROPHE_SUFFIX
{ L" ER", { {MS_Verb, MS_Noun},
{MS_Adj, MS_Adj},
{MS_Adv, MS_Adv},
{MS_Adj, MS_Adv} }, 4, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // ER_SUFFIX
{ L" AX s t", { {MS_Adj, MS_Adj},
{MS_Adv, MS_Adv},
{MS_Adj, MS_Adv} }, 3, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // EST_SUFFIX
{ L" ER", { {MS_Verb, MS_Noun} }, 1, eCheckForMissingE +
eCheckDoubledMutation }, // OR_SUFFIX
{ L" m AX n t", { {MS_Verb, MS_Noun} }, 1, eCheckYtoIMutation }, // MENT_SUFFIX
{ L" IH JH", { {MS_Verb, MS_Noun} }, 1, eCheckForMissingE +
eCheckDoubledMutation }, // AGE_SUFFIX
{ L" l IH s", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // LESS_SUFFIX
{ L" IY", { {MS_Noun, MS_Adj},
{MS_Adj, MS_Adv} }, 2, eCheckForMissingE +
eCheckDoubledMutation }, // Y_SUFFIX
{ L" AX d l IY", { {MS_Verb, MS_Adj},
{MS_Verb, MS_Adv} }, 2, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // EDLY_SUFFIX
{ L" l IY", { {MS_Noun, MS_Adj},
{MS_Adj, MS_Adv} }, 2, eCheckForMissingL }, // LY_SUFFIX
{ L" AX - b AX l", { {MS_Verb, MS_Adj},
{MS_Noun, MS_Adj} }, 2, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // ABLE_SUFFIX
{ L" n IH s", { {MS_Adj, MS_Noun} }, 1, eCheckYtoIMutation }, // NESS_SUFFIX
{ L" IH z AX m", { {MS_Adj, MS_Noun},
{MS_Noun, MS_Noun} }, 2, eCheckForMissingE }, // ISM_SUFFIX
{ L" AY z", { {MS_Noun, MS_Verb},
{MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // IZE_SUFFIX
{ L" AY z", { {MS_Noun, MS_Verb},
{MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // IZ_SUFFIX
{ L" h UH d", { {MS_Noun, MS_Noun} }, 1, 0 }, // HOOD_SUFFIX
{ L" f AX l", { {MS_Noun, MS_Adj},
{MS_Verb, MS_Adj} }, 2, 0 } , // FUL_SUFFIX
{ L" l AY k", { {MS_Noun, MS_Adj} }, 1, 0 }, // LIKE_SUFFIX
{ L" w AY z", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // WISE_SUFFIX
{ L" IH SH", { {MS_Noun, MS_Adj} }, 1, eCheckForMissingE +
eCheckDoubledMutation }, // ISH_SUFFIX
{ L" AX - b l IY", { {MS_Verb, MS_Adv},
{MS_Noun, MS_Adv} }, 2, eCheckForMissingE +
eCheckYtoIMutation +
eCheckDoubledMutation }, // ABLY_SUFFIX
{ L" SH IH 2 p", { {MS_Noun, MS_Noun} }, 1, 0 }, // SHIP_SUFFIX
{ L" L IY", { {MS_Adj, MS_Adv} }, 1, 0 }, // ICALLY_SUFFIX
{ L" S AX M", { {MS_Noun, MS_Adj} }, 1, eCheckYtoIMutation }, // SOME_SUFFIX
{ L" AX L IY", { {MS_Noun, MS_Adv} }, 1, eCheckDoubledMutation +
eCheckForMissingY }, // ILY_SUFFIX
{ L" IH z AX m", { {MS_Adj, MS_Noun},
{MS_Noun, MS_Noun} }, 2, eCheckForMissingE }, // ICISM_SUFFIX
{ L" AY z", { {MS_Noun, MS_Verb},
{MS_Adj, MS_Verb} }, 2, eCheckForMissingE }, // ICIZE_SUFFIX
};
/*** CSuffixList **************************************************************
* This typedef just makes the code a little easier to read. A CSuffixList is
* used to keep track of each of the suffixes which has been stripped from a
* word, so that their pronunciations can be concatenated with that of the root.
* The list holds SUFFIXPRON_INFO pointers, not copies of the entries.
*/
typedef CSPList<SUFFIXPRON_INFO*, SUFFIXPRON_INFO*> CSuffixList;
/*** CComAutoCriticalSection g_SuffixInfoTableCritSec *************************
* This critical section is used to make sure the SuffixInfoTable only gets
* initialized once.
*
* NOTE(review): being a "static" object defined in a header, a separate
* critical section is created in every translation unit that includes
* morph.h, so it only serializes code within one translation unit - confirm
* this header is included from a single .cpp.
*/
static CComAutoCriticalSection g_SuffixInfoTableCritSec;
/*** CSMorph ******************************************************************
* This is the definition of the CSMorph class.
* Only declarations live here; the notes on individual members below are based
* on their names and signatures and should be checked against the
* implementation file.
*/
class CSMorph
{
public:
/*=== PUBLIC METHODS =====*/
// Both arguments default to 0 (null). pMasterLex is presumably stored in
// m_pMasterLex and hr presumably receives the construction result - confirm.
CSMorph( ISpLexicon *pMasterLex=0, HRESULT *hr=0 );
/*=== INTERFACE METHOD =====*/
// Entry point of the class: takes the word (pwWord), a caller-supplied root
// buffer (pwRoot), a language id, lookup flags and the pronunciation list to
// fill in.
HRESULT DoSuffixMorph( const WCHAR *pwWord, WCHAR *pwRoot, LANGID LangID, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
private:
/*=== PRIVATE METHODS =====*/
// Returns the SUFFIX_TYPE found on TargWord (NO_MATCH is the enum's
// "nothing matched" value); RootLen is an out-parameter.
SUFFIX_TYPE MatchSuffix( WCHAR *TargWord, long *RootLen );
// Pronunciation lookups for the first 'length' characters of pOrth.
HRESULT LexLookup( const WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT LTSLookup( const WCHAR *pOrth, long length,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
// Combine the suffixes collected in pSuffixList with the pronunciations in
// pWordPronunciationList.
HRESULT AccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT AccumulateSuffixes_LTS( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT DefaultAccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
// Special-case spelling checks. The first five share their names with the
// MorphSpecialCaseFlags bits; CheckYtoIEMutation and CheckAbleMutation have no
// corresponding flag in that enum.
HRESULT CheckForMissingE( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
HRESULT CheckForMissingY( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT CheckForMissingL( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
HRESULT CheckYtoIMutation( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
HRESULT CheckDoubledMutation( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
HRESULT CheckYtoIEMutation( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
HRESULT CheckAbleMutation( WCHAR *pOrth, long length, DWORD dwFlags,
SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
// Phonological helpers operating on a pronunciation buffer.
// NOTE(review): presumably choose between the s/z and d/ed variants of a
// suffix pronunciation based on the preceding phone - confirm.
HRESULT Phon_SorZ( WCHAR *pPronunciation, long length );
HRESULT Phon_DorED( WCHAR *pPronunciation, long length );
/*=== MEMBER DATA =====*/
// Pointer to the Master Lexicon...
ISpLexicon *m_pMasterLex;
};
/*** SearchPosSet *************************************************************
* Linear membership test: reports whether Pos is equal to any of the first
* Count elements of Set.
*
*   Pos   - part of speech to look for
*   Set   - array of candidate parts of speech (only read when Count > 0)
*   Count - number of elements of Set to examine
*
* Returns a non-zero BOOL if a match exists, zero otherwise (always zero for
* Count == 0). The scan stops at the first match.
*/
inline BOOL SearchPosSet( ENGPARTOFSPEECH Pos, const ENGPARTOFSPEECH *Set, ULONG Count )
{
    BOOL fFound = false;
    ULONG ulIndex = 0;

    while( !fFound && ulIndex < Count )
    {
        fFound = ( Set[ulIndex] == Pos );
        ++ulIndex;
    }

    return fFound;
}
#endif //--- End of File -------------------------------------------------------------