windows-server-2003/enduser/speech/tts/msttsdrv/engine/morph.h

/*******************************************************************************
* morph.h *
*---------*
*   Description:
*       This is the header file for the CSMorph implementation.  This class 
*   attempts to find pronunciations for morphological variants (which do not
*   occur in the lexicon) of root words (which do occur in the lexicon).  
*-------------------------------------------------------------------------------
*  Created By: AH                            Date: 08/16/99
*  Copyright (C) 1999 Microsoft Corporation
*  All Rights Reserved
*******************************************************************************/
#ifndef Morph_h
#define Morph_h

#ifndef __spttseng_h__
#include "spttseng.h"
#endif

// Additional includes...
#include "stdafx.h"
#include "commonlx.h"

//== CONSTANTS ================================================================

// Capacity of SUFFIXPRON_INFO::Conversions - the largest number of
// (FromPos, ToPos) pairs a single suffix entry can hold.
#define MAX_POSCONVERSIONS 4
// NOTE(review): not referenced anywhere in this header; presumably the number
// of part-of-speech categories the morphology code deals with - confirm
// against the implementation file.
#define NUM_POS 5


/*** SUFFIX_TYPE **************************************************************
* This enumeration contains values for all of the suffixes which can be matched
* and accounted for by the CSMorph class.
*
* The non-negative values run contiguously from S_SUFFIX (0) to ICIZE_SUFFIX,
* in the same order as the row comments of g_SuffixInfoTable.  NO_MATCH (-1)
* is the only negative value.
*
* The declaration deliberately carries no "static const" prefix: storage-class
* and cv specifiers on a type declaration without a declarator are meaningless
* and are rejected by conforming C++ compilers.
*/
enum SUFFIX_TYPE
{
    S_SUFFIX = 0,
    ED_SUFFIX,
    ING_SUFFIX,
    APOSTROPHES_SUFFIX,
    APOSTROPHE_SUFFIX,
    ER_SUFFIX,
    EST_SUFFIX,
    OR_SUFFIX,
    MENT_SUFFIX,
    AGE_SUFFIX,
    LESS_SUFFIX,
    Y_SUFFIX,
    EDLY_SUFFIX,
    LY_SUFFIX,
    ABLE_SUFFIX,
    NESS_SUFFIX,
    ISM_SUFFIX,
    IZE_SUFFIX,
    IZ_SUFFIX,
    HOOD_SUFFIX,
    FUL_SUFFIX,
    LIKE_SUFFIX,
    WISE_SUFFIX,
    ISH_SUFFIX,
    ABLY_SUFFIX,
    SHIP_SUFFIX,
    ICALLY_SUFFIX,
    SOME_SUFFIX,
    ILY_SUFFIX,
    ICISM_SUFFIX,
    ICIZE_SUFFIX,
    NO_MATCH = -1       // Sentinel: not a real suffix, never a table index
};


/* SUFFIX_INFO, g_SuffixTable[] ***********************************************
* This table is used to map the orthographic forms of suffixes to their suffix
* types.  Each suffix is stored in reverse order for easier comparison with 
* the ends of strings...
*/
// One row of g_SuffixTable: a suffix spelling and the suffix type it denotes.
struct SUFFIX_INFO 
{
    WCHAR       Orth[10];   // Suffix spelling, upper-case and reversed (e.g. L"GNI" for "-ing"), NUL-terminated
    SUFFIX_TYPE Type;       // Suffix category this spelling maps to
};

// Reversed, upper-case suffix spellings and the SUFFIX_TYPE each one denotes.
// Several spellings may share a type ("ELBA"/"ELBI" -> ABLE_SUFFIX,
// "YLBA"/"YLBI" -> ABLY_SUFFIX).
//
// Within the table, a longer spelling always appears before any shorter
// spelling it ends with (e.g. "YLLACI" and "YLI" before "YL" before "Y",
// "S'" before "S", "EZICI" before "EZI", "MSICI" before "MSI").
// NOTE(review): presumably the matcher takes the first entry that matches, so
// this ordering must be preserved when adding rows - confirm in MatchSuffix.
static const SUFFIX_INFO g_SuffixTable[] = 
{ 
    { L"RE",        ER_SUFFIX },
    { L"TSE",       EST_SUFFIX },
    { L"GNI",       ING_SUFFIX },
    { L"ELBA",      ABLE_SUFFIX },
    { L"ELBI",      ABLE_SUFFIX },
    { L"YLDE",      EDLY_SUFFIX },
    { L"YLBA",      ABLY_SUFFIX },
    { L"YLBI",      ABLY_SUFFIX },
    { L"YLLACI",    ICALLY_SUFFIX },
    { L"YLI",       ILY_SUFFIX },
    { L"YL",        LY_SUFFIX },
    { L"Y",         Y_SUFFIX },
    { L"TNEM",      MENT_SUFFIX },
    { L"RO",        OR_SUFFIX },
    { L"SSEN",      NESS_SUFFIX },
    { L"SSEL",      LESS_SUFFIX },
    { L"EZICI",     ICIZE_SUFFIX },
    { L"EZI",       IZE_SUFFIX },
    { L"ZI",        IZ_SUFFIX },
    { L"MSICI",     ICISM_SUFFIX },
    { L"MSI",       ISM_SUFFIX },
    { L"DE",        ED_SUFFIX },
    { L"S'",        APOSTROPHES_SUFFIX },
    { L"S",         S_SUFFIX },
    { L"'",         APOSTROPHE_SUFFIX },
    { L"EGA",       AGE_SUFFIX },
    { L"DOOH",      HOOD_SUFFIX },
    { L"LUF",       FUL_SUFFIX },
    { L"EKIL",      LIKE_SUFFIX },
    { L"ESIW",      WISE_SUFFIX },
    { L"HSI",       ISH_SUFFIX },
    { L"PIHS",      SHIP_SUFFIX },
    { L"EMOS",      SOME_SUFFIX },
};


/*** PHONTYPE *****************************************************************
* This enumeration creates flags which can be used to determine the relevant
* features of each phone.  The flags are single bits and are combined per
* phone in g_PhonTable.
*
* The declaration deliberately carries no "static const" prefix: storage-class
* and cv specifiers on a type declaration without a declarator are meaningless
* and are rejected by conforming C++ compilers.
*/
enum PHONTYPE
{
    eCONSONANTF = (1<<0),   // Consonant (g_PhonTable also sets it for boundary/punctuation symbols)
    eVOICEDF    = (1<<1),   // Voiced: vowels, stress marks and voiced consonants
    ePALATALF   = (1<<2)    // Palatal: set for CH, JH, SH and ZH in g_PhonTable
};


/*** g_PhonTable[], g_PhonS, g_PhonZ *******************************************
* This table is used to map the internal values of phones to their types, which 
* are just clusters of features relevant to the necessary phonological rules.
* g_PhonS, g_PhonZ, g_PhonD, g_PhonT are just used to make the code a bit more
* readable.
*/
static const long g_PhonTable[] = 
{
    eCONSONANTF,                        // Default value - 0 is not a valid phone
    eCONSONANTF,                        // 1 is a syllable boundary - shouldn't ever occur at the end of a word
    eCONSONANTF,                        // 2 is an exclamation point - shouldn't ever occur at the end of a word
    eCONSONANTF,                        // 3 is a word boundary - treated as a consonant
    eCONSONANTF,                        // 4 is a comma - shouldn't ever occur at the end of a word
    eCONSONANTF,                        // 5 is a period - shouldn't ever occur at the end of a word
    eCONSONANTF,                        // 6 is a question mark - shouldn't ever occur at the end of a word
    eCONSONANTF,                        // 7 is a silence - shouldn't ever occur at the end of a word
    eVOICEDF,                           // 8 is primary stress - treat as a vowel since it should always be attached to a vowel nucleus
    eVOICEDF,                           // 9 is secondatry stress - see primary stress
    eVOICEDF,                           // 10 -> AA
    eVOICEDF,                           // 11 -> AE
    eVOICEDF,                           // 12 -> AH
    eVOICEDF,                           // 13 -> AO
    eVOICEDF,                           // 14 -> AW
    eVOICEDF,                           // 15 -> AX
    eVOICEDF,                           // 16 -> AY
    eCONSONANTF + eVOICEDF,             // 17 -> b
    eCONSONANTF + ePALATALF,            // 18 -> CH
    eCONSONANTF + eVOICEDF,             // 19 -> d
    eCONSONANTF + eVOICEDF,             // 20 -> DH
    eVOICEDF,                           // 21 -> EH
    eVOICEDF,                           // 22 -> ER
    eVOICEDF,                           // 23 -> EY
    eCONSONANTF,                        // 24 -> f
    eCONSONANTF + eVOICEDF,             // 25 -> g
    eCONSONANTF,                        // 26 -> h
    eVOICEDF,                           // 27 -> IH
    eVOICEDF,                           // 28 -> IY
    eCONSONANTF + eVOICEDF + ePALATALF, // 29 -> JH
    eCONSONANTF,                        // 30 -> k
    eCONSONANTF + eVOICEDF,             // 31 -> l
    eCONSONANTF + eVOICEDF,             // 32 -> m
    eCONSONANTF + eVOICEDF,             // 33 -> n
    eCONSONANTF + eVOICEDF,             // 34 -> NG
    eVOICEDF,                           // 35 -> OW
    eVOICEDF,                           // 36 -> OY
    eCONSONANTF,                        // 37 -> p
    eCONSONANTF + eVOICEDF,             // 38 -> r
    eCONSONANTF,                        // 39 -> s
    eCONSONANTF + ePALATALF,            // 40 -> SH
    eCONSONANTF,                        // 41 -> t
    eCONSONANTF,                        // 42 -> TH
    eVOICEDF,                           // 43 -> UH
    eVOICEDF,                           // 44 -> UW
    eCONSONANTF + eVOICEDF,             // 45 -> v
    eCONSONANTF + eVOICEDF,             // 46 -> w
    eCONSONANTF + eVOICEDF,             // 47 -> y
    eCONSONANTF + eVOICEDF,             // 48 -> z
    eCONSONANTF + eVOICEDF + ePALATALF, // 49 -> ZH
};

// Named phone strings, written in the same space-prefixed textual form as the
// pronunciations in g_SuffixInfoTable (each phone symbol preceded by a space).
// NOTE(review): presumably consumed by the Phon_SorZ / Phon_DorED style helpers
// when choosing between suffix allomorphs - confirm against the implementation.
// NOTE(review): these are mutable, non-const arrays with internal linkage, so
// every translation unit including this header gets its own writable copy.
static WCHAR g_phonAXl[] = L" AX l";
static WCHAR g_phonAXz[] = L" AX z";
static WCHAR g_phonS[] = L" s";
static WCHAR g_phonZ[] = L" z";
static WCHAR g_phonD[] = L" d";
static WCHAR g_phonAXd[] = L" AX d";
static WCHAR g_phonT[] = L" t";
static WCHAR g_phonIY[] = L" IY";
static WCHAR g_phonL[] = L" l";

/*** struct POS_CONVERT *******************************************************
* This struct stores one part-of-speech conversion for a suffix: the part of
* speech the suffix takes as input (From) and the one it produces (To).
* SUFFIXPRON_INFO holds up to MAX_POSCONVERSIONS of these per suffix.
*/
struct POS_CONVERT
{
    ENGPARTOFSPEECH FromPos;    // Part of speech the suffix attaches to
    ENGPARTOFSPEECH ToPos;      // Part of speech of the suffixed form
};

/*** MorphSpecialCaseFlags ****************************************************
* Bit flags that keep DoSuffixMorph almost entirely table driven.  Every row
* of the SuffixInfoTable carries a combination of these flags, telling
* DoSuffixMorph which special case functions (check for missing E, etc.) have
* to be tried when the initial lex lookup fails.  Each flag occupies its own
* bit, so flags can be combined freely.
*/
typedef enum MorphSpecialCaseFlags
{
    eCheckForMissingE       = 0x01L,    // bit 0
    eCheckYtoIMutation      = 0x02L,    // bit 1
    eCheckDoubledMutation   = 0x04L,    // bit 2
    eCheckForMissingY       = 0x08L,    // bit 3
    eCheckForMissingL       = 0x10L     // bit 4
} MorphSpecialCaseFlags;

/*** struct SUFFIXPRON_INFO ***************************************************
* This struct stores the pronunciation of a suffix, as well as the POS 
* categories it takes as input and output, and the special case checks that
* apply to it.  One instance per suffix lives in g_SuffixInfoTable.
*/
struct SUFFIXPRON_INFO 
{
    WCHAR SuffixString[SP_MAX_PRON_LENGTH];         // Suffix pronunciation (table rows supply space-separated phone symbols)
    POS_CONVERT Conversions[MAX_POSCONVERSIONS];    // From/To part-of-speech pairs; only the first NumConversions are meaningful
    short NumConversions;                           // Number of valid entries in Conversions
    DWORD dwMorphSpecialCaseFlags;                  // Combination of MorphSpecialCaseFlags values (0 for none)
};

/*** bool SuffixInfoTableInitialized *******************************************
* This bool just lets threads know whether they are the first to use the 
* following table, and thus whether they need to initialize it or not.
*
* NOTE(review): because this is a 'static' object defined in a header, every
* translation unit that includes this file gets its own independent copy (the
* same holds for the table and the critical section declared in this header).
* NOTE(review): a plain bool gives no synchronization on its own; presumably
* it is only read and written while g_SuffixInfoTableCritSec is held - confirm
* against the implementation.
*/
static bool SuffixInfoTableInitialized = false;

/*** SUFFIXPRON_INFO g_SuffixInfoTable *****************************************
* This table drives the DoSuffixMorph function, by storing the pronunciation, 
* conversions, number of conversions, and special case flags for each suffix...
*
* The rows appear in exactly the order of the SUFFIX_TYPE enumerators (see the
* trailing comment on each row).
* NOTE(review): presumably the table is indexed directly by SUFFIX_TYPE, so a
* row must be added/moved whenever the enumeration changes - confirm.
*
* The table is not const; SuffixInfoTableInitialized exists so that the first
* user can initialize it at run time.
*
* NOTE(review): the ICALLY_SUFFIX, SOME_SUFFIX and ILY_SUFFIX rows spell their
* consonants in upper case (" L IY", " S AX M", " AX L IY") while every other
* row uses lower case for the same phones (" l IY", " s", " m") - verify that
* the phone conversion applied to these strings is case-insensitive.
*/
static SUFFIXPRON_INFO g_SuffixInfoTable [] =
{
/********************************************************************************************************/
/*    Pronunciation     *  Conversions  *   NumConversions * Special Case Flags      *   SuffixType      */
/********************************************************************************************************/
    { L" s",            { {MS_Verb,   MS_Verb}, 
                          {MS_Noun,   MS_Noun}  },    2,  0 },                          // S_SUFFIX
    { L" d",            { {MS_Verb,   MS_Verb}, 
                          {MS_Verb,   MS_Adj}   },    2,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // ED_SUFFIX
    { L" IH NG",        { {MS_Verb,   MS_Verb}, 
                          {MS_Verb,   MS_Adj},
                          {MS_Verb,   MS_Noun}  },    3,  eCheckForMissingE +
                                                          eCheckDoubledMutation   },    // ING_SUFFIX
    { L" s",            { {MS_Noun,   MS_Noun}  },    1,  0 },                          // APOSTROPHES_SUFFIX
    { L" s",            { {MS_Noun,   MS_Noun}  },    1,  0 },                          // APOSTROPHE_SUFFIX
    { L" ER",           { {MS_Verb,   MS_Noun},
                          {MS_Adj,    MS_Adj}, 
                          {MS_Adv,    MS_Adv}, 
                          {MS_Adj,    MS_Adv}   },    4,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // ER_SUFFIX
    { L" AX s t",       { {MS_Adj,    MS_Adj}, 
                          {MS_Adv,    MS_Adv},
                          {MS_Adj,    MS_Adv}   },    3,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // EST_SUFFIX
    { L" ER",           { {MS_Verb,   MS_Noun}  },    1,  eCheckForMissingE +
                                                          eCheckDoubledMutation },      // OR_SUFFIX
    { L" m AX n t",     { {MS_Verb,   MS_Noun}  },    1,  eCheckYtoIMutation },         // MENT_SUFFIX
    { L" IH JH",        { {MS_Verb,   MS_Noun}  },    1,  eCheckForMissingE + 
                                                          eCheckDoubledMutation   },    // AGE_SUFFIX
    { L" l IH s",       { {MS_Noun,   MS_Adj}   },    1,  eCheckYtoIMutation      },    // LESS_SUFFIX
    { L" IY",           { {MS_Noun,   MS_Adj},
                          {MS_Adj,    MS_Adv}   },    2,  eCheckForMissingE +
                                                          eCheckDoubledMutation   },    // Y_SUFFIX
    { L" AX d l IY",    { {MS_Verb,   MS_Adj},
                          {MS_Verb,   MS_Adv}   },    2,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // EDLY_SUFFIX
    { L" l IY",         { {MS_Noun,   MS_Adj},
                          {MS_Adj,    MS_Adv}   },    2,  eCheckForMissingL },          // LY_SUFFIX
    { L" AX - b AX l",  { {MS_Verb,   MS_Adj},
                          {MS_Noun,   MS_Adj}   },    2,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // ABLE_SUFFIX
    { L" n IH s",       { {MS_Adj,    MS_Noun}  },    1,  eCheckYtoIMutation      },    // NESS_SUFFIX
    { L" IH z AX m",    { {MS_Adj,    MS_Noun},
                          {MS_Noun,   MS_Noun}  },    2,  eCheckForMissingE       },    // ISM_SUFFIX
    { L" AY z",         { {MS_Noun,   MS_Verb}, 
                          {MS_Adj,    MS_Verb}  },    2,  eCheckForMissingE       },    // IZE_SUFFIX
    { L" AY z",         { {MS_Noun,   MS_Verb},
                          {MS_Adj,    MS_Verb}  },    2,  eCheckForMissingE       },    // IZ_SUFFIX
    { L" h UH d",       { {MS_Noun,   MS_Noun}  },    1,  0 },                          // HOOD_SUFFIX
    { L" f AX l",       { {MS_Noun,   MS_Adj},
                          {MS_Verb,   MS_Adj}   },    2,  0 } ,                         // FUL_SUFFIX
    { L" l AY k",       { {MS_Noun,   MS_Adj}   },    1,  0 },                          // LIKE_SUFFIX
    { L" w AY z",       { {MS_Noun,   MS_Adj}   },    1,  eCheckYtoIMutation },                        // WISE_SUFFIX
    { L" IH SH",        { {MS_Noun,   MS_Adj}   },    1,  eCheckForMissingE +
                                                          eCheckDoubledMutation   },    // ISH_SUFFIX
    { L" AX - b l IY",  { {MS_Verb,   MS_Adv},
                          {MS_Noun,   MS_Adv}   },    2,  eCheckForMissingE +
                                                          eCheckYtoIMutation +
                                                          eCheckDoubledMutation   },    // ABLY_SUFFIX
    { L" SH IH 2 p",    { {MS_Noun,   MS_Noun}  },    1,  0 },                          // SHIP_SUFFIX
    { L" L IY",         { {MS_Adj,    MS_Adv}   },    1,  0 },                          // ICALLY_SUFFIX
    { L" S AX M",       { {MS_Noun,   MS_Adj}   },    1,  eCheckYtoIMutation      },    // SOME_SUFFIX
    { L" AX L IY",      { {MS_Noun,   MS_Adv}   },    1,  eCheckDoubledMutation +
                                                          eCheckForMissingY       },    // ILY_SUFFIX
    { L" IH z AX m",    { {MS_Adj,    MS_Noun},
                          {MS_Noun,   MS_Noun}  },    2,  eCheckForMissingE       },    // ICISM_SUFFIX
    { L" AY z",         { {MS_Noun,   MS_Verb}, 
                          {MS_Adj,    MS_Verb}  },    2,  eCheckForMissingE       },    // ICIZE_SUFFIX
};

/*** CSuffixList **************************************************************
* This typedef just makes the code a little easier to read.  A CSuffixList is
* used to keep track of each of the suffixes which have been stripped from a
* word, so that their pronunciations can be concatenated with that of the root.
* The list stores pointers to SUFFIXPRON_INFO entries, not copies of them.
*/
typedef CSPList<SUFFIXPRON_INFO*, SUFFIXPRON_INFO*> CSuffixList;

/*** CComAutoCriticalSection g_SuffixInfoTableCritSec *************************
* This critical section is used to make sure the SuffixInfoTable only gets
* initialized once.
*
* NOTE(review): being a 'static' object defined in a header, a separate
* critical section is created in every translation unit that includes this
* file; it therefore only serializes code within the same translation unit.
*/
static CComAutoCriticalSection g_SuffixInfoTableCritSec;

/*** CSMorph ******************************************************************
* This is the definition of the CSMorph class.  The class attempts to find
* pronunciations for morphological variants (which do not occur in the
* lexicon) of root words (which do occur in the lexicon).
*
* NOTE(review): the method bodies are not part of this header.  The per-method
* notes below are inferred from the declarations and from the table comments
* in this file and should be confirmed against the implementation.
*/
class CSMorph
{
public:

    /*=== PUBLIC METHODS =====*/
    // Both arguments default to 0.  NOTE(review): presumably pMasterLex is
    // stored in m_pMasterLex and *hr receives the construction status - confirm.
    CSMorph( ISpLexicon *pMasterLex=0, HRESULT *hr=0 );

    /*=== INTERFACE METHOD =====*/
    // Table-driven suffix morphology entry point: driven by g_SuffixInfoTable,
    // falling back on the special case checks selected by each suffix's
    // MorphSpecialCaseFlags when the initial lex lookup fails.
    // NOTE(review): presumably pwRoot is an output buffer for the root that was
    // found and pWordPronunciationList receives the result - confirm.
    HRESULT DoSuffixMorph( const WCHAR *pwWord, WCHAR *pwRoot, LANGID LangID, DWORD dwFlags,
                           SPWORDPRONUNCIATIONLIST *pWordPronunciationList );

private:


    /*=== PRIVATE METHODS =====*/
    // NOTE(review): presumably compares the end of TargWord against
    // g_SuffixTable, returning NO_MATCH on failure and the root length through
    // RootLen - confirm.
    SUFFIX_TYPE MatchSuffix( WCHAR *TargWord, long *RootLen );
    // Lookup helpers.  NOTE(review): presumably 'length' is the number of
    // characters of pOrth to consider - confirm.
    HRESULT LexLookup( const WCHAR *pOrth, long length, DWORD dwFlags, 
                       SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
    HRESULT LTSLookup( const WCHAR *pOrth, long length,
                       SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    // Helpers that combine the suffixes collected in a CSuffixList with the
    // pronunciations in pWordPronunciationList.
    HRESULT AccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
    HRESULT AccumulateSuffixes_LTS( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
    HRESULT DefaultAccumulateSuffixes( CSuffixList *pSuffixList, SPWORDPRONUNCIATIONLIST *pWordPronunciationList );

    // Special case checks.  The first five have a matching
    // MorphSpecialCaseFlags bit; CheckYtoIEMutation and CheckAbleMutation have
    // no flag in this header.
    HRESULT CheckForMissingE( WCHAR *pOrth, long length, DWORD dwFlags,
                              SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    HRESULT CheckForMissingY( WCHAR *pOrth, long length, DWORD dwFlags,
                              SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
    HRESULT CheckForMissingL( WCHAR *pOrth, long length, DWORD dwFlags,
                              SPWORDPRONUNCIATIONLIST *pWordPronunciationList );
    HRESULT CheckYtoIMutation( WCHAR *pOrth, long length, DWORD dwFlags, 
                               SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    HRESULT CheckDoubledMutation( WCHAR *pOrth, long length, DWORD dwFlags,
                                  SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    HRESULT CheckYtoIEMutation( WCHAR *pOrth, long length, DWORD dwFlags,
                                SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    HRESULT CheckAbleMutation( WCHAR *pOrth, long length, DWORD dwFlags,
                               SPWORDPRONUNCIATIONLIST *pWordPronunciationList);
    // NOTE(review): presumably pick the suffix allomorph (s vs. z, d vs. AX d)
    // from the features of the final phone in pPronunciation - confirm.
    HRESULT Phon_SorZ( WCHAR *pPronunciation, long length );
    HRESULT Phon_DorED( WCHAR *pPronunciation, long length ); 

    /*=== MEMBER DATA =====*/

    // Pointer to the Master Lexicon...
    ISpLexicon  *m_pMasterLex;
};

/*** SearchPosSet *************************************************************
* Linear membership test: reports whether Pos equals any of the first Count
* elements of Set.  Returns a non-zero BOOL when a match exists and zero
* otherwise (including when Count is 0, in which case Set is never read).
*/
inline BOOL SearchPosSet( ENGPARTOFSPEECH Pos, const ENGPARTOFSPEECH *Set, ULONG Count )
{
    BOOL  fFound = false;
    ULONG ulIndex = 0;

    // Stop as soon as a match is seen or the set is exhausted.
    while( !fFound && ulIndex < Count )
    {
        if( Set[ulIndex] == Pos )
        {
            fFound = true;
        }
        ++ulIndex;
    }

    return fFound;
}

#endif //--- End of File -------------------------------------------------------------