You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
386 lines
13 KiB
386 lines
13 KiB
/*****************************************************************************
|
|
* spttseng.idl *
|
|
*--------------*
|
|
* Description:
|
|
* This is the idl file for the Microsoft Text To Speech Driver.
|
|
*-----------------------------------------------------------------------------
|
|
* Creation: 03/01/99
|
|
* Copyright (C) Microsoft Corporation 1999
|
|
* All rights reserved.
|
|
*
|
|
****************************************************************** EDC ******/
|
|
//--- Import base idl
|
|
import "oaidl.idl";
|
|
import "ocidl.idl";
|
|
import "sapiddk.idl";
|
|
|
|
//=== Forward References ======================================================
|
|
// Both interfaces are fully defined later in this file; they are declared
// up front so they can be referenced before their definitions.
interface IMSVoiceData;
interface IMSTTSEngineInit;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Engine part-of-speech codes.
//
// The first group aliases the SPPS_* values one-to-one. Every other entry is
// an engine-specific code written as an offset from an SPPS_* base value (or
// from MS_Punctuation). The offsets are deliberately not contiguous, so do
// not renumber them.
typedef enum ENGPARTOFSPEECH
{
    //--- Direct aliases of the SPPS_* values
    MS_NotOverriden = SPPS_NotOverriden,
    MS_Unknown      = SPPS_Unknown,         // Probably from user lexicon
    MS_Noun         = SPPS_Noun,
    MS_Verb         = SPPS_Verb,
    MS_Modifier     = SPPS_Modifier,
    MS_Function     = SPPS_Function,
    MS_Interjection = SPPS_Interjection,

    //--- MS nouns (offsets from SPPS_Noun)
    MS_Pron         = SPPS_Noun + 1,
    MS_SubjPron     = SPPS_Noun + 2,
    MS_ObjPron      = SPPS_Noun + 3,
    MS_RelPron      = SPPS_Noun + 4,
    MS_PosNoun      = SPPS_Noun + 9,

    //--- MS modifiers (offsets from SPPS_Modifier)
    MS_Adj          = SPPS_Modifier + 1,
    MS_Adv          = SPPS_Modifier + 2,

    //--- MS function words (offsets from SPPS_Function)
    MS_VAux         = SPPS_Function + 1,
    MS_Conj         = SPPS_Function + 3,
    MS_CConj        = SPPS_Function + 4,
    MS_Interr       = SPPS_Function + 5,
    MS_Det          = SPPS_Function + 6,
    MS_Contr        = SPPS_Function + 7,
    MS_Prep         = SPPS_Function + 9,

    //--- MS punctuation (MS_Punctuation is the base for the entries below it)
    MS_Punctuation  = SPPS_Function + 11,
    MS_GroupBegin   = MS_Punctuation + 1,
    MS_GroupEnd     = MS_Punctuation + 2,
    MS_EOSItem      = MS_Punctuation + 3,
    MS_MiscPunc     = MS_Punctuation + 4,
    MS_Quotation    = MS_Punctuation + 5
} ENGPARTOFSPEECH;
|
|
|
|
|
|
|
|
|
|
|
|
// Item type codes.
//
// The values fall into two numeric ranges, each expressed as an offset from
// a base enumerator:
//   eWORDLIST_IS_VALID  (0x1000) + 1..41
//   eWORDLIST_NOT_VALID (0x0000) + 1..16
// NOTE(review): going by the base names, the 0x1000 range presumably marks
// items whose word list is populated - confirm against the normalizer.
typedef enum TTSItemType
{
    //--- Range bases
    eWORDLIST_NOT_VALID         = 0x0000,
    eWORDLIST_IS_VALID          = 0x1000,

    //--- Offsets from eWORDLIST_IS_VALID
    eUNMATCHED                  = eWORDLIST_IS_VALID + 1,
    eALPHA_WORD                 = eWORDLIST_IS_VALID + 2,
    eABBREVIATION               = eWORDLIST_IS_VALID + 3,
    eABBREVIATION_NORMALIZE     = eWORDLIST_IS_VALID + 4,
    eINITIALISM                 = eWORDLIST_IS_VALID + 5,
    eNUM_CARDINAL               = eWORDLIST_IS_VALID + 6,
    eNUM_ORDINAL                = eWORDLIST_IS_VALID + 7,
    eNUM_DECIMAL                = eWORDLIST_IS_VALID + 8,
    eNUM_PERCENT                = eWORDLIST_IS_VALID + 9,
    eNUM_DEGREES                = eWORDLIST_IS_VALID + 10,
    eNUM_SQUARED                = eWORDLIST_IS_VALID + 11,
    eNUM_CUBED                  = eWORDLIST_IS_VALID + 12,
    eNUM_CURRENCY               = eWORDLIST_IS_VALID + 13,
    eNUM_FRACTION               = eWORDLIST_IS_VALID + 14,
    eNUM_MIXEDFRACTION          = eWORDLIST_IS_VALID + 15,
    eNUM_ROMAN_NUMERAL          = eWORDLIST_IS_VALID + 16,
    eNUM_ROMAN_NUMERAL_ORDINAL  = eWORDLIST_IS_VALID + 17,
    eNUM_PHONENUMBER            = eWORDLIST_IS_VALID + 18,
    eNUM_ZIPCODE                = eWORDLIST_IS_VALID + 19,
    eDATE_YEAR                  = eWORDLIST_IS_VALID + 20,
    eDATE                       = eWORDLIST_IS_VALID + 21,
    eDATE_LONGFORM              = eWORDLIST_IS_VALID + 22,
    eDECADE                     = eWORDLIST_IS_VALID + 23,
    eTIMEOFDAY                  = eWORDLIST_IS_VALID + 24,
    eTIME                       = eWORDLIST_IS_VALID + 25,
    eSPELLOUT                   = eWORDLIST_IS_VALID + 26,
    eHYPHENATED_STRING          = eWORDLIST_IS_VALID + 27,
    eSTATE_AND_ZIPCODE          = eWORDLIST_IS_VALID + 28,
    eTIME_RANGE                 = eWORDLIST_IS_VALID + 29,
    eNUM_RANGE                  = eWORDLIST_IS_VALID + 30,
    eTEMP_NUMBER                = eWORDLIST_IS_VALID + 31,
    eTEMP_PERCENT               = eWORDLIST_IS_VALID + 32,
    eTEMP_DEGREES               = eWORDLIST_IS_VALID + 33,
    eTEMP_NUM_FRACTION          = eWORDLIST_IS_VALID + 34,
    eTEMP_NUM_MIXEDFRACTION     = eWORDLIST_IS_VALID + 35,
    eTEMP_NUM_DECIMAL           = eWORDLIST_IS_VALID + 36,
    eTEMP_NUM_ORDINAL           = eWORDLIST_IS_VALID + 37,
    eTEMP_NUM_CURRENCY          = eWORDLIST_IS_VALID + 38,
    eNEWNUM_PHONENUMBER         = eWORDLIST_IS_VALID + 39,
    eNUM_CURRENCYRANGE          = eWORDLIST_IS_VALID + 40,
    eSUFFIX                     = eWORDLIST_IS_VALID + 41,

    //--- Offsets from eWORDLIST_NOT_VALID
    eOPEN_PARENTHESIS           = eWORDLIST_NOT_VALID + 1,
    eOPEN_BRACKET               = eWORDLIST_NOT_VALID + 2,
    eOPEN_BRACE                 = eWORDLIST_NOT_VALID + 3,
    eCLOSE_PARENTHESIS          = eWORDLIST_NOT_VALID + 4,
    eCLOSE_BRACKET              = eWORDLIST_NOT_VALID + 5,
    eCLOSE_BRACE                = eWORDLIST_NOT_VALID + 6,
    eSINGLE_QUOTE               = eWORDLIST_NOT_VALID + 7,
    eDOUBLE_QUOTE               = eWORDLIST_NOT_VALID + 8,
    ePERIOD                     = eWORDLIST_NOT_VALID + 9,
    eEXCLAMATION                = eWORDLIST_NOT_VALID + 10,
    eQUESTION                   = eWORDLIST_NOT_VALID + 11,
    eCOMMA                      = eWORDLIST_NOT_VALID + 12,
    eSEMICOLON                  = eWORDLIST_NOT_VALID + 13,
    eCOLON                      = eWORDLIST_NOT_VALID + 14,
    eHYPHEN                     = eWORDLIST_NOT_VALID + 15,
    eELLIPSIS                   = eWORDLIST_NOT_VALID + 16
} TTSItemType;
|
|
|
|
// One word of a sentence item. Text fields are pointer + explicit length
// pairs, with lengths counted in WCHARs.
typedef struct TTSWord
{
    const SPVSTATE  *pXmlState;         // XML state of the word
    LPCWSTR         pWordText;          // Orthographic form of the word
    ULONG           ulWordLen;          // Length of pWordText, in WCHARs
    LPCWSTR         pLemma;             // Orthographic form of the root word
    ULONG           ulLemmaLen;         // Length of pLemma, in WCHARs
    SPPHONEID       *pWordPron;         // NULL-terminated pronunciation of the word
    ENGPARTOFSPEECH eWordPartOfSpeech;  // Part of speech of the word
                                        // (open question from the original author: is this needed?)
} TTSWord;
|
|
|
|
// Per-item information record; currently holds only the item's type code.
typedef struct TTSItemInfo
{
    TTSItemType Type;   // One of the TTSItemType values
} TTSItemInfo;
|
|
|
|
// One item of a sentence: a reference to its original source text plus the
// words it was normalized into.
typedef struct TTSSentItem
{
    LPCWSTR         pItemSrcText;       // Original text of the item
    ULONG           ulItemSrcLen;       // Length of the original text of the item
    ULONG           ulItemSrcOffset;    // Offset of the original text of the item
    TTSWord         *Words;             // Words of the item, post normalization
    ULONG           ulNumWords;         // Number of words of the item, post normalization
    ENGPARTOFSPEECH eItemPartOfSpeech;  // Part of speech of the entire item
    TTSItemInfo     *pItemInfo;         // Item info record (carries the TTSItemType)
} TTSSentItem;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//=== Constants ===============================================================
|
|
// Named size constants.
// NOTE(review): presumably the upper limits for the LPCOrder and FFTSize
// members of MSVOICEINFO - confirm against the back end.
typedef enum INVCONST
{
    MAX_LPCORDER = 30,
    MAX_FFTSIZE  = 512
} INVCONST;
|
|
|
|
//=== Interface definitions ===================================================
|
|
|
|
///// NOTE: This section to be moved to SAPI.IDL in SAPI6
|
|
|
|
//--- IEnumSENTITEM ---------------------------------------------------------
// Enumerator that hands out TTSSentItem records.
//
[
    object,
    local,
    uuid(E0F4088D-CD08-11d2-B503-00C04F797396),
    helpstring("IEnumSENTITEM Interface"),
    pointer_default(unique)
]
interface IEnumSENTITEM : IUnknown
{
    // Writes one item into *pItemEnum.
    HRESULT Next(
        [out] TTSSentItem *pItemEnum );

    // NOTE(review): presumably rewinds the enumerator to its first item - confirm.
    HRESULT Reset( void );
};
|
|
|
|
//--- IEnumSpSentence -------------------------------------------------------
// Sentence enumerator. It is given a list of SPVTEXTFRAG text fragments plus
// speak flags through SetFragList(), and hands out one IEnumSENTITEM per
// call to Next() / Previous().
//
// NOTE(review): the comment originally placed here read "This structure
// points to a text fragment within the input stream and the rendering
// attributes that are described by associated XML tags". That wording
// appears to describe SPVTEXTFRAG rather than this interface - confirm.
//
[
    object,
    local,
    uuid(299A9157-CD08-11d2-B503-00C04F797396),
    helpstring("IEnumSpSentence Interface"),
    pointer_default(unique)
]
interface IEnumSpSentence : IUnknown
{
    // Supplies the fragment list and the speak flags to enumerate over.
    HRESULT SetFragList(
        [in] const SPVTEXTFRAG *pTextFragList,
        [in] DWORD              dwSpeakFlags );

    // Returns an item enumerator through *ppSentItemEnum.
    HRESULT Next(
        [out] IEnumSENTITEM **ppSentItemEnum );

    // Same output shape as Next().
    // NOTE(review): presumably steps backwards - confirm.
    HRESULT Previous(
        [out] IEnumSENTITEM **ppSentItemEnum );

    // NOTE(review): presumably rewinds the enumerator - confirm.
    HRESULT Reset( void );
};
|
|
|
|
///// End SAPI6 section
|
|
|
|
// Maximum number of part-of-speech (POS) entries per pronunciation
enum
{
    POS_MAX = 4
};
|
|
|
|
// Source a pronunciation came from.
// NOTE(review): LEX / LTS presumably stand for "lexicon" and
// "letter-to-sound" - confirm.
typedef enum PRONSRC
{
    PRON_LEX = 0,
    PRON_LTS = 1
} PRONSRC;
|
|
|
|
//------------------------------------------------
// Part-of-speech subset used for prosody.
// Values are sequential starting at zero.
//------------------------------------------------
enum PROSODY_POS
{
    POS_UNK     = 0,    // unknown
    POS_FUNC    = 1,    // any function word
    POS_CONTENT = 2,    // any content word
    POS_AUX     = 3     // NOTE(review): presumably auxiliary words - confirm
};
|
|
|
|
// Reverb delay presets. Values are sequential starting at zero, with
// REVERB_TYPE_OFF as the zero value.
typedef enum REVERBTYPE
{
    REVERB_TYPE_OFF     = 0,
    REVERB_TYPE_BATHTUB = 1,
    REVERB_TYPE_ROOM    = 2,
    REVERB_TYPE_HALL    = 3,
    REVERB_TYPE_CHURCH  = 4,
    REVERB_TYPE_STADIUM = 5,
    REVERB_TYPE_ECHO    = 6,
    REVERB_TYPE_ROBOSEQ = 7     // Robot with 'sequencer'
} REVERBTYPE;
|
|
|
|
|
|
// Single-bit flags describing a unit's position; they can be OR-ed together.
typedef enum UNITFLAGS
{
    WORD_START_FLAG = ( 1L << 0 ),  // A word starts on this unit
    SENT_START_FLAG = ( 1L << 1 )   // A sentence starts on this unit
} UNITFLAGS;
|
|
|
|
// MAXTAPS is the array bound of MSVOICEINFO::TapCoefficients.
typedef enum TAPS
{
    MAXTAPS = 8
} TAPS;
|
|
|
|
// User rate limits: a symmetric range around the default of zero.
enum USER_RATE_VALUE
{
    MIN_USER_RATE     = -18,
    MAX_USER_RATE     = 18,
    DEFAULT_USER_RATE = 0       // None
};
|
|
|
|
// Sentinel meaning "leave the rate alone": the rate is changed only when the
// value is NOT this. It is one past MAX_USER_RATE, so it lies outside the
// MIN_USER_RATE..MAX_USER_RATE range.
enum
{
    NO_RATE_CHANGE = MAX_USER_RATE + 1
};
|
|
|
|
|
|
/*** UNIT_CVT
*   This describes the unit info structure.
*   The [in] / [out] tags in the member comments are informal notes on which
*   side fills each member; the structure is passed as an [in,out] array to
*   IMSVoiceData::GetUnitIDs().
*/
typedef struct UNIT_CVT
{
    ULONG   PhonID;             // [in]  Phoneme ID
    ULONG   flags;              // [in]  Position flags
    ULONG   UnitID;             // [out] Inventory table ID
    ULONG   SenoneID;           // [out] Context offset
    float   Dur;                // [out] Duration in seconds
    float   Amp;                // [out] Amplitude
    float   AmpRatio;           // [out] Amplitude gain
    CHAR    szUnitName[15];     // [out] Name string
} UNIT_CVT;
|
|
|
|
/*** MSVOICEINFO
*   This describes the voice data object. It is the structure filled in by
*   IMSVoiceData::GetVoiceInfo().
*/
typedef struct MSVOICEINFO
{
    WAVEFORMATEX    WaveFormatEx;               // Voice data format
    LCID            LangID;                     // Voice data language ID
    ULONG           Rate;                       // Words per minute
    ULONG           Pitch;                      // Average pitch, in Hz
    REVERBTYPE      eReverbType;                // Reverb parameter
    ULONG           ProsodyGain;                // 0 = monotone
    ULONG           NumOfTaps;                  // BE: Whisper param
    float           TapCoefficients[MAXTAPS];   // BE: Whisper param
    float           VibratoFreq;                // Hertz
    ULONG           VibratoDepth;               // 0 - 100%
    ULONG           SampleRate;                 // 22050 typical
    ULONG           LPCOrder;                   // Number of LPC coefficients
    ULONG           FFTSize;                    // FFT window length
    float           *pWindow;                   // Hanning window
} MSVOICEINFO;
|
|
|
|
/*** MSUNITDATA
*   This is the result of a unit fetch, i.e. the structure filled in by
*   IMSVoiceData::GetUnitData(): three counts followed by four float buffers.
*   NOTE(review): which count sizes which buffer is not stated here - confirm
*   against the voice data implementation before relying on it.
*/
typedef struct MSUNITDATA
{
    ULONG   cNumEpochs;
    ULONG   cNumSamples;
    ULONG   cOrder;
    float   *pEpoch;
    float   *pLPC;
    float   *pRes;
    float   *pGain;
} MSUNITDATA;
|
|
|
|
// Attribute bits for the 'attributes' argument of AlloToUnit()
enum
{
    ALLO_IS_STRESSED = ( 1 << 0 )
};
|
|
|
|
/*** IMSVoiceData
*   Private interface on TTS voice data objects. A voice data object
*   encapsulates the voice data together with the lookup logic needed to
*   query it.
*/
[
    object,
    local,
    uuid(6265B7E1-0340-11d3-B50C-00C04F797396),
    helpstring("IMSVoiceData Interface"),
    pointer_default(unique)
]
interface IMSVoiceData : IUnknown
{
    // Fills *pVoiceInfo with the voice description.
    HRESULT GetVoiceInfo(
        [out] MSVOICEINFO *pVoiceInfo );

    // Takes an array of cUnits UNIT_CVT records and updates it in place.
    HRESULT GetUnitIDs(
        [in,out] UNIT_CVT *pUnits,
        [in]     ULONG     cUnits );

    // Fills *pUnitData for the unit identified by unitID.
    HRESULT GetUnitData(
        [in]  ULONG       unitID,
        [out] MSUNITDATA *pUnitData );

    // Returns a unit ID through *pUnitID for the given allo and attribute
    // bits (see ALLO_IS_STRESSED).
    HRESULT AlloToUnit(
        [in]  short  allo,
        [in]  long   attributes,
        [out] long  *pUnitID );
};
|
|
|
|
/*** IMSTTSEngineInit
*   Private engine initialization interface. It is used to connect the voice
*   object to the synthesizer.
*/
[
    object,
    local,
    uuid(8A7C38EB-D8B0-11d2-B504-00C04F797396),
    helpstring("IMSTTSEngineInit Interface"),
    pointer_default(unique)
]
interface IMSTTSEngineInit : IUnknown
{
    // Passes the voice data object to the engine.
    HRESULT VoiceInit(
        [in] IMSVoiceData *pVoiceData );
};
|
|
|
|
//=== CoClass definitions =====================================================
|
|
// Type library for the engine. It declares two coclasses: the voice data
// loader and the synthesizer.
[
    uuid(3F7C4D29-D007-11D2-B503-00C04F797396),
    version(1.0),
    helpstring("MS TTS Engine 1.0 Type Library")
]
library MSTTSENGINELib
{
    importlib("stdole32.tlb");
    importlib("stdole2.tlb");

    //--- This object is used to load the voice data files
    //    and expose them to the driver.
    [
        uuid(65DBDDEF-0725-11d3-B50C-00C04F797396),
        helpstring("MSVoiceData Class")
    ]
    coclass MSVoiceData
    {
        [default] interface IMSVoiceData;
    };

    //--- This is the synthesizer object. ISpTTSEngine is its default
    //    interface; IMSTTSEngineInit is the private initialization
    //    interface defined earlier in this file.
    [
        uuid(B93AE09F-D033-11D2-B503-00C04F797396),
        helpstring("MSTTSEngine Class")
    ]
    coclass MSTTSEngine
    {
        [default] interface ISpTTSEngine;
        interface IMSTTSEngineInit;
    };
};
|