Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

386 lines
12 KiB

  1. /*****************************************************************************
  2. * spttseng.idl *
  3. *--------------*
  4. * Description:
  5. * This is the idl file for the Microsoft Text To Speech Driver.
  6. *-----------------------------------------------------------------------------
  7. * Creation: 03/01/99
  8. * Copyright (C) Microsoft Corporation 1999
  9. * All rights reserved.
  10. *
  11. ****************************************************************** EDC ******/
  12. //--- Import base idl
  13. import "oaidl.idl";
  14. import "ocidl.idl";
  15. import "sapiddk.idl";
  //=== Forward declarations ====================================================
  // These two interfaces are declared by name here, ahead of their full
  // definitions further down in this file.
  interface IMSVoiceData;
  interface IMSTTSEngineInit;
  /*** ENGPARTOFSPEECH
  *   Part-of-speech tags. The first group aliases the SPPS_* values
  *   one-for-one; every other tag is an offset from an SPPS_* base value
  *   (or from MS_Punctuation, which is itself an offset from SPPS_Function).
  */
  typedef enum ENGPARTOFSPEECH
  {
      //--- Direct aliases of the SPPS_* values
      MS_NotOverriden = SPPS_NotOverriden,
      MS_Unknown      = SPPS_Unknown,        // Probably from user lexicon
      MS_Noun         = SPPS_Noun,
      MS_Verb         = SPPS_Verb,
      MS_Modifier     = SPPS_Modifier,
      MS_Function     = SPPS_Function,
      MS_Interjection = SPPS_Interjection,

      //--- MS nouns (offsets from SPPS_Noun)
      MS_Pron         = SPPS_Noun + 1,
      MS_SubjPron     = SPPS_Noun + 2,
      MS_ObjPron      = SPPS_Noun + 3,
      MS_RelPron      = SPPS_Noun + 4,
      MS_PosNoun      = SPPS_Noun + 9,

      //--- MS modifiers (offsets from SPPS_Modifier)
      MS_Adj          = SPPS_Modifier + 1,
      MS_Adv          = SPPS_Modifier + 2,

      //--- MS function words (offsets from SPPS_Function)
      MS_VAux         = SPPS_Function + 1,
      MS_Conj         = SPPS_Function + 3,
      MS_CConj        = SPPS_Function + 4,
      MS_Interr       = SPPS_Function + 5,
      MS_Det          = SPPS_Function + 6,
      MS_Contr        = SPPS_Function + 7,
      MS_Prep         = SPPS_Function + 9,

      //--- MS punctuation (base is SPPS_Function + 11, the rest follow it)
      MS_Punctuation  = SPPS_Function + 11,
      MS_GroupBegin   = MS_Punctuation + 1,
      MS_GroupEnd     = MS_Punctuation + 2,
      MS_EOSItem      = MS_Punctuation + 3,
      MS_MiscPunc     = MS_Punctuation + 4,
      MS_Quotation    = MS_Punctuation + 5
  } ENGPARTOFSPEECH;
  /*** TTSItemType
  *   Classification of a sentence item. Two base values are defined first,
  *   eWORDLIST_NOT_VALID (0x0000) and eWORDLIST_IS_VALID (0x1000); every other
  *   enumerator is a small offset from one of them.
  *   NOTE(review): the base names suggest the range tells whether the item's
  *   word list is usable - confirm against the code that consumes this enum.
  */
  typedef enum TTSItemType
  {
      //--- Base values
      eWORDLIST_NOT_VALID         = 0x0000,
      eWORDLIST_IS_VALID          = 0x1000,

      //--- Items based on eWORDLIST_IS_VALID
      eUNMATCHED                  = eWORDLIST_IS_VALID + 1,
      eALPHA_WORD                 = eWORDLIST_IS_VALID + 2,
      eABBREVIATION               = eWORDLIST_IS_VALID + 3,
      eABBREVIATION_NORMALIZE     = eWORDLIST_IS_VALID + 4,
      eINITIALISM                 = eWORDLIST_IS_VALID + 5,
      eNUM_CARDINAL               = eWORDLIST_IS_VALID + 6,
      eNUM_ORDINAL                = eWORDLIST_IS_VALID + 7,
      eNUM_DECIMAL                = eWORDLIST_IS_VALID + 8,
      eNUM_PERCENT                = eWORDLIST_IS_VALID + 9,
      eNUM_DEGREES                = eWORDLIST_IS_VALID + 10,
      eNUM_SQUARED                = eWORDLIST_IS_VALID + 11,
      eNUM_CUBED                  = eWORDLIST_IS_VALID + 12,
      eNUM_CURRENCY               = eWORDLIST_IS_VALID + 13,
      eNUM_FRACTION               = eWORDLIST_IS_VALID + 14,
      eNUM_MIXEDFRACTION          = eWORDLIST_IS_VALID + 15,
      eNUM_ROMAN_NUMERAL          = eWORDLIST_IS_VALID + 16,
      eNUM_ROMAN_NUMERAL_ORDINAL  = eWORDLIST_IS_VALID + 17,
      eNUM_PHONENUMBER            = eWORDLIST_IS_VALID + 18,
      eNUM_ZIPCODE                = eWORDLIST_IS_VALID + 19,
      eDATE_YEAR                  = eWORDLIST_IS_VALID + 20,
      eDATE                       = eWORDLIST_IS_VALID + 21,
      eDATE_LONGFORM              = eWORDLIST_IS_VALID + 22,
      eDECADE                     = eWORDLIST_IS_VALID + 23,
      eTIMEOFDAY                  = eWORDLIST_IS_VALID + 24,
      eTIME                       = eWORDLIST_IS_VALID + 25,
      eSPELLOUT                   = eWORDLIST_IS_VALID + 26,
      eHYPHENATED_STRING          = eWORDLIST_IS_VALID + 27,
      eSTATE_AND_ZIPCODE          = eWORDLIST_IS_VALID + 28,
      eTIME_RANGE                 = eWORDLIST_IS_VALID + 29,
      eNUM_RANGE                  = eWORDLIST_IS_VALID + 30,
      eTEMP_NUMBER                = eWORDLIST_IS_VALID + 31,
      eTEMP_PERCENT               = eWORDLIST_IS_VALID + 32,
      eTEMP_DEGREES               = eWORDLIST_IS_VALID + 33,
      eTEMP_NUM_FRACTION          = eWORDLIST_IS_VALID + 34,
      eTEMP_NUM_MIXEDFRACTION     = eWORDLIST_IS_VALID + 35,
      eTEMP_NUM_DECIMAL           = eWORDLIST_IS_VALID + 36,
      eTEMP_NUM_ORDINAL           = eWORDLIST_IS_VALID + 37,
      eTEMP_NUM_CURRENCY          = eWORDLIST_IS_VALID + 38,
      eNEWNUM_PHONENUMBER         = eWORDLIST_IS_VALID + 39,
      eNUM_CURRENCYRANGE          = eWORDLIST_IS_VALID + 40,
      eSUFFIX                     = eWORDLIST_IS_VALID + 41,

      //--- Items based on eWORDLIST_NOT_VALID
      eOPEN_PARENTHESIS           = eWORDLIST_NOT_VALID + 1,
      eOPEN_BRACKET               = eWORDLIST_NOT_VALID + 2,
      eOPEN_BRACE                 = eWORDLIST_NOT_VALID + 3,
      eCLOSE_PARENTHESIS          = eWORDLIST_NOT_VALID + 4,
      eCLOSE_BRACKET              = eWORDLIST_NOT_VALID + 5,
      eCLOSE_BRACE                = eWORDLIST_NOT_VALID + 6,
      eSINGLE_QUOTE               = eWORDLIST_NOT_VALID + 7,
      eDOUBLE_QUOTE               = eWORDLIST_NOT_VALID + 8,
      ePERIOD                     = eWORDLIST_NOT_VALID + 9,
      eEXCLAMATION                = eWORDLIST_NOT_VALID + 10,
      eQUESTION                   = eWORDLIST_NOT_VALID + 11,
      eCOMMA                      = eWORDLIST_NOT_VALID + 12,
      eSEMICOLON                  = eWORDLIST_NOT_VALID + 13,
      eCOLON                      = eWORDLIST_NOT_VALID + 14,
      eHYPHEN                     = eWORDLIST_NOT_VALID + 15,
      eELLIPSIS                   = eWORDLIST_NOT_VALID + 16,
  } TTSItemType;
  /*** TTSWord
  *   One word: its XML state, its text and lemma (each as a pointer plus a
  *   length in WCHARs), its pronunciation, and its part of speech.
  */
  typedef struct TTSWord
  {
      const SPVSTATE*  pXmlState;          // XML state of the word
      LPCWSTR          pWordText;          // Orthographic form of the word
      ULONG            ulWordLen;          // Word length, in WCHARs
      LPCWSTR          pLemma;             // Orthographic form of the root word
      ULONG            ulLemmaLen;         // Lemma length, in WCHARs
      SPPHONEID*       pWordPron;          // NULL terminated pronunciation of the word
      ENGPARTOFSPEECH  eWordPartOfSpeech;  // Part of speech of the word - Is this needed???
  } TTSWord;
  /*** TTSItemInfo
  *   Per-item information; at present it carries only the item's TTSItemType.
  */
  typedef struct TTSItemInfo
  {
      TTSItemType  Type;   // Classification of the item
  } TTSItemInfo;
  /*** TTSSentItem
  *   One sentence item: where its original text lives (pointer, length,
  *   offset), the words it was normalized into, its overall part of speech,
  *   and a pointer to its TTSItemInfo.
  */
  typedef struct TTSSentItem
  {
      LPCWSTR          pItemSrcText;       // Original text of the item
      ULONG            ulItemSrcLen;       // Length of the original text
      ULONG            ulItemSrcOffset;    // Offset of the original text
      TTSWord*         Words;              // Words of the item, post normalization
      ULONG            ulNumWords;         // Number of words, post normalization
      ENGPARTOFSPEECH  eItemPartOfSpeech;  // Part of speech of the entire item
      TTSItemInfo*     pItemInfo;          // Item info (holds the TTSItemType)
  } TTSSentItem;
  //=== Constants ===============================================================
  // NOTE(review): presumably the upper bounds for MSVOICEINFO::LPCOrder and
  // MSVOICEINFO::FFTSize - confirm against the code that uses them.
  typedef enum INVCONST
  {
      MAX_LPCORDER = 30,
      MAX_FFTSIZE  = 512
  } INVCONST;
  //=== Interface definitions ===================================================
  ///// NOTE: This section to be moved to SAPI.IDL in SAPI6

  //--- IEnumSENTITEM ---------------------------------------------------------
  // Local (non-remoted) enumerator that hands out TTSSentItem structures.
  //
  [
      uuid(E0F4088D-CD08-11d2-B503-00C04F797396),
      helpstring("IEnumSENTITEM Interface"),
      object,
      local,
      pointer_default(unique)
  ]
  interface IEnumSENTITEM : IUnknown
  {
      // Writes one item into *pItemEnum.
      HRESULT Next( [out] TTSSentItem* pItemEnum );

      // Takes no arguments; by its name, rewinds the enumeration.
      HRESULT Reset( void );
  };
  //--- IEnumSpSentence -------------------------------------------------------
  // Local (non-remoted) enumerator over sentences. It is given a list of text
  // fragments from the input stream (with the rendering attributes described
  // by their associated XML tags) and returns one IEnumSENTITEM per call.
  //
  [
      uuid(299A9157-CD08-11d2-B503-00C04F797396),
      helpstring("IEnumSpSentence Interface"),
      object,
      local,
      pointer_default(unique)
  ]
  interface IEnumSpSentence : IUnknown
  {
      // Supplies the fragment list and the speak flags to enumerate over.
      HRESULT SetFragList( [in] const SPVTEXTFRAG* pTextFragList,
                           [in] DWORD              dwSpeakFlags );

      // Each returns an item enumerator through *ppSentItemEnum.
      HRESULT Next(     [out] IEnumSENTITEM** ppSentItemEnum );
      HRESULT Previous( [out] IEnumSENTITEM** ppSentItemEnum );

      // Takes no arguments; by its name, rewinds the enumeration.
      HRESULT Reset( void );
  };
  ///// End SAPI6 section
  // Upper limit on the number of parts of speech per pronunciation
  enum
  {
      POS_MAX = 4
  };
  // Where a pronunciation came from.
  // NOTE(review): LEX / LTS presumably mean lexicon lookup and
  // letter-to-sound rules - confirm against the code that sets them.
  typedef enum PRONSRC
  {
      PRON_LEX = 0,
      PRON_LTS = 1,
  } PRONSRC;
  //--------------------------------------------
  // Reduced part-of-speech set used for prosody
  //--------------------------------------------
  enum PROSODY_POS
  {
      POS_UNK     = 0,    // unknown
      POS_FUNC    = 1,    // any function word
      POS_CONTENT = 2,    // any content word
      POS_AUX     = 3,
  };
  // Reverb delay presets (stored in MSVOICEINFO::eReverbType)
  typedef enum REVERBTYPE
  {
      REVERB_TYPE_OFF     = 0,
      REVERB_TYPE_BATHTUB = 1,
      REVERB_TYPE_ROOM    = 2,
      REVERB_TYPE_HALL    = 3,
      REVERB_TYPE_CHURCH  = 4,
      REVERB_TYPE_STADIUM = 5,
      REVERB_TYPE_ECHO    = 6,
      REVERB_TYPE_ROBOSEQ = 7,    // Robot with 'sequencer'
  } REVERBTYPE;
  // Single-bit position flags for a unit; the values are distinct bits
  // (bit 0 and bit 1).
  typedef enum UNITFLAGS
  {
      WORD_START_FLAG = ( 1L << 0 ),    // A word begins on this unit
      SENT_START_FLAG = ( 1L << 1 ),    // A sentence begins on this unit
  } UNITFLAGS;
  // MAXTAPS sizes the MSVOICEINFO::TapCoefficients array.
  typedef enum TAPS
  {
      MAXTAPS = 8
  } TAPS;
  // User rate values: the minimum, the maximum and the default.
  enum USER_RATE_VALUE
  {
      MIN_USER_RATE     = ( -18 ),
      MAX_USER_RATE     = 18,
      DEFAULT_USER_RATE = 0        // None
  };
  // Sentinel one above MAX_USER_RATE: the rate is changed to the new value
  // only when that value is NOT this one.
  enum
  {
      NO_RATE_CHANGE = MAX_USER_RATE + 1
  };
  /*** UNIT_CVT
  *   Unit conversion record. The caller fills in the [in] members; the
  *   remaining members are marked [out] (see IMSVoiceData::GetUnitIDs, which
  *   takes an [in,out] array of these).
  */
  typedef struct UNIT_CVT
  {
      ULONG  PhonID;            // [in]  Phoneme ID
      ULONG  flags;             // [in]  Position flags
      ULONG  UnitID;            // [out] Inventory table ID
      ULONG  SenoneID;          // [out] Context offset
      float  Dur;               // [out] Duration in seconds
      float  Amp;               // [out] Amplitude
      float  AmpRatio;          // [out] Amplitude gain
      CHAR   szUnitName[15];    // [out] Name string
  } UNIT_CVT;
  /*** MSVOICEINFO
  *   Describes a voice data object: its audio format and language, speaking
  *   parameters, effect settings and analysis parameters.
  *   Returned by IMSVoiceData::GetVoiceInfo.
  */
  typedef struct MSVOICEINFO
  {
      WAVEFORMATEX  WaveFormatEx;               // Voice data format.
      LCID          LangID;                     // Voice data language ID
      ULONG         Rate;                       // Words-per-minute
      ULONG         Pitch;                      // Average pitch in Hz
      REVERBTYPE    eReverbType;                // Reverb param
      ULONG         ProsodyGain;                // 0 = monotone
      ULONG         NumOfTaps;                  // BE: Whisper param
      float         TapCoefficients[MAXTAPS];   // BE: Whisper param
      float         VibratoFreq;                // Hertz
      ULONG         VibratoDepth;               // 0 - 100%
      ULONG         SampleRate;                 // 22050 typical
      ULONG         LPCOrder;                   // Number of LPC coefficients
      ULONG         FFTSize;                    // FFT window length
      float*        pWindow;                    // Hanning Window
  } MSVOICEINFO;
  /*** MSUNITDATA
  *   Result of a unit fetch (see IMSVoiceData::GetUnitData): three counts
  *   followed by four float buffers.
  *   NOTE(review): how each count relates to each buffer's length is not
  *   stated in this file - confirm against the voice data implementation.
  */
  typedef struct MSUNITDATA
  {
      ULONG   cNumEpochs;
      ULONG   cNumSamples;
      ULONG   cOrder;
      float*  pEpoch;
      float*  pLPC;
      float*  pRes;
      float*  pGain;
  } MSUNITDATA;
  // Attribute bits for AlloToUnit(); only bit 0 is defined here.
  enum
  {
      ALLO_IS_STRESSED = ( 1 << 0 )
  };
  /*** IMSVoiceData
  *   Private, local interface on TTS voice data objects. A voice data object
  *   wraps the voice data together with the lookup logic needed to use it.
  */
  [
      uuid(6265B7E1-0340-11d3-B50C-00C04F797396),
      helpstring("IMSVoiceData Interface"),
      object,
      local,
      pointer_default(unique)
  ]
  interface IMSVoiceData : IUnknown
  {
      // Fills *pVoiceInfo with the voice description.
      HRESULT GetVoiceInfo( [out] MSVOICEINFO* pVoiceInfo );

      // Takes an array of cUnits UNIT_CVT records, updated in place.
      HRESULT GetUnitIDs( [in,out] UNIT_CVT* pUnits,
                          [in]     ULONG     cUnits );

      // Fetches the data for one unit ID into *pUnitData.
      HRESULT GetUnitData( [in]  ULONG       unitID,
                           [out] MSUNITDATA* pUnitData );

      // Takes an allo value plus attribute bits (see ALLO_IS_STRESSED) and
      // returns a unit ID through *pUnitID.
      HRESULT AlloToUnit( [in]  short allo,
                          [in]  long  attributes,
                          [out] long* pUnitID );
  };
  /*** IMSTTSEngineInit
  *   Private, local initialization interface on the engine. It is how the
  *   voice object gets connected to the synthesizer.
  */
  [
      uuid(8A7C38EB-D8B0-11d2-B504-00C04F797396),
      helpstring("IMSTTSEngineInit Interface"),
      object,
      local,
      pointer_default(unique)
  ]
  interface IMSTTSEngineInit : IUnknown
  {
      // Hands the engine the voice data object it should use.
      HRESULT VoiceInit( [in] IMSVoiceData* pVoiceData );
  };
  //=== Type library and CoClass definitions ====================================
  [
      uuid(3F7C4D29-D007-11D2-B503-00C04F797396),
      helpstring("MS TTS Engine 1.0 Type Library"),
      version(1.0)
  ]
  library MSTTSENGINELib
  {
      importlib("stdole32.tlb");
      importlib("stdole2.tlb");

      //--- MSVoiceData: loads the voice data files and exposes them to
      //    the driver. Its default interface is IMSVoiceData.
      [
          uuid(65DBDDEF-0725-11d3-B50C-00C04F797396),
          helpstring("MSVoiceData Class")
      ]
      coclass MSVoiceData
      {
          [default] interface IMSVoiceData;
      };

      //--- MSTTSEngine: the synthesizer object. Its default interface is
      //    ISpTTSEngine; it also exposes IMSTTSEngineInit.
      [
          uuid(B93AE09F-D033-11D2-B503-00C04F797396),
          helpstring("MSTTSEngine Class")
      ]
      coclass MSTTSEngine
      {
          [default] interface ISpTTSEngine;
          interface IMSTTSEngineInit;
      };
  };