Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

632 lines
15 KiB

  1. /******************************************************************************
  2. * AlloOps.h *
  3. *-----------*
  4. * This is the header file for the following classes:
  5. * CAlloCell
  6. * CAlloList
  7. * CDuration
  8. * CSyllableTagger
  9. * CToneTargets
  10. * CPitchProsody
  11. *------------------------------------------------------------------------------
  12. * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99
  13. * All Rights Reserved
  14. *
  15. *********************************************************************** MC ****/
  16. #ifndef AlloOps_H
  17. #define AlloOps_H
  18. #include "stdafx.h"
  19. #include "commonlx.h"
  20. #ifndef __spttseng_h__
  21. #include "spttseng.h"
  22. #endif
  23. #ifndef FeedChain_H
  24. #include "FeedChain.h"
  25. #endif
  26. #ifndef SPCollec_h
  27. #include <SPCollec.h>
  28. #endif
  29. #include "SpTtsEngDebug.h"
  30. //***************************
  31. // Allophones
  32. //***************************
  33. typedef enum // Allophone codes; no explicit values, so they count up from 0 in the order listed
  34. {
  35. _IY_, _IH_, _EH_, _AE_, _AA_, _AH_, _AO_, _UH_, _AX_, _ER_,
  36. _EY_, _AY_, _OY_, _AW_, _OW_, _UW_,
  37. _IX_, _SIL_, _w_, _y_,
  38. _r_, _l_, _h_, _m_, _n_, _NG_, _f_, _v_, _TH_, _DH_,
  39. _s_, _z_, _SH_, _ZH_, _p_, _b_, _t_, _d_, _k_, _g_,
  40. _CH_, _JH_, _DX_,
  41. _STRESS1_, // NOTE(review): the last four codes look like stress/syllable markers rather than phones - confirm
  42. _STRESS2_,
  43. _EMPHSTRESS_,
  44. _SYLLABLE_, // must stay last: NUMBER_OF_ALLO is derived from it
  45. } ALLO_CODE;
  46. static const long NUMBER_OF_ALLO = (_SYLLABLE_ + 1); // count of ALLO_CODE enumerators (47 as listed)
  47. //-----------------------------------
  48. // For 2-word allo conversion
  49. //-----------------------------------
  50. static const short NO_IPA = 0; // NOTE(review): presumably the "no IPA symbol" sentinel for the 2-word allo conversion - confirm at use sites
  51. // XXXX XXXX XXXX XXXX XXXX bLis ssoo ttBB
  52. // X = unused
  53. // B = boundary type
  54. // t = syllable type
  55. // o = vowel order
  56. // s = stress type
  57. // i = word initial consonant
  58. // L = syLlable start
  59. // b = break
  60. enum ALLOTAGS // Control-flag bits; positions follow the "bLis ssoo ttBB" bit map above
  61. {
  62. WORD_START = (1 << 0), // boundary type, bit 0
  63. TERM_BOUND = (1 << 1), // boundary type, bit 1
  64. BOUNDARY_TYPE_FIELD = WORD_START | TERM_BOUND, // mask
  65. WORD_END_SYLL = (1 << 2), // syllable type, bit 2
  66. TERM_END_SYLL = (1 << 3), // syllable type, bit 3
  67. SYLLABLE_TYPE_FIELD = WORD_END_SYLL | TERM_END_SYLL, // mask
  68. FIRST_SYLLABLE_IN_WORD = (1 << 4), // in multi-syllable word; order is a 2-bit value (1, 2 or 3) in bits 4-5, not independent flags
  69. MID_SYLLABLE_IN_WORD = (2 << 4),
  70. LAST_SYLLABLE_IN_WORD = (3 << 4),
  71. MORE_THAN_ONE_SYLLABLE_IN_WORD = LAST_SYLLABLE_IN_WORD, // either bit is set
  72. ONE_OR_NO_SYLLABLE_IN_WORD = 0x0000, // neither bit is set
  73. SYLLABLE_ORDER_FIELD = LAST_SYLLABLE_IN_WORD, // mask
  74. PRIMARY_STRESS = (1 << 6),
  75. SECONDARY_STRESS = (1 << 7),
  76. EMPHATIC_STRESS = (1 << 8),
  77. IS_STRESSED = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, // any of the three stress bits
  78. PRIM_OR_EMPH_STRESS = PRIMARY_STRESS | EMPHATIC_STRESS,
  79. STRESS_FIELD = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, // mask
  80. WORD_INITIAL_CONSONANT = (1 << 9), // up to 1st vowel in word
  81. STRESSED_INITIAL_CONS = (IS_STRESSED + WORD_INITIAL_CONSONANT), // '+' equals '|' here: bits 6-8 and bit 9 do not overlap
  82. SYLLABLE_START = (1 << 10), // bit 'L' in the map
  83. SIL_BREAK = (1 << 11), // bit 'b' (break) in the map
  84. };
  85. //***************************
  86. // AlloFlags
  87. //***************************
  88. enum ALLOFLAGS // One bit per flag, bits 0-27, followed by two composite masks
  89. {
  90. KVOWELF = (1<<0),
  91. KCONSONANTF = (1<<1),
  92. KVOICEDF = (1<<2),
  93. KVOWEL1F = (1<<3),
  94. KSONORANTF = (1<<4),
  95. KSONORANT1F = (1<<5),
  96. KNASALF = (1<<6),
  97. KLIQGLIDEF = (1<<7),
  98. KSONORCONSONF = (1<<8),
  99. KPLOSIVEF = (1<<9),
  100. KPLOSFRICF = (1<<10),
  101. KOBSTF = (1<<11),
  102. KSTOPF = (1<<12),
  103. KALVEOLARF = (1<<13),
  104. KVELAR = (1<<14),
  105. KLABIALF = (1<<15),
  106. KDENTALF = (1<<16),
  107. KPALATALF = (1<<17),
  108. KYGLIDESTARTF = (1<<18),
  109. KYGLIDEENDF = (1<<19),
  110. KGSTOPF = (1<<20),
  111. KFRONTF = (1<<21),
  112. KDIPHTHONGF = (1<<22),
  113. KHASRELEASEF = (1<<23),
  114. KAFFRICATEF = (1<<24),
  115. KLIQGLIDE2F = (1<<25),
  116. KVOCLIQ = (1<<26),
  117. KFRIC = (1<<27),
  118. KFLAGMASK1 = (KLABIALF+KDENTALF+KPALATALF+KALVEOLARF+KVELAR+KGSTOPF), // sum of six distinct single-bit flags (bits 13-17 and 20), so '+' equals '|'
  119. KFLAGMASK2 = (KALVEOLARF-1), // every bit below KALVEOLARF: bits 0-12 (0x1FFF)
  120. };
  121. #define BOUNDARY_BASE 1000 // value of the first non-null TOBI_BOUNDARY enumerator
  122. enum TOBI_BOUNDARY // K_NOBND is 0; the remaining tones run consecutively from BOUNDARY_BASE
  123. {
  124. K_NOBND = 0,
  125. K_LMINUS = BOUNDARY_BASE, // fall
  126. K_HMINUS, // none
  127. K_LMINUSLPERC, // BOUNDARY_BASE + 2
  128. K_LMINUSHPERC, // BOUNDARY_BASE + 3
  129. K_HMINUSHPERC, // BOUNDARY_BASE + 4
  130. K_HMINUSLPERC, // BOUNDARY_BASE + 5
  131. };
  132. enum TUNE_TYPE // Boundary kinds; values are implicit, so the ordering notes below matter
  133. {
  134. NULL_BOUNDARY = 0, // no boundary NOTE: always put this at the beginning
  135. PHRASE_BOUNDARY, // comma
  136. EXCLAM_BOUNDARY, // exclamatory utterance terminator
  137. YN_QUEST_BOUNDARY, // yes-no question terminator
  138. WH_QUEST_BOUNDARY, // wh-question terminator
  139. DECLAR_BOUNDARY, // declarative terminator
  140. PAREN_L_BOUNDARY, // left paren
  141. PAREN_R_BOUNDARY, // right paren
  142. QUOTE_L_BOUNDARY, // left quote
  143. QUOTE_R_BOUNDARY, // right quote
  144. PHONE_BOUNDARY,
  145. TOD_BOUNDARY,
  146. ELLIPSIS_BOUNDARY,
  147. SUB_BOUNDARY_1, // NOTE: always put these at the end
  148. SUB_BOUNDARY_2,
  149. SUB_BOUNDARY_3,
  150. SUB_BOUNDARY_4,
  151. SUB_BOUNDARY_5,
  152. SUB_BOUNDARY_6,
  153. NUMBER_BOUNDARY,
  154. TAIL_BOUNDARY,
  155. };
  156. //***************************
  157. // ToBI Constants
  158. //***************************
  159. // !H is removed from consideration in the first pass processing
  160. // !H can possibly be recovered from analysis of the labeling and
  161. // contour at later stages (tilt, prominence, pitch range, downstep)
  162. #define ACCENT_BASE 1 // value of the first non-null TOBI_ACCENT enumerator
  163. enum TOBI_ACCENT // K_NOACC is 0; the accents run consecutively from ACCENT_BASE (1 through 7)
  164. {
  165. K_NOACC = 0,
  166. K_HSTAR = ACCENT_BASE, // peak rise / fall
  167. K_LSTAR, // acc syll nucleus valley early fall
  168. K_LSTARH, // late rise
  169. K_RSTAR, //
  170. K_LHSTAR, // early rise
  171. K_DHSTAR, //
  172. K_HSTARLSTAR,
  173. };
  174. enum BOUNDARY_SOURCE // Type of the "Diagnostic" m_BoundarySource members; presumably names the rule or template that set a boundary - confirm
  175. {
  176. BND_NoSource = 0,
  177. //-- Phrase boundary rules
  178. BND_PhraseRule1,
  179. BND_PhraseRule2,
  180. BND_PhraseRule3,
  181. BND_PhraseRule4,
  182. BND_PhraseRule5,
  183. BND_PhraseRule6,
  184. BND_PhraseRule7,
  185. BND_PhraseRule8,
  186. BND_PhraseRule9,
  187. BND_PhraseRule10,
  188. BND_PhraseRule11,
  189. BND_PhraseRule12,
  190. BND_PhraseRule13,
  191. //-- ToBI
  192. BND_YNQuest,
  193. BND_WHQuest,
  194. BND_Period,
  195. BND_Comma,
  196. //--Templates
  197. BND_NumberTemplate, // Should never get this!
  198. BND_IntegerQuant,
  199. BND_Currency_DOLLAR,
  200. BND_Frac_Num,
  201. BND_Phone_COUNTRY,
  202. BND_Phone_AREA,
  203. BND_Phone_ONE,
  204. BND_Phone_DIGITS,
  205. BND_TimeOFDay_HR,
  206. BND_TimeOFDay_AB,
  207. BND_Ellipsis,
  208. BND_ForcedTerm, // Should never get this!
  209. BND_IDontKnow,
  210. };
  211. enum ACCENT_SOURCE // Type of the "Diagnostic" m_AccentSource members; presumably names the rule or template that set an accent - confirm
  212. {
  213. ACC_NoSource = 0,
  214. //-- Phrase boundary rules (NOTE(review): same heading as in BOUNDARY_SOURCE; these are the ACC_ phrase-rule tags)
  215. ACC_PhraseRule1,
  216. ACC_PhraseRule2,
  217. ACC_PhraseRule3,
  218. ACC_PhraseRule4,
  219. ACC_PhraseRule5,
  220. ACC_PhraseRule6,
  221. ACC_PhraseRule7,
  222. ACC_PhraseRule8,
  223. ACC_PhraseRule9,
  224. ACC_PhraseRule10,
  225. ACC_PhraseRule11,
  226. ACC_PhraseRule12,
  227. ACC_PhraseRule13,
  228. //-- ToBI
  229. ACC_InitialVAux,
  230. ACC_FunctionSeq,
  231. ACC_ContentSeq,
  232. ACC_YNQuest,
  233. ACC_Period,
  234. ACC_Comma,
  235. //--Templates
  236. ACC_IntegerGroup,
  237. ACC_NumByNum,
  238. ACC_Frac_DEN, // "half", "tenths", etc.
  239. ACC_Phone_1stArea, // 1st digit in area code
  240. ACC_Phone_3rdArea, // 3rd digit in area code
  241. ACC_Phone_1st3,
  242. ACC_Phone_3rd3,
  243. ACC_Phone_1st4,
  244. ACC_Phone_3rd4,
  245. ACC_TimeOFDay_HR,
  246. ACC_TimeOFDay_1stMin,
  247. ACC_TimeOFDay_M,
  248. ACC_PhoneBnd_AREA,
  249. ACC_PhoneBnd_34,
  250. ACC_PhoneBnd_4,
  251. ACC_IDontKnow,
  252. };
  253. enum SILENCE_SOURCE // Type of the "Diagnostic" m_SilenceSource members; presumably names what inserted a silence - confirm
  254. {
  255. SIL_NoSource = 0,
  256. SIL_Term,
  257. SIL_QuoteStart,
  258. SIL_QuoteEnd,
  259. SIL_ParenStart,
  260. SIL_ParenEnd,
  261. SIL_Emph,
  262. SIL_SubBound, // Should never see this (gets removed)
  263. SIL_XML,
  264. //-- Prosody templates
  265. SIL_TimeOfDay_HR,
  266. SIL_TimeOfDay_AB,
  267. SIL_Phone_COUNTRY,
  268. SIL_Phone_AREA,
  269. SIL_Phone_ONE,
  270. SIL_Phone_DIGITS,
  271. SIL_Fractions_NUM,
  272. SIL_Currency_DOLLAR,
  273. SIL_Integer_Quant,
  274. SIL_Head,
  275. SIL_Tail,
  276. SIL_Ellipsis,
  277. SIL_ForcedTerm, // Should never get this!
  278. };
  279. static const short TOKEN_LEN_MAX = 20; // capacity of CFEToken::tokStr, in WCHARs
  280. class CFEToken // Per-token record: text, allophone string, source offsets, user controls and prosody tags
  281. {
  282. public:
  283. CFEToken();
  284. ~CFEToken();
  285. WCHAR tokStr[TOKEN_LEN_MAX]; // fixed-size buffer; NOTE(review): whether a terminator is included in the limit is not visible here
  286. long tokLen; // NOTE(review): presumably the number of characters used in tokStr - confirm
  287. PRONSRC m_PronType;
  288. long phon_Len; // NOTE(review): presumably the number of entries used in phon_Str - confirm
  289. ALLO_CODE phon_Str[SP_MAX_PRON_LENGTH]; // Allo string
  290. ENGPARTOFSPEECH POScode;
  291. PROSODY_POS m_posClass;
  292. ULONG srcPosition; // Source position for this token
  293. ULONG srcLen; // Source length for this token
  294. ULONG sentencePosition; // Source position for sentence
  295. ULONG sentenceLen; // Source length for sentence
  296. ULONG user_Volume; // 1 - 101
  297. long user_Rate; // -10 - 10
  298. long user_Pitch; // -10 - 10
  299. long user_Emph; // 0 or 5
  300. ULONG user_Break; // ms of silence
  301. CBookmarkList *pBMObj; // NOTE(review): ownership of the pointed-to list is not visible in this header
  302. TOBI_ACCENT m_Accent; // accent prosodic control
  303. long m_Accent_Prom; // prominence prosodic control
  304. TOBI_BOUNDARY m_Boundary; // boundary tone prosodic control
  305. long m_Boundary_Prom; // prominence prosodic control
  306. TUNE_TYPE m_TuneBoundaryType; // Current token is a boundary
  307. float m_TermSil; // Pad word with silence (in sec)
  308. float m_DurScale; // Duration ratio
  309. float m_ProsodyDurScale;
  310. float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
  311. float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
  312. //--- Diagnostic
  313. ACCENT_SOURCE m_AccentSource;
  314. BOUNDARY_SOURCE m_BoundarySource;
  315. SILENCE_SOURCE m_SilenceSource;
  316. };
  317. typedef CSPList<CFEToken*,CFEToken*> CFETokenList; // list of CFEToken pointers
  318. class CAlloCell // One allophone with its duration, pitch knots, control flags and per-cell copies of the user/prosody settings
  319. {
  320. public:
  321. CAlloCell();
  322. ~CAlloCell();
  323. //--------------------------------
  324. // Member Vars
  325. //--------------------------------
  326. ALLO_CODE m_allo; // allophone code of this cell
  327. short m_dur;
  328. float m_ftDuration;
  329. float m_UnitDur;
  330. short m_knots; // NOTE(review): presumably how many entries of m_ftTime/m_ftPitch are in use - confirm
  331. float m_ftTime[KNOTS_PER_PHON]; // parallel to m_ftPitch: same length, KNOTS_PER_PHON
  332. float m_ftPitch[KNOTS_PER_PHON];
  333. long m_ctrlFlags; // NOTE(review): presumably ALLOTAGS bits - confirm
  334. TOBI_ACCENT m_ToBI_Accent;
  335. long m_Accent_Prom; // prominence prosodic control
  336. TOBI_BOUNDARY m_ToBI_Boundary;
  337. long m_Boundary_Prom; // prominence prosodic control
  338. long m_PitchBufStart;
  339. long m_PitchBufEnd;
  340. ULONG m_user_Volume; // CFEToken::user_Volume is documented as 1 - 101; NOTE(review): presumably the same range
  341. long m_user_Rate; // CFEToken::user_Rate is documented as -10 - 10
  342. long m_user_Pitch; // CFEToken::user_Pitch is documented as -10 - 10
  343. long m_user_Emph; // CFEToken::user_Emph is documented as 0 or 5
  344. ULONG m_user_Break; // CFEToken::user_Break is documented as ms of silence
  345. ULONG m_Sil_Break;
  346. float m_Pitch_HI;
  347. float m_Pitch_LO;
  348. ULONG m_SrcPosition;
  349. ULONG m_SrcLen;
  350. ULONG m_SentencePosition; // Source position for sentence
  351. ULONG m_SentenceLen; // Source length for sentence
  352. TUNE_TYPE m_TuneBoundaryType;
  353. TUNE_TYPE m_NextTuneBoundaryType;
  354. CBookmarkList *m_pBMObj; // NOTE(review): ownership of the pointed-to list is not visible in this header
  355. float m_DurScale; // Duration ratio
  356. float m_ProsodyDurScale;
  357. float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
  358. float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
  359. //--- Diagnostic
  360. ACCENT_SOURCE m_AccentSource;
  361. BOUNDARY_SOURCE m_BoundarySource;
  362. SILENCE_SOURCE m_SilenceSource;
  363. char *m_pTextStr; // diagnostic text; NOTE(review): ownership/lifetime not visible in this header
  364. };
  365. class CAlloList // Holds CAlloCell pointers in m_AlloCellList plus one iteration cursor (m_ListPos)
  366. {
  367. public:
  368. CAlloList();
  369. ~CAlloList();
  370. //--------------------------------
  371. // Methods
  372. //--------------------------------
  373. CAlloCell *GetCell( long index );
  374. CAlloCell *GetTailCell();
  375. long GetCount();
  376. bool WordToAllo( CFEToken *pPrevTok, CFEToken *pTok, CFEToken *pNextTok, CAlloCell *pEndCell );
  377. CAlloCell *GetHeadCell() // Resets the cursor to the head; returns the first cell, or NULL when there is no head position
  378. {
  379. m_ListPos = m_AlloCellList.GetHeadPosition();
  380. return m_ListPos ? m_AlloCellList.GetNext( m_ListPos ) : NULL; // same NULL-position guard as GetNextCell(), so GetNext() never sees a NULL position
  381. }
  382. CAlloCell *GetNextCell() // Returns the cell at the cursor and advances it; NULL once the cursor has run off the end
  383. {
  384. if( m_ListPos )
  385. {
  386. return m_AlloCellList.GetNext( m_ListPos );
  387. }
  388. else
  389. {
  390. //-- We're at end of list!
  391. return NULL;
  392. }
  393. }
  394. //-- For debug only
  395. void OutAllos();
  396. private:
  397. //--------------------------------
  398. // Member Vars
  399. //--------------------------------
  400. long m_cAllos;
  401. SPLISTPOS m_ListPos; // cursor shared by GetHeadCell()/GetNextCell(); a second walk resets the first
  402. CSPList<CAlloCell*,CAlloCell*> m_AlloCellList;
  403. };
  404. //-----------------------------------
  405. // Speaking Rate parameters
  406. //-----------------------------------
  407. static const float MAX_SIL_DUR = 1.0f; // seconds
  408. static const float MIN_ALLO_DUR = 0.011f; // seconds
  409. static const float MAX_ALLO_DUR = 5.0f; // seconds
  410. class CDuration // Duration stage: AlloDuration() is the only public entry point; the helpers and state below are private
  411. {
  412. public:
  413. //--------------------------------
  414. // Methods
  415. //--------------------------------
  416. void AlloDuration( CAlloList *pAllos, float rateRatio );
  417. private:
  418. void Pause_Insertion( long userDuration, long silBreak );
  419. void PhraseFinal_Lengthen( long cellCount );
  420. long Emphatic_Lenghen( long lastStress ); // NOTE(review): "Lenghen" is a misspelling of "Lengthen"; left as-is because the name is part of the declared interface
  421. //--------------------------------
  422. // Member vars
  423. //--------------------------------
  424. float m_DurHold;
  425. float m_TotalDurScale;
  426. float m_durationPad;
  427. ALLO_CODE m_cur_Phon; // m_prev_/m_cur_/m_next_/m_next2_: NOTE(review): presumably a sliding window over neighbouring cells - confirm
  428. long m_cur_PhonCtrl;
  429. long m_cur_PhonFlags;
  430. long m_cur_SyllableType;
  431. short m_cur_VowelFlag;
  432. long m_cur_Stress;
  433. ALLO_CODE m_prev_Phon;
  434. long m_prev_PhonCtrl;
  435. long m_prev_PhonFlags;
  436. ALLO_CODE m_next_Phon;
  437. long m_next_PhonCtrl;
  438. long m_next_PhonFlags;
  439. ALLO_CODE m_next2_Phon;
  440. long m_next2_PhonCtrl;
  441. long m_next2_PhonFlags;
  442. TUNE_TYPE m_NextBoundary, m_CurBoundary;
  443. };
  444. typedef struct // Allophone code paired with its control flags; CSyllableTagger::m_pAllos points at these
  445. {
  446. ALLO_CODE allo;
  447. long ctrlFlags; // NOTE(review): presumably ALLOTAGS bits - confirm
  448. }ALLO_ARRAY;
  449. class CSyllableTagger // Syllable-tagging stage: TagSyllables() is the only public entry point
  450. {
  451. public:
  452. //--------------------------------
  453. // Methods
  454. //--------------------------------
  455. void TagSyllables( CAlloList *pAllos );
  456. private:
  457. void MarkSyllableOrder( long scanIndex);
  458. void MarkSyllableBoundry( long scanIndex); // NOTE(review): "Boundry" is a misspelling of "Boundary"; left as-is because the name is part of the declared interface
  459. void MarkSyllableStart();
  460. short Find_Next_Word_Bound( short index );
  461. short If_Consonant_Cluster( ALLO_CODE Consonant_1st, ALLO_CODE Consonant_2nd);
  462. void ListToArray( CAlloList *pAllos ); // NOTE(review): presumably copies the list into m_pAllos - confirm in the implementation
  463. void ArrayToList( CAlloList *pAllos ); // NOTE(review): presumably writes m_pAllos back into the list - confirm in the implementation
  464. //--------------------------------
  465. // Member vars
  466. //--------------------------------
  467. ALLO_ARRAY *m_pAllos; // NOTE(review): allocation and release are not visible in this header
  468. long m_numOfCells;
  469. };
  470. enum { TARG_PER_ALLO_MAX = 2 }; // One for accent and one for boundary
  471. enum TUNE_STYLE // Three tune shapes, valued 0, 1 and 2
  472. {
  473. FLAT_TUNE = 0, // flat
  474. DESCEND_TUNE, // go down
  475. ASCEND_TUNE, // go up
  476. };
  477. //------------------
  478. // Global Constants
  479. //------------------
  480. static const float PITCH_BUF_RES = (float)0.010; // NOTE(review): presumably the pitch-buffer time step in seconds - confirm
  481. static const float K_HSTAR_OFFSET = (float)0.5; // NOTE(review): units/usage not visible in this header
  482. static const float K_HDOWNSTEP_COEFF = (float)0.5; // NOTE(review): units/usage not visible in this header
  483. //------------------
  484. // Macros
  485. //------------------
  486. #define CeilVal(x) ((m_CeilSlope * x) + m_CeilStart)
  487. #define FloorVal(x) ((m_FloorSlope * x) + m_FloorStart)
  488. #define RefVal(x) ((m_RefSlope * x) + m_RefStart)
  489. class CPitchProsody // Pitch stage: AlloPitch() is the only public entry point; the helpers and state below are private
  490. {
  491. public:
  492. //--------------------------------
  493. // Methods
  494. //--------------------------------
  495. void AlloPitch( CAlloList *pAllos, float baseLine, float pitchRange );
  496. private:
  497. float DoPitchControl( long pitchControl, float basePitch );
  498. void PitchTrack();
  499. void SetDefaultPitch();
  500. void GetKnots();
  501. void NewTarget( long index, float value );
  502. //--------------------------------
  503. // Member vars
  504. //--------------------------------
  505. CAlloList *m_pAllos; // NOTE(review): presumably the list passed to AlloPitch() - confirm
  506. long m_numOfCells;
  507. float m_TotalDur; // phrase duration in seconds
  508. TUNE_STYLE m_Tune_Style;
  509. float *m_pContBuf; // NOTE(review): allocation and release are not visible in this header
  510. float m_OffsTime;
  511. TOBI_ACCENT m_CurAccent;
  512. //------------------------
  513. // Diagnostic
  514. //------------------------
  515. ACCENT_SOURCE m_CurAccentSource;
  516. BOUNDARY_SOURCE m_CurBoundarySource;
  517. char *m_pCurTextStr;
  518. };
  519. #endif //--- This must be the last line in the file