Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

630 lines
15 KiB

  1. /******************************************************************************
  2. * AlloOps.h *
  3. *-----------*
  4. * This is the header file for the following classes:
  5. * CAlloCell
  6. * CAlloList
  7. * CDuration
  8. * CSyllableTagger
  9. * CToneTargets
  10. * CPitchProsody
  11. *------------------------------------------------------------------------------
  12. * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99
  13. * All Rights Reserved
  14. *
  15. *********************************************************************** MC ****/
  16. #ifndef AlloOps_H
  17. #define AlloOps_H
  18. #include "stdafx.h"
  19. #include "commonlx.h"
  20. #include "ms_entropicengine.h"
  21. #include "FeedChain.h"
  22. #include <SPCollec.h>
  23. #include "SpTtsEngDebug.h"
  24. //***************************
  25. // Allophones
  26. //***************************
  //-- Allophone and marker codes.
  //-- Enumerators are numbered implicitly starting at 0, so their order is
  //-- significant; NUMBER_OF_ALLO below is derived from the last enumerator.
  typedef enum
  {
      //-- 0 - 9
      _IY_, _IH_, _EH_, _AE_, _AA_,
      _AH_, _AO_, _UH_, _AX_, _ER_,
      //-- 10 - 15
      _EY_, _AY_, _OY_, _AW_, _OW_, _UW_,
      //-- 16 - 19
      _IX_, _SIL_, _w_, _y_,
      //-- 20 - 29
      _r_, _l_, _h_, _m_, _n_,
      _NG_, _f_, _v_, _TH_, _DH_,
      //-- 30 - 39
      _s_, _z_, _SH_, _ZH_, _p_,
      _b_, _t_, _d_, _k_, _g_,
      //-- 40 - 42
      _CH_, _JH_, _DX_,
      //-- 43 - 46
      _STRESS1_,
      _STRESS2_,
      _EMPHSTRESS_,
      _SYLLABLE_
  } ALLO_CODE;

  //-- One more than the highest ALLO_CODE value
  static const long NUMBER_OF_ALLO = _SYLLABLE_ + 1;
  41. //-----------------------------------
  42. // For 2-word allo conversion
  43. //-----------------------------------
  static const short NO_IPA = 0;

  //-- Bit layout of the allo tag word (bit 0 is rightmost):
  //
  //      XXXX XXXX XXXX XXXX XXXX bLis ssoo ttBB
  //
  //      X = unused
  //      B = boundary type           (bits 0-1)
  //      t = syllable type           (bits 2-3)
  //      o = vowel order             (bits 4-5)
  //      s = stress type             (bits 6-8)
  //      i = word initial consonant  (bit 9)
  //      L = syLlable start          (bit 10)
  //      b = break                   (bit 11)
  enum ALLOTAGS
  {
      //-- Boundary type
      WORD_START                      = 0x0001,
      TERM_BOUND                      = 0x0002,
      BOUNDARY_TYPE_FIELD             = WORD_START | TERM_BOUND,          // mask

      //-- Syllable type
      WORD_END_SYLL                   = 0x0004,
      TERM_END_SYLL                   = 0x0008,
      SYLLABLE_TYPE_FIELD             = WORD_END_SYLL | TERM_END_SYLL,    // mask

      //-- Syllable order (two-bit field, only meaningful in a multi-syllable word)
      FIRST_SYLLABLE_IN_WORD          = 0x0010,
      MID_SYLLABLE_IN_WORD            = 0x0020,
      LAST_SYLLABLE_IN_WORD           = 0x0030,
      MORE_THAN_ONE_SYLLABLE_IN_WORD  = LAST_SYLLABLE_IN_WORD,            // either bit is set
      ONE_OR_NO_SYLLABLE_IN_WORD      = 0x0000,                           // neither bit is set
      SYLLABLE_ORDER_FIELD            = LAST_SYLLABLE_IN_WORD,            // mask

      //-- Stress
      PRIMARY_STRESS                  = 0x0040,
      SECONDARY_STRESS                = 0x0080,
      EMPHATIC_STRESS                 = 0x0100,
      IS_STRESSED                     = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS,
      PRIM_OR_EMPH_STRESS             = PRIMARY_STRESS | EMPHATIC_STRESS,
      STRESS_FIELD                    = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, // mask

      //-- Word initial consonant: up to 1st vowel in word
      WORD_INITIAL_CONSONANT          = 0x0200,
      STRESSED_INITIAL_CONS           = IS_STRESSED | WORD_INITIAL_CONSONANT,

      SYLLABLE_START                  = 0x0400,
      SIL_BREAK                       = 0x0800
  };
  79. //***************************
  80. // AlloFlags
  81. //***************************
  //-- Per-allophone feature flags: one bit per flag, plus two composite masks.
  enum ALLOFLAGS
  {
      KVOWELF         = 0x00000001,   // bit 0
      KCONSONANTF     = 0x00000002,   // bit 1
      KVOICEDF        = 0x00000004,   // bit 2
      KVOWEL1F        = 0x00000008,   // bit 3
      KSONORANTF      = 0x00000010,   // bit 4
      KSONORANT1F     = 0x00000020,   // bit 5
      KNASALF         = 0x00000040,   // bit 6
      KLIQGLIDEF      = 0x00000080,   // bit 7
      KSONORCONSONF   = 0x00000100,   // bit 8
      KPLOSIVEF       = 0x00000200,   // bit 9
      KPLOSFRICF      = 0x00000400,   // bit 10
      KOBSTF          = 0x00000800,   // bit 11
      KSTOPF          = 0x00001000,   // bit 12
      KALVEOLARF      = 0x00002000,   // bit 13
      KVELAR          = 0x00004000,   // bit 14
      KLABIALF        = 0x00008000,   // bit 15
      KDENTALF        = 0x00010000,   // bit 16
      KPALATALF       = 0x00020000,   // bit 17
      KYGLIDESTARTF   = 0x00040000,   // bit 18
      KYGLIDEENDF     = 0x00080000,   // bit 19
      KGSTOPF         = 0x00100000,   // bit 20
      KFRONTF         = 0x00200000,   // bit 21
      KDIPHTHONGF     = 0x00400000,   // bit 22
      KHASRELEASEF    = 0x00800000,   // bit 23
      KAFFRICATEF     = 0x01000000,   // bit 24
      KLIQGLIDE2F     = 0x02000000,   // bit 25
      KVOCLIQ         = 0x04000000,   // bit 26
      KFRIC           = 0x08000000,   // bit 27

      //-- Union of the labial, dental, palatal, alveolar, velar and gstop bits
      KFLAGMASK1      = KLABIALF | KDENTALF | KPALATALF | KALVEOLARF | KVELAR | KGSTOPF,
      //-- Every bit below KALVEOLARF (bits 0 - 12)
      KFLAGMASK2      = KALVEOLARF - 1
  };
  //-- First value used by the boundary tones; K_NOBND stays at 0.
  #define BOUNDARY_BASE 1000

  //-- Boundary tone codes, numbered consecutively from BOUNDARY_BASE.
  enum TOBI_BOUNDARY
  {
      K_NOBND         = 0,
      K_LMINUS        = BOUNDARY_BASE,        // fall
      K_HMINUS        = BOUNDARY_BASE + 1,    // none
      K_LMINUSLPERC   = BOUNDARY_BASE + 2,
      K_LMINUSHPERC   = BOUNDARY_BASE + 3,
      K_HMINUSHPERC   = BOUNDARY_BASE + 4,
      K_HMINUSLPERC   = BOUNDARY_BASE + 5
  };
  //-- Tune boundary kinds. Values are assigned implicitly from 0, so the
  //-- order of the enumerators is significant.
  enum TUNE_TYPE
  {
      NULL_BOUNDARY = 0,      //  0: no boundary NOTE: always put this at the beginning
      PHRASE_BOUNDARY,        //  1: comma
      EXCLAM_BOUNDARY,        //  2: exclamatory utterance terminator
      YN_QUEST_BOUNDARY,      //  3: yes-no question terminator
      WH_QUEST_BOUNDARY,      //  4: wh-question terminator
      DECLAR_BOUNDARY,        //  5: declarative terminator
      PAREN_L_BOUNDARY,       //  6: left paren
      PAREN_R_BOUNDARY,       //  7: right paren
      QUOTE_L_BOUNDARY,       //  8: left quote
      QUOTE_R_BOUNDARY,       //  9: right quote
      PHONE_BOUNDARY,         // 10
      TOD_BOUNDARY,           // 11
      ELLIPSIS_BOUNDARY,      // 12

      SUB_BOUNDARY_1,         // 13: NOTE: always put these at the end
      SUB_BOUNDARY_2,         // 14
      SUB_BOUNDARY_3,         // 15
      SUB_BOUNDARY_4,         // 16
      SUB_BOUNDARY_5,         // 17
      SUB_BOUNDARY_6,         // 18

      NUMBER_BOUNDARY,        // 19
      TAIL_BOUNDARY           // 20
  };
  150. //***************************
  151. // ToBI Constants
  152. //***************************
  153. // !H is removed from consideration in the first pass processing
  154. // !H can possibly be recovered from analysis of the labeling and
  155. // contour at later stages (tilt, prominence, pitch range, downstep)
  //-- First value used by the accent codes; K_NOACC stays at 0.
  #define ACCENT_BASE 1

  //-- Accent codes, numbered consecutively from ACCENT_BASE.
  enum TOBI_ACCENT
  {
      K_NOACC         = 0,
      K_HSTAR         = ACCENT_BASE,      // peak rise / fall
      K_LSTAR         = ACCENT_BASE + 1,  // acc syll nucleus valley early fall
      K_LSTARH        = ACCENT_BASE + 2,  // late rise
      K_RSTAR         = ACCENT_BASE + 3,
      K_LHSTAR        = ACCENT_BASE + 4,  // early rise
      K_DHSTAR        = ACCENT_BASE + 5,
      K_HSTARLSTAR    = ACCENT_BASE + 6
  };
  //-- Diagnostic tag recording where a boundary came from.
  //-- Values are assigned implicitly from 0; keep the order stable.
  enum BOUNDARY_SOURCE
  {
      BND_NoSource = 0,

      //-- Phrase boundary rules (1 - 13)
      BND_PhraseRule1,
      BND_PhraseRule2,
      BND_PhraseRule3,
      BND_PhraseRule4,
      BND_PhraseRule5,
      BND_PhraseRule6,
      BND_PhraseRule7,
      BND_PhraseRule8,
      BND_PhraseRule9,
      BND_PhraseRule10,
      BND_PhraseRule11,
      BND_PhraseRule12,
      BND_PhraseRule13,

      //-- ToBI (14 - 17)
      BND_YNQuest,
      BND_WHQuest,
      BND_Period,
      BND_Comma,

      //-- Templates (18 - 30)
      BND_NumberTemplate,     // Should never get this!
      BND_IntegerQuant,
      BND_Currency_DOLLAR,
      BND_Frac_Num,
      BND_Phone_COUNTRY,
      BND_Phone_AREA,
      BND_Phone_ONE,
      BND_Phone_DIGITS,
      BND_TimeOFDay_HR,
      BND_TimeOFDay_AB,
      BND_Ellipsis,
      BND_ForcedTerm,         // Should never get this!
      BND_IDontKnow
  };
  //-- Diagnostic tag recording where an accent came from.
  //-- Values are assigned implicitly from 0; keep the order stable.
  enum ACCENT_SOURCE
  {
      ACC_NoSource = 0,

      //-- Phrase rules (1 - 13)
      ACC_PhraseRule1,
      ACC_PhraseRule2,
      ACC_PhraseRule3,
      ACC_PhraseRule4,
      ACC_PhraseRule5,
      ACC_PhraseRule6,
      ACC_PhraseRule7,
      ACC_PhraseRule8,
      ACC_PhraseRule9,
      ACC_PhraseRule10,
      ACC_PhraseRule11,
      ACC_PhraseRule12,
      ACC_PhraseRule13,

      //-- ToBI (14 - 19)
      ACC_InitialVAux,
      ACC_FunctionSeq,
      ACC_ContentSeq,
      ACC_YNQuest,
      ACC_Period,
      ACC_Comma,

      //-- Templates (20 - 35)
      ACC_IntegerGroup,
      ACC_NumByNum,
      ACC_Frac_DEN,           // "half", "tenths", etc.
      ACC_Phone_1stArea,      // 1st digit in area code
      ACC_Phone_3rdArea,      // 3rd digit in area code
      ACC_Phone_1st3,
      ACC_Phone_3rd3,
      ACC_Phone_1st4,
      ACC_Phone_3rd4,
      ACC_TimeOFDay_HR,
      ACC_TimeOFDay_1stMin,
      ACC_TimeOFDay_M,
      ACC_PhoneBnd_AREA,
      ACC_PhoneBnd_34,
      ACC_PhoneBnd_4,
      ACC_IDontKnow
  };
  //-- Diagnostic tag recording where a silence came from.
  //-- Values are assigned implicitly from 0; keep the order stable.
  enum SILENCE_SOURCE
  {
      SIL_NoSource = 0,
      SIL_Term,               //  1
      SIL_QuoteStart,         //  2
      SIL_QuoteEnd,           //  3
      SIL_ParenStart,         //  4
      SIL_ParenEnd,           //  5
      SIL_Emph,               //  6
      SIL_SubBound,           //  7: Should never see this (gets removed)
      SIL_XML,                //  8

      //-- Prosody templates
      SIL_TimeOfDay_HR,       //  9
      SIL_TimeOfDay_AB,       // 10
      SIL_Phone_COUNTRY,      // 11
      SIL_Phone_AREA,         // 12
      SIL_Phone_ONE,          // 13
      SIL_Phone_DIGITS,       // 14
      SIL_Fractions_NUM,      // 15
      SIL_Currency_DOLLAR,    // 16
      SIL_Integer_Quant,      // 17
      SIL_Head,               // 18
      SIL_Tail,               // 19
      SIL_Ellipsis,           // 20
      SIL_ForcedTerm          // 21: Should never get this!
  };
  273. static const short TOKEN_LEN_MAX = 20;
  274. class CFEToken
  275. {
  276. public:
  277. CFEToken();
  278. ~CFEToken();
  279. WCHAR tokStr[TOKEN_LEN_MAX];
  280. long tokLen;
  281. PRONSRC m_PronType;
  282. long phon_Len;
  283. ALLO_CODE phon_Str[SP_MAX_PRON_LENGTH]; // Allo string
  284. ENGPARTOFSPEECH POScode;
  285. PROSODY_POS m_posClass;
  286. ULONG srcPosition; // Source position for this token
  287. ULONG srcLen; // Source length for this token
  288. ULONG sentencePosition; // Source position for sentence
  289. ULONG sentenceLen; // Source length for sentence
  290. ULONG user_Volume; // 1 - 101
  291. long user_Rate; // -10 - 10
  292. long user_Pitch; // -10 - 10
  293. long user_Emph; // 0 or 5
  294. ULONG user_Break; // ms of silence
  295. CBookmarkList *pBMObj;
  296. TOBI_ACCENT m_Accent; // accent prosodic control
  297. long m_Accent_Prom; // prominence prosodic control
  298. TOBI_BOUNDARY m_Boundary; // boundary tone prosodic control
  299. long m_Boundary_Prom; // prominence prosodic control
  300. TUNE_TYPE m_TuneBoundaryType; // Current token is a boundary
  301. float m_TermSil; // Pad word with silence (in sec)
  302. float m_DurScale; // Duration ratio
  303. float m_ProsodyDurScale;
  304. float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
  305. float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
  306. //--- Diagnostic
  307. ACCENT_SOURCE m_AccentSource;
  308. BOUNDARY_SOURCE m_BoundarySource;
  309. SILENCE_SOURCE m_SilenceSource;
  310. };
  311. typedef CSPList<CFEToken*,CFEToken*> CFETokenList;
  312. class CAlloCell
  313. {
  314. public:
  315. CAlloCell();
  316. ~CAlloCell();
  317. CAlloCell* operator =( CAlloCell *pNewCell );
  318. //--------------------------------
  319. // Member Vars
  320. //--------------------------------
  321. ALLO_CODE m_allo;
  322. short m_dur;
  323. float m_ftDuration;
  324. float m_UnitDur;
  325. short m_knots;
  326. float m_ftTime[KNOTS_PER_PHON];
  327. float m_ftPitch[KNOTS_PER_PHON];
  328. long m_ctrlFlags;
  329. TOBI_ACCENT m_ToBI_Accent;
  330. long m_Accent_Prom; // prominence prosodic control
  331. TOBI_BOUNDARY m_ToBI_Boundary;
  332. long m_Boundary_Prom; // prominence prosodic control
  333. long m_PitchBufStart;
  334. long m_PitchBufEnd;
  335. ULONG m_user_Volume;
  336. long m_user_Rate;
  337. long m_user_Pitch;
  338. long m_user_Emph;
  339. ULONG m_user_Break;
  340. ULONG m_Sil_Break;
  341. float m_Pitch_HI;
  342. float m_Pitch_LO;
  343. ULONG m_SrcPosition;
  344. ULONG m_SrcLen;
  345. ULONG m_SentencePosition; // Source position for sentence
  346. ULONG m_SentenceLen; // Source length for sentence
  347. TUNE_TYPE m_TuneBoundaryType;
  348. TUNE_TYPE m_NextTuneBoundaryType;
  349. CBookmarkList *m_pBMObj;
  350. float m_DurScale; // Duration ratio
  351. float m_ProsodyDurScale;
  352. float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
  353. float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
  354. //--- Diagnostic
  355. ACCENT_SOURCE m_AccentSource;
  356. BOUNDARY_SOURCE m_BoundarySource;
  357. SILENCE_SOURCE m_SilenceSource;
  358. char *m_pTextStr;
  359. };
  360. class CAlloList
  361. {
  362. public:
  363. CAlloList();
  364. ~CAlloList();
  365. //--------------------------------
  366. // Methods
  367. //--------------------------------
  368. CAlloCell *GetCell( long index );
  369. CAlloCell *GetTailCell();
  370. long GetCount();
  371. bool WordToAllo( CFEToken *pPrevTok, CFEToken *pTok, CFEToken *pNextTok, CAlloCell *pEndCell );
  372. CAlloCell *GetHeadCell()
  373. {
  374. m_ListPos = m_AlloCellList.GetHeadPosition();
  375. return m_AlloCellList.GetNext( m_ListPos );
  376. }
  377. CAlloCell *GetNextCell()
  378. {
  379. if( m_ListPos )
  380. {
  381. return m_AlloCellList.GetNext( m_ListPos );
  382. }
  383. else
  384. {
  385. //-- We're at end of list!
  386. return NULL;
  387. }
  388. }
  389. //-- For debug only
  390. void OutAllos();
  391. private:
  392. //--------------------------------
  393. // Member Vars
  394. //--------------------------------
  395. long m_cAllos;
  396. SPLISTPOS m_ListPos;
  397. CSPList<CAlloCell*,CAlloCell*> m_AlloCellList;
  398. };
  399. //-----------------------------------
  400. // Speaking Rate parameters
  401. //-----------------------------------
  402. static const float MAX_SIL_DUR = 1.0f; // seconds
  403. static const float MIN_ALLO_DUR = 0.011f; // seconds
  404. static const float MAX_ALLO_DUR = 5.0f; // seconds
  405. class CDuration
  406. {
  407. public:
  408. //--------------------------------
  409. // Methods
  410. //--------------------------------
  411. void AlloDuration( CAlloList *pAllos, float rateRatio );
  412. private:
  413. void Pause_Insertion( long userDuration, long silBreak );
  414. void PhraseFinal_Lengthen( long cellCount );
  415. long Emphatic_Lenghen( long lastStress );
  416. //--------------------------------
  417. // Member vars
  418. //--------------------------------
  419. float m_DurHold;
  420. float m_TotalDurScale;
  421. float m_durationPad;
  422. ALLO_CODE m_cur_Phon;
  423. long m_cur_PhonCtrl;
  424. long m_cur_PhonFlags;
  425. long m_cur_SyllableType;
  426. short m_cur_VowelFlag;
  427. long m_cur_Stress;
  428. ALLO_CODE m_prev_Phon;
  429. long m_prev_PhonCtrl;
  430. long m_prev_PhonFlags;
  431. ALLO_CODE m_next_Phon;
  432. long m_next_PhonCtrl;
  433. long m_next_PhonFlags;
  434. ALLO_CODE m_next2_Phon;
  435. long m_next2_PhonCtrl;
  436. long m_next2_PhonFlags;
  437. TUNE_TYPE m_NextBoundary, m_CurBoundary;
  438. };
  439. typedef struct
  440. {
  441. ALLO_CODE allo;
  442. long ctrlFlags;
  443. }ALLO_ARRAY;
  444. class CSyllableTagger
  445. {
  446. public:
  447. //--------------------------------
  448. // Methods
  449. //--------------------------------
  450. void TagSyllables( CAlloList *pAllos );
  451. private:
  452. void MarkSyllableOrder( long scanIndex);
  453. void MarkSyllableBoundry( long scanIndex);
  454. void MarkSyllableStart();
  455. short Find_Next_Word_Bound( short index );
  456. short If_Consonant_Cluster( ALLO_CODE Consonant_1st, ALLO_CODE Consonant_2nd);
  457. void ListToArray( CAlloList *pAllos );
  458. void ArrayToList( CAlloList *pAllos );
  459. //--------------------------------
  460. // Member vars
  461. //--------------------------------
  462. ALLO_ARRAY *m_pAllos;
  463. long m_numOfCells;
  464. };
  //-- One for accent and one for boundary
  enum
  {
      TARG_PER_ALLO_MAX = 2
  };

  enum TUNE_STYLE
  {
      FLAT_TUNE       = 0,    // flat
      DESCEND_TUNE    = 1,    // go down
      ASCEND_TUNE     = 2     // go up
  };
  472. //------------------
  473. // Global Constants
  474. //------------------
  //-- Single-precision literals instead of casts from double; the values are the same.
  static const float PITCH_BUF_RES     = 0.005f;
  static const float K_HSTAR_OFFSET    = 0.5f;
  static const float K_HDOWNSTEP_COEFF = 0.5f;
  478. //------------------
  479. // Macros
  480. //------------------
  //-- Linear evaluators: slope * x + start.
  //-- Each macro refers to a pair of identifiers (m_CeilSlope / m_CeilStart,
  //-- m_FloorSlope / m_FloorStart, m_RefSlope / m_RefStart) that must be in
  //-- scope wherever the macro is expanded; they are not declared in this header.
  //-- The argument is parenthesized so that an expression such as
  //-- CeilVal(a + b) is multiplied as a whole.
  #define CeilVal(x) ((m_CeilSlope * (x)) + m_CeilStart)
  #define FloorVal(x) ((m_FloorSlope * (x)) + m_FloorStart)
  #define RefVal(x) ((m_RefSlope * (x)) + m_RefStart)
  484. class CPitchProsody
  485. {
  486. public:
  487. //--------------------------------
  488. // Methods
  489. //--------------------------------
  490. CPitchProsody() { m_pContBuf = NULL; };
  491. ~CPitchProsody() { if ( m_pContBuf ) delete m_pContBuf; };
  492. void AlloPitch( CAlloList *pAllos, int baseLine, int pitchRange );
  493. void GetContour( float**, ULONG* );
  494. private:
  495. float DoPitchControl( long pitchControl, float basePitch );
  496. void PitchTrack();
  497. void SetDefaultPitch();
  498. void GetKnots();
  499. void NewTarget( long index, float value );
  500. //--------------------------------
  501. // Member vars
  502. //--------------------------------
  503. CAlloList *m_pAllos;
  504. long m_numOfCells;
  505. float m_TotalDur; // phrase duration in seconds
  506. TUNE_STYLE m_Tune_Style;
  507. float *m_pContBuf;
  508. ULONG m_ulNumPoints;
  509. float m_OffsTime;
  510. TOBI_ACCENT m_CurAccent;
  511. //------------------------
  512. // Diagnostic
  513. //------------------------
  514. ACCENT_SOURCE m_CurAccentSource;
  515. BOUNDARY_SOURCE m_CurBoundarySource;
  516. char *m_pCurTextStr;
  517. };
  518. #endif //--- This must be the last line in the file