mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
630 lines
15 KiB
630 lines
15 KiB
/******************************************************************************
|
|
* AlloOps.h *
|
|
*-----------*
|
|
* This is the header file for the following clsses:
|
|
* CAlloCell
|
|
* CAlloList
|
|
* CDuration
|
|
* CSyllableTagger
|
|
* CToneTargets
|
|
* CPitchProsody
|
|
|
|
*------------------------------------------------------------------------------
|
|
* Copyright (C) 1999 Microsoft Corporation Date: 03/01/99
|
|
* All Rights Reserved
|
|
*
|
|
*********************************************************************** MC ****/
|
|
|
|
#ifndef AlloOps_H
|
|
#define AlloOps_H
|
|
|
|
#include "stdafx.h"
|
|
#include "commonlx.h"
|
|
|
|
#include "ms_entropicengine.h"
|
|
#include "FeedChain.h"
|
|
#include <SPCollec.h>
|
|
#include "SpTtsEngDebug.h"
|
|
|
|
//***************************
|
|
// Allophones
|
|
//***************************
|
|
typedef enum
|
|
{
|
|
_IY_, _IH_, _EH_, _AE_, _AA_, _AH_, _AO_, _UH_, _AX_, _ER_,
|
|
_EY_, _AY_, _OY_, _AW_, _OW_, _UW_,
|
|
_IX_, _SIL_, _w_, _y_,
|
|
_r_, _l_, _h_, _m_, _n_, _NG_, _f_, _v_, _TH_, _DH_,
|
|
_s_, _z_, _SH_, _ZH_, _p_, _b_, _t_, _d_, _k_, _g_,
|
|
_CH_, _JH_, _DX_,
|
|
_STRESS1_,
|
|
_STRESS2_,
|
|
_EMPHSTRESS_,
|
|
_SYLLABLE_,
|
|
} ALLO_CODE;
|
|
|
|
static const long NUMBER_OF_ALLO = (_SYLLABLE_ + 1);
|
|
|
|
//-----------------------------------
|
|
// For 2-word allo conversion
|
|
//-----------------------------------
|
|
static const short NO_IPA = 0;
|
|
|
|
|
|
|
|
// XXXX XXXX XXXX XXXX XXXX bLis ssoo ttBB
|
|
|
|
// X = unused
|
|
// B = boundary type
|
|
// t = syllable type
|
|
// o = vowel order
|
|
// s = stress type
|
|
// i = word initial consonant
|
|
// L = syLlable start
|
|
// b = break
|
|
|
|
enum ALLOTAGS
|
|
{
|
|
WORD_START = (1 << 0),
|
|
TERM_BOUND = (1 << 1),
|
|
BOUNDARY_TYPE_FIELD = WORD_START | TERM_BOUND, // mask
|
|
|
|
WORD_END_SYLL = (1 << 2),
|
|
TERM_END_SYLL = (1 << 3),
|
|
SYLLABLE_TYPE_FIELD = WORD_END_SYLL | TERM_END_SYLL, // mask
|
|
|
|
FIRST_SYLLABLE_IN_WORD = (1 << 4), // in multi-syllable word
|
|
MID_SYLLABLE_IN_WORD = (2 << 4),
|
|
LAST_SYLLABLE_IN_WORD = (3 << 4),
|
|
MORE_THAN_ONE_SYLLABLE_IN_WORD = LAST_SYLLABLE_IN_WORD, // either bit is set
|
|
ONE_OR_NO_SYLLABLE_IN_WORD = 0x0000, // niether bits are set
|
|
SYLLABLE_ORDER_FIELD = LAST_SYLLABLE_IN_WORD, // mask
|
|
|
|
PRIMARY_STRESS = (1 << 6),
|
|
SECONDARY_STRESS = (1 << 7),
|
|
EMPHATIC_STRESS = (1 << 8),
|
|
IS_STRESSED = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS,
|
|
PRIM_OR_EMPH_STRESS = PRIMARY_STRESS | EMPHATIC_STRESS,
|
|
STRESS_FIELD = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, // mask
|
|
|
|
WORD_INITIAL_CONSONANT = (1 << 9), // up to 1st vowel in word
|
|
STRESSED_INITIAL_CONS = (IS_STRESSED + WORD_INITIAL_CONSONANT),
|
|
SYLLABLE_START = (1 << 10),
|
|
|
|
SIL_BREAK = (1 << 11),
|
|
};
|
|
|
|
|
|
//***************************
|
|
// AlloFlags
|
|
//***************************
|
|
enum ALLOFLAGS
|
|
{
|
|
KVOWELF = (1<<0),
|
|
KCONSONANTF = (1<<1),
|
|
KVOICEDF = (1<<2),
|
|
KVOWEL1F = (1<<3),
|
|
KSONORANTF = (1<<4),
|
|
KSONORANT1F = (1<<5),
|
|
KNASALF = (1<<6),
|
|
KLIQGLIDEF = (1<<7),
|
|
KSONORCONSONF = (1<<8),
|
|
KPLOSIVEF = (1<<9),
|
|
KPLOSFRICF = (1<<10),
|
|
KOBSTF = (1<<11),
|
|
KSTOPF = (1<<12),
|
|
KALVEOLARF = (1<<13),
|
|
KVELAR = (1<<14),
|
|
KLABIALF = (1<<15),
|
|
KDENTALF = (1<<16),
|
|
KPALATALF = (1<<17),
|
|
KYGLIDESTARTF = (1<<18),
|
|
KYGLIDEENDF = (1<<19),
|
|
KGSTOPF = (1<<20),
|
|
KFRONTF = (1<<21),
|
|
KDIPHTHONGF = (1<<22),
|
|
KHASRELEASEF = (1<<23),
|
|
KAFFRICATEF = (1<<24),
|
|
KLIQGLIDE2F = (1<<25),
|
|
KVOCLIQ = (1<<26),
|
|
KFRIC = (1<<27),
|
|
|
|
KFLAGMASK1 = (KLABIALF+KDENTALF+KPALATALF+KALVEOLARF+KVELAR+KGSTOPF),
|
|
KFLAGMASK2 = (KALVEOLARF-1),
|
|
};
|
|
|
|
|
|
#define BOUNDARY_BASE 1000
|
|
enum TOBI_BOUNDARY
|
|
{
|
|
K_NOBND = 0,
|
|
K_LMINUS = BOUNDARY_BASE, // fall
|
|
K_HMINUS, // none
|
|
K_LMINUSLPERC,
|
|
K_LMINUSHPERC,
|
|
K_HMINUSHPERC,
|
|
K_HMINUSLPERC,
|
|
};
|
|
|
|
|
|
|
|
|
|
enum TUNE_TYPE
|
|
{
|
|
NULL_BOUNDARY = 0, // no boundary NOTE: always put this at the beginning
|
|
PHRASE_BOUNDARY, // comma
|
|
EXCLAM_BOUNDARY, // exclamatory utterance terminator
|
|
YN_QUEST_BOUNDARY, // yes-no question terminator
|
|
WH_QUEST_BOUNDARY, // yes-no question terminator
|
|
DECLAR_BOUNDARY, // declarative terminator
|
|
PAREN_L_BOUNDARY, // left paren
|
|
PAREN_R_BOUNDARY, // right paren
|
|
QUOTE_L_BOUNDARY, // left quote
|
|
QUOTE_R_BOUNDARY, // right quote
|
|
PHONE_BOUNDARY,
|
|
TOD_BOUNDARY,
|
|
ELLIPSIS_BOUNDARY,
|
|
|
|
SUB_BOUNDARY_1, // NOTE: always put these at the end
|
|
SUB_BOUNDARY_2,
|
|
SUB_BOUNDARY_3,
|
|
SUB_BOUNDARY_4,
|
|
SUB_BOUNDARY_5,
|
|
SUB_BOUNDARY_6,
|
|
NUMBER_BOUNDARY,
|
|
|
|
TAIL_BOUNDARY,
|
|
};
|
|
|
|
|
|
//***************************
|
|
// ToBI Constants
|
|
//***************************
|
|
// !H is removed from consideration in the first pass processing
|
|
// !H can possibly be recovered from analysis of the labeling and
|
|
// contour at later stages (tilt, prominence, pitch range, downstep)
|
|
#define ACCENT_BASE 1
|
|
enum TOBI_ACCENT
|
|
{
|
|
K_NOACC = 0,
|
|
K_HSTAR = ACCENT_BASE, // peak rise / fall
|
|
K_LSTAR, // acc syll nucleus valley early fall
|
|
K_LSTARH, // late rise
|
|
K_RSTAR, //
|
|
K_LHSTAR, // early rise
|
|
K_DHSTAR, //
|
|
K_HSTARLSTAR,
|
|
};
|
|
|
|
|
|
|
|
enum BOUNDARY_SOURCE
|
|
{
|
|
BND_NoSource = 0,
|
|
|
|
//-- Phrase boundary rules
|
|
BND_PhraseRule1,
|
|
BND_PhraseRule2,
|
|
BND_PhraseRule3,
|
|
BND_PhraseRule4,
|
|
BND_PhraseRule5,
|
|
BND_PhraseRule6,
|
|
BND_PhraseRule7,
|
|
BND_PhraseRule8,
|
|
BND_PhraseRule9,
|
|
BND_PhraseRule10,
|
|
BND_PhraseRule11,
|
|
BND_PhraseRule12,
|
|
BND_PhraseRule13,
|
|
|
|
//-- ToBI
|
|
BND_YNQuest,
|
|
BND_WHQuest,
|
|
BND_Period,
|
|
BND_Comma,
|
|
|
|
//--Templates
|
|
BND_NumberTemplate, // Should never get this!
|
|
BND_IntegerQuant,
|
|
BND_Currency_DOLLAR,
|
|
BND_Frac_Num,
|
|
|
|
BND_Phone_COUNTRY,
|
|
BND_Phone_AREA,
|
|
BND_Phone_ONE,
|
|
BND_Phone_DIGITS,
|
|
|
|
BND_TimeOFDay_HR,
|
|
BND_TimeOFDay_AB,
|
|
BND_Ellipsis,
|
|
|
|
BND_ForcedTerm, // Should never get this!
|
|
|
|
BND_IDontKnow,
|
|
};
|
|
|
|
enum ACCENT_SOURCE
|
|
{
|
|
ACC_NoSource = 0,
|
|
|
|
//-- Phrase boundary rules
|
|
ACC_PhraseRule1,
|
|
ACC_PhraseRule2,
|
|
ACC_PhraseRule3,
|
|
ACC_PhraseRule4,
|
|
ACC_PhraseRule5,
|
|
ACC_PhraseRule6,
|
|
ACC_PhraseRule7,
|
|
ACC_PhraseRule8,
|
|
ACC_PhraseRule9,
|
|
ACC_PhraseRule10,
|
|
ACC_PhraseRule11,
|
|
ACC_PhraseRule12,
|
|
ACC_PhraseRule13,
|
|
|
|
//-- ToBI
|
|
ACC_InitialVAux,
|
|
ACC_FunctionSeq,
|
|
ACC_ContentSeq,
|
|
ACC_YNQuest,
|
|
ACC_Period,
|
|
ACC_Comma,
|
|
|
|
//--Templates
|
|
ACC_IntegerGroup,
|
|
ACC_NumByNum,
|
|
ACC_Frac_DEN, // "half", "tenths", etc.
|
|
ACC_Phone_1stArea, // 1st digit in area code
|
|
ACC_Phone_3rdArea, // 3rd digit in area code
|
|
ACC_Phone_1st3,
|
|
ACC_Phone_3rd3,
|
|
ACC_Phone_1st4,
|
|
ACC_Phone_3rd4,
|
|
ACC_TimeOFDay_HR,
|
|
ACC_TimeOFDay_1stMin,
|
|
ACC_TimeOFDay_M,
|
|
|
|
ACC_PhoneBnd_AREA,
|
|
ACC_PhoneBnd_34,
|
|
ACC_PhoneBnd_4,
|
|
|
|
ACC_IDontKnow,
|
|
};
|
|
|
|
|
|
|
|
enum SILENCE_SOURCE
|
|
{
|
|
SIL_NoSource = 0,
|
|
|
|
SIL_Term,
|
|
SIL_QuoteStart,
|
|
SIL_QuoteEnd,
|
|
SIL_ParenStart,
|
|
SIL_ParenEnd,
|
|
SIL_Emph,
|
|
SIL_SubBound, // Should never see this (gets removed)
|
|
SIL_XML,
|
|
|
|
//-- Prosody templates
|
|
SIL_TimeOfDay_HR,
|
|
SIL_TimeOfDay_AB,
|
|
|
|
SIL_Phone_COUNTRY,
|
|
SIL_Phone_AREA,
|
|
SIL_Phone_ONE,
|
|
SIL_Phone_DIGITS,
|
|
|
|
SIL_Fractions_NUM,
|
|
SIL_Currency_DOLLAR,
|
|
SIL_Integer_Quant,
|
|
|
|
SIL_Head,
|
|
SIL_Tail,
|
|
SIL_Ellipsis,
|
|
|
|
SIL_ForcedTerm, // Should never get this!
|
|
};
|
|
|
|
|
|
|
|
|
|
static const short TOKEN_LEN_MAX = 20;
|
|
|
|
class CFEToken
|
|
{
|
|
public:
|
|
CFEToken();
|
|
~CFEToken();
|
|
|
|
WCHAR tokStr[TOKEN_LEN_MAX];
|
|
long tokLen;
|
|
PRONSRC m_PronType;
|
|
|
|
long phon_Len;
|
|
ALLO_CODE phon_Str[SP_MAX_PRON_LENGTH]; // Allo string
|
|
ENGPARTOFSPEECH POScode;
|
|
PROSODY_POS m_posClass;
|
|
|
|
ULONG srcPosition; // Source position for this token
|
|
ULONG srcLen; // Source length for this token
|
|
ULONG sentencePosition; // Source position for sentence
|
|
ULONG sentenceLen; // Source length for sentence
|
|
ULONG user_Volume; // 1 - 101
|
|
long user_Rate; // -10 - 10
|
|
long user_Pitch; // -10 - 10
|
|
long user_Emph; // 0 or 5
|
|
ULONG user_Break; // ms of silence
|
|
CBookmarkList *pBMObj;
|
|
TOBI_ACCENT m_Accent; // accent prosodic control
|
|
long m_Accent_Prom; // prominence prosodic control
|
|
TOBI_BOUNDARY m_Boundary; // boundary tone prosodic control
|
|
long m_Boundary_Prom; // prominence prosodic control
|
|
TUNE_TYPE m_TuneBoundaryType; // Current token is a boundary
|
|
float m_TermSil; // Pad word with silence (in sec)
|
|
float m_DurScale; // Duration ratio
|
|
float m_ProsodyDurScale;
|
|
float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
|
|
float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
|
|
|
|
//--- Diagnostic
|
|
ACCENT_SOURCE m_AccentSource;
|
|
BOUNDARY_SOURCE m_BoundarySource;
|
|
SILENCE_SOURCE m_SilenceSource;
|
|
};
|
|
typedef CSPList<CFEToken*,CFEToken*> CFETokenList;
|
|
|
|
|
|
|
|
class CAlloCell
|
|
{
|
|
public:
|
|
CAlloCell();
|
|
~CAlloCell();
|
|
|
|
CAlloCell* operator =( CAlloCell *pNewCell );
|
|
//--------------------------------
|
|
// Member Vars
|
|
//--------------------------------
|
|
ALLO_CODE m_allo;
|
|
short m_dur;
|
|
float m_ftDuration;
|
|
float m_UnitDur;
|
|
short m_knots;
|
|
float m_ftTime[KNOTS_PER_PHON];
|
|
float m_ftPitch[KNOTS_PER_PHON];
|
|
long m_ctrlFlags;
|
|
TOBI_ACCENT m_ToBI_Accent;
|
|
long m_Accent_Prom; // prominence prosodic control
|
|
TOBI_BOUNDARY m_ToBI_Boundary;
|
|
long m_Boundary_Prom; // prominence prosodic control
|
|
long m_PitchBufStart;
|
|
long m_PitchBufEnd;
|
|
ULONG m_user_Volume;
|
|
long m_user_Rate;
|
|
long m_user_Pitch;
|
|
long m_user_Emph;
|
|
ULONG m_user_Break;
|
|
ULONG m_Sil_Break;
|
|
float m_Pitch_HI;
|
|
float m_Pitch_LO;
|
|
ULONG m_SrcPosition;
|
|
ULONG m_SrcLen;
|
|
ULONG m_SentencePosition; // Source position for sentence
|
|
ULONG m_SentenceLen; // Source length for sentence
|
|
TUNE_TYPE m_TuneBoundaryType;
|
|
TUNE_TYPE m_NextTuneBoundaryType;
|
|
CBookmarkList *m_pBMObj;
|
|
float m_DurScale; // Duration ratio
|
|
float m_ProsodyDurScale;
|
|
float m_PitchBaseOffs; // Relative baseline pitch offset in octaves
|
|
float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0)
|
|
|
|
//--- Diagnostic
|
|
ACCENT_SOURCE m_AccentSource;
|
|
BOUNDARY_SOURCE m_BoundarySource;
|
|
SILENCE_SOURCE m_SilenceSource;
|
|
char *m_pTextStr;
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
class CAlloList
|
|
{
|
|
public:
|
|
CAlloList();
|
|
~CAlloList();
|
|
//--------------------------------
|
|
// Methods
|
|
//--------------------------------
|
|
CAlloCell *GetCell( long index );
|
|
CAlloCell *GetTailCell();
|
|
long GetCount();
|
|
bool WordToAllo( CFEToken *pPrevTok, CFEToken *pTok, CFEToken *pNextTok, CAlloCell *pEndCell );
|
|
CAlloCell *GetHeadCell()
|
|
{
|
|
m_ListPos = m_AlloCellList.GetHeadPosition();
|
|
return m_AlloCellList.GetNext( m_ListPos );
|
|
}
|
|
CAlloCell *GetNextCell()
|
|
{
|
|
if( m_ListPos )
|
|
{
|
|
return m_AlloCellList.GetNext( m_ListPos );
|
|
}
|
|
else
|
|
{
|
|
//-- We're at end of list!
|
|
return NULL;
|
|
}
|
|
}
|
|
//-- For debug only
|
|
void OutAllos();
|
|
|
|
private:
|
|
//--------------------------------
|
|
// Member Vars
|
|
//--------------------------------
|
|
long m_cAllos;
|
|
SPLISTPOS m_ListPos;
|
|
CSPList<CAlloCell*,CAlloCell*> m_AlloCellList;
|
|
};
|
|
|
|
|
|
|
|
//-----------------------------------
|
|
// Speaking Rate parameters
|
|
//-----------------------------------
|
|
static const float MAX_SIL_DUR = 1.0f; // seconds
|
|
static const float MIN_ALLO_DUR = 0.011f; // seconds
|
|
static const float MAX_ALLO_DUR = 5.0f; // seconds
|
|
|
|
|
|
class CDuration
|
|
{
|
|
public:
|
|
//--------------------------------
|
|
// Methods
|
|
//--------------------------------
|
|
void AlloDuration( CAlloList *pAllos, float rateRatio );
|
|
|
|
private:
|
|
void Pause_Insertion( long userDuration, long silBreak );
|
|
void PhraseFinal_Lengthen( long cellCount );
|
|
long Emphatic_Lenghen( long lastStress );
|
|
//--------------------------------
|
|
// Member vars
|
|
//--------------------------------
|
|
float m_DurHold;
|
|
float m_TotalDurScale;
|
|
float m_durationPad;
|
|
|
|
ALLO_CODE m_cur_Phon;
|
|
long m_cur_PhonCtrl;
|
|
long m_cur_PhonFlags;
|
|
long m_cur_SyllableType;
|
|
short m_cur_VowelFlag;
|
|
long m_cur_Stress;
|
|
ALLO_CODE m_prev_Phon;
|
|
long m_prev_PhonCtrl;
|
|
long m_prev_PhonFlags;
|
|
ALLO_CODE m_next_Phon;
|
|
long m_next_PhonCtrl;
|
|
long m_next_PhonFlags;
|
|
ALLO_CODE m_next2_Phon;
|
|
long m_next2_PhonCtrl;
|
|
long m_next2_PhonFlags;
|
|
TUNE_TYPE m_NextBoundary, m_CurBoundary;
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct
|
|
{
|
|
ALLO_CODE allo;
|
|
long ctrlFlags;
|
|
}ALLO_ARRAY;
|
|
|
|
|
|
|
|
|
|
class CSyllableTagger
|
|
{
|
|
public:
|
|
//--------------------------------
|
|
// Methods
|
|
//--------------------------------
|
|
void TagSyllables( CAlloList *pAllos );
|
|
|
|
private:
|
|
void MarkSyllableOrder( long scanIndex);
|
|
void MarkSyllableBoundry( long scanIndex);
|
|
void MarkSyllableStart();
|
|
short Find_Next_Word_Bound( short index );
|
|
short If_Consonant_Cluster( ALLO_CODE Consonant_1st, ALLO_CODE Consonant_2nd);
|
|
void ListToArray( CAlloList *pAllos );
|
|
void ArrayToList( CAlloList *pAllos );
|
|
|
|
//--------------------------------
|
|
// Member vars
|
|
//--------------------------------
|
|
ALLO_ARRAY *m_pAllos;
|
|
long m_numOfCells;
|
|
};
|
|
|
|
|
|
enum { TARG_PER_ALLO_MAX = 2 }; // One for accent and one for boundary
|
|
|
|
|
|
|
|
enum TUNE_STYLE
|
|
{
|
|
FLAT_TUNE = 0, // flat
|
|
DESCEND_TUNE, // go down
|
|
ASCEND_TUNE, // go up
|
|
};
|
|
|
|
//------------------
|
|
// Global Constants
|
|
//------------------
|
|
static const float PITCH_BUF_RES = (float)0.005;
|
|
static const float K_HSTAR_OFFSET = (float)0.5;
|
|
static const float K_HDOWNSTEP_COEFF = (float)0.5;
|
|
|
|
|
|
//------------------
|
|
// Macros
|
|
//------------------
|
|
#define CeilVal(x) ((m_CeilSlope * x) + m_CeilStart)
|
|
#define FloorVal(x) ((m_FloorSlope * x) + m_FloorStart)
|
|
#define RefVal(x) ((m_RefSlope * x) + m_RefStart)
|
|
|
|
|
|
|
|
class CPitchProsody
|
|
{
|
|
public:
|
|
//--------------------------------
|
|
// Methods
|
|
//--------------------------------
|
|
CPitchProsody() { m_pContBuf = NULL; };
|
|
~CPitchProsody() { if ( m_pContBuf ) delete m_pContBuf; };
|
|
void AlloPitch( CAlloList *pAllos, int baseLine, int pitchRange );
|
|
void GetContour( float**, ULONG* );
|
|
|
|
private:
|
|
float DoPitchControl( long pitchControl, float basePitch );
|
|
void PitchTrack();
|
|
void SetDefaultPitch();
|
|
void GetKnots();
|
|
void NewTarget( long index, float value );
|
|
|
|
//--------------------------------
|
|
// Member vars
|
|
//--------------------------------
|
|
CAlloList *m_pAllos;
|
|
long m_numOfCells;
|
|
|
|
float m_TotalDur; // phrase duration in seconds
|
|
TUNE_STYLE m_Tune_Style;
|
|
float *m_pContBuf;
|
|
ULONG m_ulNumPoints;
|
|
float m_OffsTime;
|
|
TOBI_ACCENT m_CurAccent;
|
|
|
|
//------------------------
|
|
// Diagnostic
|
|
//------------------------
|
|
ACCENT_SOURCE m_CurAccentSource;
|
|
BOUNDARY_SOURCE m_CurBoundarySource;
|
|
char *m_pCurTextStr;
|
|
};
|
|
|
|
|
|
|
|
#endif //--- This must be the last line in the file
|