// Source: mirror of https://github.com/tongzx/nt5src
/******************************************************************************
* Backend.h *
*-----------*
*  This is the header file for the CBackend implementation.
*------------------------------------------------------------------------------
*  Copyright (C) 1999 Microsoft Corporation         Date: 03/01/99
*  All Rights Reserved
*
*********************************************************************** MC ****/

// Include guard for this header; the matching #endif is the last line of
// the file.
#ifndef Backend_H
#define Backend_H

// Each include below is skipped when the named macro is already defined
// (presumably the include-guard macro of the corresponding header).
#ifndef ReverbFX_H
#include "ReverbFX.h"
#endif
#ifndef FeedChain_H
#include "FeedChain.h"
#endif
#ifndef __spttseng_h__
#include "spttseng.h"
#endif
#ifndef SPDebug_h
#include <spdebug.h>
#endif
#ifndef SPCollec_h
#include <SPCollec.h>
#endif

#include "SpTtsEngDebug.h"



// Back-end limits and output buffer sizing.
static const short  MAX_TARGETS_PER_UNIT = 3;       // Max number of knots allowed
static const short  MIN_VOICE_PITCH      = 10;      // Lowest voiced pitch (hertz)
static const short  UNIT_SIL             = 0;       // Silence phon
static const short  SPEECH_FRAME_SIZE    = 5000;    // Output audio buffer...
static const short  SPEECH_FRAME_OVER    = 1000;    // ...plus pad

//----------------------------------------------------------
// Linear interpolation: find the yn corresponding to xn on
// the straight line through (x0, y0) and (x1, y1).
// Intended for x0 <= xn <= x1; xn is not clamped, so values
// outside that range extrapolate along the same line.
//
// A degenerate segment (x0 == x1) returns y0 instead of
// dividing by zero.
//
// 'static' gives the helper internal linkage, matching the
// 'static const' constants defined in this header.
//----------------------------------------------------------
static inline float LinInterp( float x0, float xn, float x1, float y0, float y1 )
{
    const float span = x1 - x0;

    if( span == 0.0f )
    {
        // Zero-width segment: the slope is undefined, use the left value.
        return y0;
    }
    return y0 + (y1 - y0) * (xn - x0) / span;
}

// Math macros.
// NOTE: each macro names an argument more than once in its expansion, so an
// argument with side effects (e.g. MAX(i++, j)) may be evaluated twice.
#define ABS(x)      ((x) >= 0 ? (x) : -(x))
#define MAX(x,y)    (((x) >= (y)) ? (x) : (y))
#define MIN(x,y)    (((x) <= (y)) ? (x) : (y))

// NOTE(review): this header does not describe these two constants. They
// presumably parameterize the volume taper (compare CBackend::m_linearScale,
// "Linear taper region scale") - confirm against the implementation file.
static const float  LINEAR_BKPT = 0.1f;
static const float  LOG_RANGE   = (-25.0f);

//********************************************************************
|
|
//
|
|
// CBackend keeps track of all the state information for the
|
|
// synthesis process.
|
|
//
|
|
//********************************************************************
|
|
class CBackend
|
|
{
|
|
public:
|
|
/*--- Constructors/Destructors ---*/
|
|
CBackend ();
|
|
~CBackend ();
|
|
|
|
/*=== Methods =======*/
|
|
HRESULT Init( IMSVoiceData* pVoiceDataObj,
|
|
CFeedChain *pSrcObj,
|
|
MSVOICEINFO* pVoiceInfo );
|
|
SPEECH_STATE GetSpeechState() {return m_SpeechState;}
|
|
void PrepareSpeech( ISpTTSEngineSite* outputSite );
|
|
HRESULT RenderFrame( );
|
|
|
|
|
|
private:
|
|
HRESULT StartNewUnit();
|
|
long ProsodyMod( UNITINFO *pCurUnit,
|
|
long cInEpochs,
|
|
float durationMpy);
|
|
void CleanUpSynth();
|
|
void ResRecons( float *pInRes,
|
|
long InSize,
|
|
float *pOutRes,
|
|
long OutSize,
|
|
float scale );
|
|
void LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain );
|
|
void FreeSynth( MSUNITDATA* pSynth );
|
|
void PSOLA_Stretch( float *pInRes, long InSize,
|
|
float *pOutRes, long OutSize,
|
|
float *pWindow,
|
|
long cWindowSize );
|
|
void CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain );
|
|
void Release( );
|
|
|
|
/*=== Member Data ===*/
|
|
CFeedChain *m_pSrcObj; // Backend gets its input from here
|
|
MSUNITDATA m_Synth; // Unit data from 'Voicedataobj'
|
|
float *m_pHistory; // LPC delays
|
|
unsigned long m_fModifiers;
|
|
float *m_pHistory2; // IIR delays
|
|
float *m_pFilter; // IIR/FIR coefficients
|
|
long m_cNumTaps; // Coefficient count
|
|
LP_CReverbFX m_pReverb; // Reverb object
|
|
|
|
|
|
long *m_pMap; // in/out epoch map
|
|
float *m_pOutEpoch; // epoch sizes
|
|
short *m_pRevFlag; // true = rev unvoiced
|
|
|
|
float *m_pInRes; // m_pSynth.pRes
|
|
float *m_pInEpoch; // m_pSynth.pEpoch
|
|
float *m_pLPC; // m_pSynth->pLPC
|
|
long m_cOutSamples_Phon; // sample count
|
|
long m_durationTarget; // target sample total
|
|
long m_silMode;
|
|
float *m_pSynthTime; // pCurUnit->pTime
|
|
float *m_pSynthAmp; // pCurUnit->pAmp
|
|
long m_nKnots; // pCurUnit->nKnots
|
|
|
|
SPEECH_STATE m_SpeechState; // Either continue or done
|
|
long m_cOutSamples_Frame; // Audio output sample count for frame
|
|
float *m_pSpeechBuf; // Audio output sample buffer
|
|
ULONG m_cOutSamples_Total; // Audio output sample count for Speak
|
|
long m_EpochIndex; // Index for render
|
|
long m_cOutEpochs; // Count for render
|
|
|
|
|
|
long m_vibrato_Phase1; // Current vibrato phase index
|
|
float m_VibratoDepth; // Vibrato gain
|
|
float m_VibratoFreq; // Vibrato speed
|
|
long m_StereoOut; // TRUE = stereo output
|
|
long m_BytesPerSample; // 2 = mono, 4 = stereo
|
|
IMSVoiceData* m_pVoiceDataObj; // Voice object
|
|
ULONG m_cOrder; // LPC filter order
|
|
float m_SampleRate; // I/O rate
|
|
float* m_pWindow; // Hanning Window
|
|
long m_FFTSize; // FFT length
|
|
|
|
// User Controls
|
|
float m_UnitVolume; // 0 - 1.0 (linear)
|
|
long m_MasterVolume; // 0 - 100 (linear)
|
|
float m_linearScale; // Linear taper region scale
|
|
|
|
// SAPI audio sink
|
|
ISpTTSEngineSite* m_pOutputSite;
|
|
bool m_HasSpeech;
|
|
};
|
|
|
|
|
|
|
|
|
|
//--------------------------------
// Unimplemented
//--------------------------------
// Each BACKEND_BITFLAG_* value occupies its own bit, so the flags can be
// OR-ed together into a single mask.
// NOTE(review): it is unclear from this header whether the "Unimplemented"
// banner also covers the vibrato defaults below - confirm.
static const long BACKEND_BITFLAG_WHISPER   = (1 << 0);
static const long BACKEND_BITFLAG_FIR       = (1 << 1);
static const long BACKEND_BITFLAG_IIR       = (1 << 2);
static const long BACKEND_BITFLAG_REVERB    = (1 << 3);
static const float VIBRATO_DEFAULT_DEPTH    = 0.05f;
static const float VIBRATO_DEFAULT_FREQ     = 3.0f;     // hz

#endif //--- This must be the last line in the file