windows-server-2003/enduser/speech/tts/msttsdrv/engine/backend.h


								/******************************************************************************

								* Backend.h *

								*-----------*

								*  This is the header file for the CBackend implementation.

								*------------------------------------------------------------------------------

								*  Copyright (C) 1999 Microsoft Corporation         Date: 03/01/99

								*  All Rights Reserved

								*

								*********************************************************************** MC ****/


								#ifndef Backend_H

								#define Backend_H


								#ifndef ReverbFX_H

								#include "ReverbFX.h"

								#endif

								#ifndef FeedChain_H

								#include "FeedChain.h"

								#endif

								#ifndef __spttseng_h__

								#include "spttseng.h"

								#endif

								#ifndef SPDebug_h

								#include <spdebug.h>

								#endif

								#ifndef SPCollec_h

								#include <SPCollec.h>

								#endif


								#include "SpTtsEngDebug.h"


								static const short MAX_TARGETS_PER_UNIT = 3;          // Max number of knots allowed

								static const short MIN_VOICE_PITCH      = 10;         // Lowest voiced pitch (hertz)

								static const short UNIT_SIL             = 0;           // Silence phon

								static const short SPEECH_FRAME_SIZE	= 5000;        // Output audio uffer...

								static const short SPEECH_FRAME_OVER	= 1000;        // ...plus pad


								//----------------------------------------------------------

								// find a yn corresponding to xn,

								// given (x0, y0), (x1, y1), x0 <= xn <= x1

								//----------------------------------------------------------

								inline float LinInterp( float x0, float xn, float x1, float y0, float y1 )

								{

								    return y0 + (y1-y0)*(xn-x0)/(x1-x0);

								}


								// Math marcos

								#define ABS(x) ((x) >= 0 ? (x) : -(x))

								#define MAX(x,y) (((x) >= (y)) ? (x) : (y))

								#define MIN(x,y) (((x) <= (y)) ? (x) : (y))


								static const float LINEAR_BKPT  = 0.1f;

								static const float LOG_RANGE    = (-25.0f);


								//********************************************************************

								//

								//  CBackend keeps track of all the state information for the

								//  synthesis process.

								//

								//********************************************************************

								class CBackend

								{

								public:

								    /*--- Constructors/Destructors ---*/

								    CBackend ();

								    ~CBackend ();


								    /*=== Methods =======*/

								    HRESULT Init(   IMSVoiceData* pVoiceDataObj,

								                    CFeedChain *pSrcObj,

								                    MSVOICEINFO* pVoiceInfo );

									SPEECH_STATE	GetSpeechState() {return m_SpeechState;}

								    void    PrepareSpeech( ISpTTSEngineSite* outputSite );

								    HRESULT RenderFrame( );


								private:

								    HRESULT StartNewUnit();

								    long    ProsodyMod(    UNITINFO    *pCurUnit,

								                            long        cInEpochs,

								                            float       durationMpy,

								                            long        cMaxOutEpochs);

								    void    CleanUpSynth();

								    void    ResRecons(  float   *pInRes,

								                        long    InSize,

								                        float   *pOutRes,

								                        long    OutSize,

								                        float   scale );

								    void    LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain );

								    void    FreeSynth( MSUNITDATA* pSynth );

								    void    PSOLA_Stretch(  float *pInRes, long InSize,

								                            float *pOutRes, long OutSize,

								                            float *pWindow,

								                            long  cWindowSize );

								    void    CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain );

								    void    Release( );


								    /*=== Member Data ===*/

								    CFeedChain      *m_pSrcObj;             // Backend gets its input from here

								    MSUNITDATA      m_Synth;                // Unit data from 'Voicedataobj'

								    float          *m_pHistory;             // LPC delays

								    unsigned long   m_fModifiers;

								    float          *m_pHistory2;            // IIR delays

								    float          *m_pFilter;              // IIR/FIR coefficients

								    long            m_cNumTaps;             // Coefficient count

								    LP_CReverbFX    m_pReverb;              // Reverb object


								    long            *m_pMap;                // in/out epoch map

								    float           *m_pOutEpoch;           // epoch sizes

								    short           *m_pRevFlag;            // true = rev unvoiced


								    float           *m_pInRes;              // m_pSynth.pRes

								    float           *m_pInEpoch;            // m_pSynth.pEpoch

								    float           *m_pLPC;                // m_pSynth->pLPC

								    long            m_cOutSamples_Phon;     // sample count

								    long            m_durationTarget;       // target sample total

								    long            m_silMode;

								    float           *m_pSynthTime;          // pCurUnit->pTime

								    float           *m_pSynthAmp;           // pCurUnit->pAmp

								    long            m_nKnots;               // pCurUnit->nKnots


								    SPEECH_STATE    m_SpeechState;          // Either continue or done

								    long            m_cOutSamples_Frame;    // Audio output sample count for frame

								    float           *m_pSpeechBuf;          // Audio output sample buffer

								    ULONG           m_cOutSamples_Total;    // Audio output sample count for Speak

								    long            m_EpochIndex;           // Index for render

								    long            m_cOutEpochs;           // Count for render


								    long            m_vibrato_Phase1;       // Current vibrato phase index

								    float           m_VibratoDepth;         // Vibrato gain

								    float           m_VibratoFreq;          // Vibrato speed

								    long            m_StereoOut;            // TRUE = stereo output

								    long            m_BytesPerSample;       // 2 = mono, 4 = stereo

								    IMSVoiceData*   m_pVoiceDataObj;        // Voice object

								    ULONG           m_cOrder;               // LPC filter order

								    float           m_SampleRate;           // I/O rate

								    float*          m_pWindow;              // Hanning Window

								    long            m_FFTSize;              // FFT length


								    // User Controls

								    float           m_UnitVolume;           // 0 - 1.0 (linear)

								    long            m_MasterVolume;         // 0 - 100 (linear)

								    float           m_linearScale;          // Linear taper region scale


								    // SAPI audio sink

								    ISpTTSEngineSite*   m_pOutputSite;

									bool			m_HasSpeech;

								};


								//--------------------------------

								// Unimplemented

								//--------------------------------

								static const long BACKEND_BITFLAG_WHISPER     = (1 << 0);

								static const long BACKEND_BITFLAG_FIR         = (1 << 1);

								static const long BACKEND_BITFLAG_IIR         = (1 << 2);

								static const long BACKEND_BITFLAG_REVERB      = (1 << 3);

								static const float VIBRATO_DEFAULT_DEPTH      = 0.05f;

								static const float VIBRATO_DEFAULT_FREQ       = 3.0f;          // hz


								#endif //--- This must be the last line in the file