windows-server-2003/enduser/speech/tts/msttsdrv/engine/backend.h

/******************************************************************************
* Backend.h *
*-----------*
*  This is the header file for the CBackend implementation.
*------------------------------------------------------------------------------
*  Copyright (C) 1999 Microsoft Corporation         Date: 03/01/99
*  All Rights Reserved
*
*********************************************************************** MC ****/

#ifndef Backend_H
#define Backend_H

#ifndef ReverbFX_H
#include "ReverbFX.h"
#endif
#ifndef FeedChain_H
#include "FeedChain.h"
#endif
#ifndef __spttseng_h__
#include "spttseng.h"
#endif
#ifndef SPDebug_h
#include <spdebug.h>
#endif
#ifndef SPCollec_h
#include <SPCollec.h>
#endif

#include "SpTtsEngDebug.h"


// Synthesis limits and output-buffer sizing.
// NOTE(review): the frame sizes are presumably counted in samples -- confirm in Backend.cpp.
static const short MAX_TARGETS_PER_UNIT = 3;          // Max number of knots allowed
static const short MIN_VOICE_PITCH      = 10;         // Lowest voiced pitch (hertz)
static const short UNIT_SIL             = 0;           // Silence phon
static const short SPEECH_FRAME_SIZE	= 5000;        // Output audio buffer...
static const short SPEECH_FRAME_OVER	= 1000;        // ...plus pad

//----------------------------------------------------------
// Linear interpolation: find the yn corresponding to xn
// on the line through (x0, y0) and (x1, y1),
// where x0 <= xn <= x1.
//
// If x0 == x1 the segment is degenerate and the slope would
// be computed as 0/0 (NaN); y0 is returned in that case.
//----------------------------------------------------------
inline float LinInterp( float x0, float xn, float x1, float y0, float y1 )
{
    const float span = x1 - x0;

    // Degenerate segment: both end points share the same x, so there is
    // nothing to interpolate across. Avoid dividing by zero.
    if( span == 0.0f )
    {
        return y0;
    }
    return y0 + (y1-y0)*(xn-x0)/span;
}

// Math macros
// NOTE: these are plain textual macros, so each argument is expanded
// (and therefore evaluated) more than once. Do not pass expressions
// with side effects, e.g. MAX(i++, j).
#define ABS(x) ((x) >= 0 ? (x) : -(x))
#define MAX(x,y) (((x) >= (y)) ? (x) : (y))
#define MIN(x,y) (((x) <= (y)) ? (x) : (y))

// NOTE(review): these look like the parameters of the volume taper
// (a linear region below LINEAR_BKPT and a logarithmic range above it,
// presumably in dB) -- confirm against their use in Backend.cpp.
static const float LINEAR_BKPT  = 0.1f;
static const float LOG_RANGE    = (-25.0f);

//********************************************************************
//
//  CBackend keeps track of all the state information for the
//  synthesis process.
//
//  The class holds a number of raw pointers and declares a destructor,
//  so a member-wise copy would leave two objects referring to the same
//  buffers. Copying is therefore disabled (see the private section).
//
//********************************************************************
class CBackend 
{
public:
    /*--- Constructors/Destructors ---*/
    CBackend ();
    ~CBackend ();

    /*=== Methods =======*/
    // Supplies the voice data object, the upstream feed chain and the
    // voice info to the backend.
    // NOTE(review): presumably cached in m_pVoiceDataObj / m_pSrcObj --
    // confirm in Backend.cpp.
    HRESULT Init(   IMSVoiceData* pVoiceDataObj, 
                    CFeedChain *pSrcObj, 
                    MSVOICEINFO* pVoiceInfo );

    // Returns the current synthesis state (m_SpeechState). Read-only.
    SPEECH_STATE    GetSpeechState() const { return m_SpeechState; }

    // Called with the SAPI output site before rendering.
    // NOTE(review): presumably stored in m_pOutputSite -- confirm.
    void    PrepareSpeech( ISpTTSEngineSite* outputSite );

    // Renders one frame of output; implementation not visible in this header.
    HRESULT RenderFrame( );


private:
    /*--- Not copyable: declared but intentionally left undefined ---*/
    CBackend( const CBackend& );
    CBackend& operator=( const CBackend& );

    HRESULT StartNewUnit();
    long    ProsodyMod(    UNITINFO    *pCurUnit, 
                            long        cInEpochs, 
                            float       durationMpy,
                            long        cMaxOutEpochs);
    void    CleanUpSynth();
    void    ResRecons(  float   *pInRes,
                        long    InSize,
                        float   *pOutRes,
                        long    OutSize,
                        float   scale );
    void    LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain );
    void    FreeSynth( MSUNITDATA* pSynth );
    void    PSOLA_Stretch(  float *pInRes, long InSize, 
                            float *pOutRes, long OutSize,
                            float *pWindow, 
                            long  cWindowSize );
    void    CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain );
    void    Release( );
   
    /*=== Member Data ===*/
    CFeedChain      *m_pSrcObj;             // Backend gets its input from here
    MSUNITDATA      m_Synth;                // Unit data from 'Voicedataobj'
    float          *m_pHistory;             // LPC delays
    unsigned long   m_fModifiers;           // NOTE(review): presumably BACKEND_BITFLAG_* bits -- confirm
    float          *m_pHistory2;            // IIR delays
    float          *m_pFilter;              // IIR/FIR coefficients
    long            m_cNumTaps;             // Coefficient count            
    LP_CReverbFX    m_pReverb;              // Reverb object


    long            *m_pMap;                // in/out epoch map
    float           *m_pOutEpoch;           // epoch sizes
    short           *m_pRevFlag;            // true = rev unvoiced

    // NOTE(review): the next three presumably alias buffers of m_Synth -- confirm.
    float           *m_pInRes;              // m_Synth.pRes
    float           *m_pInEpoch;            // m_Synth.pEpoch
    float           *m_pLPC;                // m_Synth.pLPC
    long            m_cOutSamples_Phon;     // sample count
    long            m_durationTarget;       // target sample total
    long            m_silMode;
    float           *m_pSynthTime;          // pCurUnit->pTime
    float           *m_pSynthAmp;           // pCurUnit->pAmp
    long            m_nKnots;               // pCurUnit->nKnots

    SPEECH_STATE    m_SpeechState;          // Either continue or done
    long            m_cOutSamples_Frame;    // Audio output sample count for frame
    float           *m_pSpeechBuf;          // Audio output sample buffer
    ULONG           m_cOutSamples_Total;    // Audio output sample count for Speak
    long            m_EpochIndex;           // Index for render
    long            m_cOutEpochs;           // Count for render


    long            m_vibrato_Phase1;       // Current vibrato phase index
    float           m_VibratoDepth;         // Vibrato gain
    float           m_VibratoFreq;          // Vibrato speed
    long            m_StereoOut;            // TRUE = stereo output
    long            m_BytesPerSample;       // 2 = mono, 4 = stereo
    IMSVoiceData*   m_pVoiceDataObj;        // Voice object
    ULONG           m_cOrder;               // LPC filter order
    float           m_SampleRate;           // I/O rate
    float*          m_pWindow;              // Hanning Window
    long            m_FFTSize;              // FFT length

    // User Controls
    float           m_UnitVolume;           // 0 - 1.0 (linear)
    long            m_MasterVolume;         // 0 - 100 (linear)
    float           m_linearScale;          // Linear taper region scale

    // SAPI audio sink
    ISpTTSEngineSite*   m_pOutputSite;
    bool            m_HasSpeech;
};


//--------------------------------
// Unimplemented
//
// Effect selector bits (one bit per effect) and vibrato defaults.
// NOTE(review): the bit flags are presumably combined in
// CBackend::m_fModifiers -- confirm in Backend.cpp.
//--------------------------------
static const long BACKEND_BITFLAG_WHISPER     = (1 << 0);
static const long BACKEND_BITFLAG_FIR         = (1 << 1);
static const long BACKEND_BITFLAG_IIR         = (1 << 2);
static const long BACKEND_BITFLAG_REVERB      = (1 << 3);
static const float VIBRATO_DEFAULT_DEPTH      = 0.05f;
static const float VIBRATO_DEFAULT_FREQ       = 3.0f;          // Hz

#endif //--- This must be the last line in the file