/*******************************************************************************
* Backend.cpp *
*-------------*
*   Description:
*       This module is the implementation file for the CBackend class.
*-------------------------------------------------------------------------------
*  Created By: mc                                        Date: 03/12/99
*  Copyright (C) 1999 Microsoft Corporation
*  All Rights Reserved
*
*******************************************************************************/

#include "stdafx.h"
#ifndef __spttseng_h__
#include "spttseng.h"
#endif
#ifndef Backend_H
#include "Backend.h"
#endif
#ifndef FeedChain_H
#include "FeedChain.h"
#endif
// NOTE(review): the header name was missing from this directive (a bare
// '#include' does not preprocess). It is restored from the guard name;
// confirm against the SDK include directory.
#ifndef SPDebug_h
#include <spdebug.h>
#endif

//-----------------------------
// Data.cpp
//-----------------------------
extern const short   g_IPAToAllo[];
extern const short   g_AlloToViseme[];

//--------------------------------------
// DEBUG: Save utterance WAV file
//--------------------------------------
//#define   SAVE_WAVE_FILE  1

//--------------------------------------------------------------
// 256-entry table of unsigned 8-bit samples (values 0x09..0xF5).
// ProsodyMod() indexes it with the top 8 bits of the 24-bit
// vibrato phase accumulator and subtracts 128 from the result.
//--------------------------------------------------------------
const unsigned char g_SineWaveTbl[] =
{
    0x7b,0x7e,0x81,0x84,0x87,0x89,0x8c,0x8f,0x92,0x95,0x98,0x9b,0x9d,0xa0,0xa3,0xa6,
    0xa8,0xab,0xae,0xb0,0xb3,0xb5,0xb8,0xbb,0xbd,0xbf,0xc2,0xc4,0xc7,0xc9,0xcb,0xcd,
    0xcf,0xd1,0xd3,0xd5,0xd7,0xd9,0xdb,0xdd,0xdf,0xe0,0xe2,0xe3,0xe5,0xe6,0xe8,0xe9,
    0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,0xf3,0xf4,0xf4,0xf4,0xf4,
    0xf5,0xf5,0xf5,0xf5,0xf4,0xf4,0xf4,0xf4,0xf3,0xf3,0xf2,0xf1,0xf1,0xf0,0xef,0xee,
    0xed,0xec,0xeb,0xea,0xe9,0xe7,0xe6,0xe5,0xe3,0xe1,0xe0,0xde,0xdc,0xdb,0xd9,0xd7,
    0xd5,0xd3,0xd1,0xcf,0xcd,0xcb,0xc8,0xc6,0xc4,0xc1,0xbf,0xbc,0xba,0xb7,0xb5,0xb2,
    0xb0,0xad,0xaa,0xa8,0xa5,0xa2,0x9f,0x9d,0x9a,0x97,0x94,0x91,0x8f,0x8c,0x89,0x86,
    0x83,0x80,0x7d,0x7a,0x77,0x75,0x72,0x6f,0x6c,0x69,0x66,0x64,0x61,0x5e,0x5b,0x58,
    0x56,0x53,0x50,0x4e,0x4b,0x49,0x46,0x44,0x41,0x3f,0x3c,0x3a,0x38,0x35,0x33,0x31,
    0x2f,0x2d,0x2b,0x29,0x27,0x25,0x23,0x21,0x1f,0x1e,0x1c,0x1b,0x19,0x18,0x16,0x15,
    0x14,0x13,0x12,0x11,0x10,0x0f,0x0e,0x0d,0x0c,0x0c,0x0b,0x0b,0x0a,0x0a,0x0a,0x0a,
    0x09,0x09,0x09,0x09,0x0a,0x0a,0x0a,0x0a,0x0b,0x0b,0x0c,0x0d,0x0d,0x0e,0x0f,0x10,
    0x11,0x12,0x13,0x14,0x15,0x17,0x18,0x1a,0x1b,0x1d,0x1e,0x20,0x22,0x23,0x25,0x27,
    0x29,0x2b,0x2d,0x2f,0x31,0x34,0x36,0x38,0x3a,0x3d,0x3f,0x42,0x44,0x47,0x49,0x4c,
    0x4e,0x51,0x54,0x56,0x59,0x5c,0x5f,0x61,0x64,0x67,0x6a,0x6d,0x6f,0x72,0x75,0x78
};

/*void PredictEpochDist(  float duration, long nKnots, float SampleRate, float *pTime, float *pF0)
{
    long        curSamplesOut, endSample, j;
    float       epochFreq;
    long        epochLen, epochCount;

    curSamplesOut = 0;
    endSample = (long) (SampleRate * duration );
    epochCount = 0;

    while( curSamplesOut < endSample )
    {
        j = 1;
        //---------------------------------------------------
        // Align to appropriate knot based on
        // current output sample
        //---------------------------------------------------
        while( (j < nKnots - 1) && (curSamplesOut > pTime[j]) )
            j++;
        //---------------------------------------------------
        // Calculate exact pitch thru linear interpolation
        //---------------------------------------------------
        epochFreq = LinInterp( pTime[j - 1], curSamplesOut, pTime[j], pF0[j - 1], pF0[j] );
        //---------------------------------------------------
        // Calc sample count for current epoch
        //---------------------------------------------------
        epochLen = (long) (SampleRate / epochFreq);
        epochCount++;
        curSamplesOut += epochLen;
    }
}
*/

/*****************************************************************************
* CBackend::CBackend *
*--------------------*
*   Description:  Constructor.
*       Nulls every buffer pointer that Release() / CleanUpSynth() may later
*       free, resets vibrato depth and volume to their defaults and clears
*       the unit data record so FreeSynth() sees NULL pointers.
*
********************************************************************** MC ***/
CBackend::CBackend( )
{
    SPDBG_FUNC( "CBackend::CBackend" );

    //--- Buffers owned by this object (allocated in Init / StartNewUnit)
    m_pSpeechBuf    = NULL;
    m_pHistory      = NULL;
    m_pHistory2     = NULL;
    m_pFilter       = NULL;
    m_pReverb       = NULL;
    m_pOutEpoch     = NULL;
    m_pMap          = NULL;
    m_pRevFlag      = NULL;

    //--- Default synthesis parameters
    m_VibratoDepth  = 0;
    m_UnitVolume    = 1.0f;
    m_MasterVolume  = SPMAX_VOLUME;

    //--- No unit loaded yet
    memset( &m_Synth, 0, sizeof(MSUNITDATA) );
} /* CBackend::CBackend */
/***************************************************************************** * CBackend::~CBackend * *---------------------* * Description: Destructor * ********************************************************************** MC ***/ CBackend::~CBackend( ) { SPDBG_FUNC( "CBackend::~CBackend" ); Release(); } /* CBackend::~CBackend */ /***************************************************************************** * CBackend::Release * *---------------------* * Description: * Free memory allocaterd by Backend * ********************************************************************** MC ***/ void CBackend::Release( ) { SPDBG_FUNC( "CBackend::Release" ); CleanUpSynth( ); if( m_pSpeechBuf) { delete m_pSpeechBuf; m_pSpeechBuf = NULL; } if( m_pHistory ) { delete m_pHistory; m_pHistory = NULL; } if( m_pHistory2 ) { delete m_pHistory2; m_pHistory2 = NULL; } if( m_pReverb ) { delete m_pReverb; m_pReverb = NULL; } } /* CBackend::Release */ /***************************************************************************** * CBackend::Init * *----------------* * Description: * Opens a backend instance, keeping a pointer of the acoustic * inventory. 
* ********************************************************************** MC ***/ HRESULT CBackend::Init( IMSVoiceData* pVoiceDataObj, CFeedChain *pSrcObj, MSVOICEINFO* pVoiceInfo ) { SPDBG_FUNC( "CBackend::Init" ); long LPCsize = 0; HRESULT hr = S_OK; m_pVoiceDataObj = pVoiceDataObj; m_SampleRate = (float)pVoiceInfo->SampleRate; m_pSrcObj = pSrcObj; m_cOrder = pVoiceInfo->LPCOrder; m_pWindow = pVoiceInfo->pWindow; m_FFTSize = pVoiceInfo->FFTSize; m_VibratoDepth = ((float)pVoiceInfo->VibratoDepth) / 100.0f; m_VibratoDepth = 0; // NOTE: disable vibrato m_VibratoFreq = pVoiceInfo->VibratoFreq; if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF ) { m_StereoOut = true; m_BytesPerSample = 4; } else { m_StereoOut = false; m_BytesPerSample = 2; } //--------------------------------------- // Allocate AUDIO buffer //--------------------------------------- m_pSpeechBuf = new float[SPEECH_FRAME_SIZE + SPEECH_FRAME_OVER]; if( m_pSpeechBuf == NULL ) { //-------------------------------------- // Out of memory! //-------------------------------------- hr = E_OUTOFMEMORY; } if( SUCCEEDED(hr) ) { //--------------------------------------- // Allocate HISTORY buffer //--------------------------------------- LPCsize = m_cOrder + 1; m_pHistory = new float[LPCsize]; if( m_pHistory == NULL ) { //-------------------------------------- // Out of memory! 
//-------------------------------------- hr = E_OUTOFMEMORY; } } if( SUCCEEDED(hr) ) { memset( m_pHistory, 0, LPCsize * sizeof(float) ); m_pOutEpoch = NULL; m_pMap = NULL; m_pRevFlag = NULL; m_fModifiers = 0; m_vibrato_Phase1 = 0; //-------------------------------- // Reverb Effect //-------------------------------- //pVoiceInfo->eReverbType = REVERB_TYPE_HALL; if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF ) { //-------------------------------- // Create ReverbFX object //-------------------------------- if( m_pReverb == NULL ) { m_pReverb = new CReverbFX; if( m_pReverb ) { short result; result = m_pReverb->Reverb_Init( pVoiceInfo->eReverbType, (long)m_SampleRate, m_StereoOut ); if( result != KREVERB_NOERROR ) { //-------------------------------------------- // Not enough memory to do reverb // Recover gracefully //-------------------------------------------- delete m_pReverb; m_pReverb = NULL; } /*else { //-------------------------------------------------------- // Init was successful, ready to do reverb now //-------------------------------------------------------- }*/ } } } //---------------------------- // Linear taper region scale //---------------------------- m_linearScale = (float) pow( 10.0, (double)((1.0f - LINEAR_BKPT) * LOG_RANGE) / 20.0 ); #ifdef SAVE_WAVE_FILE m_SaveFile = (PCSaveWAV) new CSaveWAV; // No check needed, if this fails, we simply don't save file. if( m_SaveFile ) { m_SaveFile->OpenWavFile( (long)m_SampleRate ); } #endif } else { if( m_pSpeechBuf ) { delete m_pSpeechBuf; m_pSpeechBuf = NULL; } if( m_pHistory ) { delete m_pHistory; m_pHistory = NULL; } } return hr; } /* CBackend::Init */ /***************************************************************************** * CBackend::FreeSynth * *---------------------* * Description: * Return TRUE if consoants can be clustered. 
* ********************************************************************** MC ***/ void CBackend::FreeSynth( MSUNITDATA* pSynth ) { SPDBG_FUNC( "CBackend::FreeSynth" ); if( pSynth->pEpoch ) { delete pSynth->pEpoch; pSynth->pEpoch = NULL; } if( pSynth->pRes ) { delete pSynth->pRes; pSynth->pRes = NULL; } if( pSynth->pLPC ) { delete pSynth->pLPC; pSynth->pLPC = NULL; } } /* CBackend::FreeSynth */ /***************************************************************************** * ExpConverter * *--------------* * Description: * Convert linear to exponential taper * 'ref' is a linear value between 0.0 to 1.0 * ********************************************************************** MC ***/ static float ExpConverter( float ref, float linearScale ) { SPDBG_FUNC( "ExpConverter" ); float audioGain; if( ref < LINEAR_BKPT) { //---------------------------------------- // Linear taper below LINEAR_BKPT //---------------------------------------- audioGain = linearScale * (ref / LINEAR_BKPT); } else { //---------------------------------------- // Log taper above LINEAR_BKPT //---------------------------------------- audioGain = (float) pow( 10.0, (double)((1.0f - ref) * LOG_RANGE) / 20.0 ); } return audioGain; } /* ExpConverter */ /***************************************************************************** * CBackend::CvtToShort * *----------------------* * Description: * Convert (in place) FLOAT audio to SHORT. * ********************************************************************** MC ***/ void CBackend::CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain ) { SPDBG_FUNC( "CBackend::CvtToShort" ); long i; short *pDest; float fSamp; pDest = (short*)pSrc; for( i = 0; i < blocksize; ++i ) { //------------------------ // Read float sample... //------------------------ fSamp = (*pSrc++) * audioGain; //------------------------ // ...clip to 16-bits... 
//------------------------ if( fSamp > 32767 ) { fSamp = 32767; } else if( fSamp < (-32768) ) { fSamp = (-32768); } //------------------------ // ...save as SHORT //------------------------ *pDest++ = (short)fSamp; if( stereoOut ) { *pDest++ = (short)(0 - (int)fSamp); } } } /* CBackend::CvtToShort */ /***************************************************************************** * CBackend::PSOLA_Stretch * *-------------------------* * Description: * Does PSOLA epoch stretching or compressing * ********************************************************************** MC ***/ void CBackend::PSOLA_Stretch( float *pInRes, long InSize, float *pOutRes, long OutSize, float *pWindow, long cWindowSize ) { SPDBG_FUNC( "CBackend::PSOLA_Stretch" ); long i, lim; float window, delta, kf; memset( pOutRes, 0, sizeof(float) * OutSize ); lim = MIN(InSize, OutSize ); delta = (float)cWindowSize / (float)lim; kf = 0.5f; pOutRes[0] = pInRes[0]; for( i = 1; i < lim; ++i ) { kf += delta; window = pWindow[(long) kf]; pOutRes[i] += pInRes[i] * window; pOutRes[OutSize - i] += pInRes[InSize - i] * window; } } /* CBackend::PSOLA_Stretch */ /***************************************************************************** * CBackend::PrepareSpeech * *-------------------------* * Description: * ********************************************************************** MC ***/ void CBackend::PrepareSpeech( ISpTTSEngineSite* outputSite ) { SPDBG_FUNC( "CBackend::PrepareSpeech" ); //m_pUnits = pUnits; //m_unitCount = unitCount; //m_CurUnitIndex = 0; m_pOutputSite = outputSite; m_silMode = true; m_durationTarget = 0; m_cOutSamples_Phon = 1; m_cOutEpochs = 0; // Pull model big-bang m_SpeechState = SPEECH_CONTINUE; m_cOutSamples_Total = 0; m_HasSpeech = false; } /* CBackend::PrepareSpeech */ /***************************************************************************** * CBackend::ProsodyMod * *----------------------* * Description: * Calculate the epoch sequence for the synthesized speech * * INPUT: * * OUTPUT: 
* FIlls 'pOutEpoch', 'pMap', and 'pRevFlag' * Returns new epoch count * ********************************************************************** MC ***/ long CBackend::ProsodyMod( UNITINFO *pCurUnit, long cInEpochs, float durationMpy, long cMaxOutEpochs ) { SPDBG_FUNC( "CBackend::ProsodyMod" ); long iframe, framesize, framesizeOut, j; long cntOut, csamplesOut, cOutEpochs; BOOL fUnvoiced; short fReverse; float totalDuration; float durationIn; // Active accum of IN duration float durationOut; // Active accum of OUT duration aligned to IN domain float freqMpy; BOOL fAdvanceInput; float vibrato; unsigned char *SineWavePtr; float epochFreq; float *pTime; float *pF0; iframe = 0; durationIn = 0.0f; durationOut = 0.0f; csamplesOut = 0; cntOut = 0; cOutEpochs = 0; fReverse = false; pTime = pCurUnit->pTime; pF0 = pCurUnit->pF0; //------------------------------------ // Find total input duration //------------------------------------ totalDuration = 0; for( j = 0; j < cInEpochs; ++j ) { totalDuration += ABS(m_pInEpoch[j]); } /*PredictEpochDist( pCurUnit->duration, pCurUnit->nKnots, m_SampleRate, pTime, pF0 );*/ while( iframe < cInEpochs && cOutEpochs < cMaxOutEpochs) { //----------------------------------------- // Compute output frame length //----------------------------------------- if( m_pInEpoch[iframe] < 0 ) { //------------------------------------------------- // Since we can't change unvoiced pitch, // do not change frame size for unvoiced frames //------------------------------------------------- framesize = (long)((-m_pInEpoch[iframe]) + 0.5f); framesizeOut = framesize; fUnvoiced = true; } else { //--------------------------------------------------- // Modify frame size for voiced epoch // based on epoch frequency //--------------------------------------------------- j = 1; //--------------------------------------------------- // Align to appropriate knot bassed on // current output sample //--------------------------------------------------- while( (j < 
(long)pCurUnit->nKnots - 1) && (csamplesOut > pTime[j]) ) j++; //--------------------------------------------------- // Calculate exact pitch thru linear interpolation //--------------------------------------------------- epochFreq = LinInterp( pTime[j - 1], (float)csamplesOut, pTime[j], pF0[j - 1], pF0[j] ); SineWavePtr = (unsigned char*)&g_SineWaveTbl[0]; vibrato = (float)(((unsigned char)(*(SineWavePtr + (m_vibrato_Phase1 >> 16)))) - 128); vibrato *= m_VibratoDepth; //--------------------------------------------------- // Scale frame size using in/out ratio //--------------------------------------------------- epochFreq = epochFreq + vibrato; if( epochFreq < MIN_VOICE_PITCH ) { epochFreq = MIN_VOICE_PITCH; } framesize = (long)(m_pInEpoch[iframe] + 0.5f); framesizeOut = (long)(m_SampleRate / epochFreq); vibrato = ((float)256 / ((float)22050 / m_VibratoFreq)) * (float)framesizeOut; // 3 Hz //vibrato = ((float)256 / (float)7350) * (float)framesizeOut; // 3 Hz m_vibrato_Phase1 += (long)(vibrato * (float)65536); m_vibrato_Phase1 &= 0xFFFFFF; //--------------------------------------------------- // @@@@ REMOVED 2x LIMIT //--------------------------------------------------- /*if( framesizeOut > 2*framesize ) { framesizeOut = 2*framesize; } if( framesize > 2*framesizeOut ) { framesizeOut = framesize/2; }*/ freqMpy = (float) framesize / framesizeOut; fUnvoiced = false; } //------------------------------------------- // Generate next output frame //------------------------------------------- fAdvanceInput = false; if( durationOut + (0.5f * framesizeOut/durationMpy) <= durationIn + framesize ) { //----------------------------------------- // If UNvoiced and odd frame, // reverse residual //----------------------------------------- if( fUnvoiced && (cntOut & 1) ) { m_pRevFlag[cOutEpochs] = true; fReverse = true; } else { m_pRevFlag[cOutEpochs] = false; fReverse = false; } ++cntOut; durationOut += framesizeOut/durationMpy; csamplesOut += framesizeOut; m_pOutEpoch[cOutEpochs] 
= (float)framesizeOut; m_pMap[cOutEpochs] = iframe; cOutEpochs++; } else { fAdvanceInput = true; } //------------------------------------------- // Advance to next input frame //------------------------------------------- if( ((durationOut + (0.5f * framesizeOut/durationMpy)) > (durationIn + framesize)) || //(cntOut >= 3) || @@@@ REMOVED 2x LIMIT //(fReverse == true) || fAdvanceInput ) { durationIn += framesize; ++iframe; cntOut = 0; } } return cOutEpochs; } /* CBackend::ProsodyMod */ /***************************************************************************** * CBackend::LPCFilter * *---------------------* * Description: * LPC filter of order cOrder. It filters the residual signal * pRes, producing output pOutWave. This routine requires that * pOutWave has the true waveform history from [-cOrder,0] and * of course it has to be defined. * ********************************************************************** MC ***/ void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain ) { SPDBG_FUNC( "CBackend::LPCFilter" ); INT t, j; for( t = 0; t < len; t++ ) { m_pHistory[0] = pCurLPC[0] * pCurRes[t]; for( j = m_cOrder; j > 0; j-- ) { m_pHistory[0] -= pCurLPC[j] * m_pHistory[j]; m_pHistory[j] = m_pHistory[j - 1]; } pCurRes[t] = m_pHistory[0] * gain; } } /* CBackend::LPCFilter */ /*void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len ) { long t; for( t = 0; t < len; t++ ) { pCurRes[t] = pCurRes[t] * 10; } } */ /***************************************************************************** * CBackend::ResRecons * *---------------------* * Description: * Obtains output prosody modified residual * ********************************************************************** MC ***/ void CBackend::ResRecons( float *pInRes, long InSize, float *pOutRes, long OutSize, float scale ) { SPDBG_FUNC( "CBackend::ResRecons" ); long i, j; if( m_pRevFlag[m_EpochIndex] ) { //---------------------------------------------------- // Process repeated and reversed 
UNvoiced residual //---------------------------------------------------- for( i = 0, j = OutSize-1; i < OutSize; ++i, --j ) { pOutRes[i] = pInRes[j]; } } else if( InSize == OutSize ) { //---------------------------------------------------- // Unvoiced residual or voiced residual // with no pitch change //---------------------------------------------------- memcpy( pOutRes, pInRes, sizeof(float) *OutSize ); } else { //---------------------------------------------------- // Process voiced residual //---------------------------------------------------- PSOLA_Stretch( pInRes, InSize, pOutRes, OutSize, m_pWindow, m_FFTSize ); } //---------------------------------- // Amplify frame //---------------------------------- if( scale != 1.0f ) { for( i = 0 ; i < OutSize; ++i ) { pOutRes[i] *= scale; } } } /* CBackend::ResRecons */ /***************************************************************************** * CBackend::StartNewUnit * *------------------------* * Description: * Synthesize audio samples for a target unit * * INPUT: * pCurUnit - unit ID, F0, duration, etc. 
* * OUTPUT: * Sets 'pCurUnit->csamplesOut' with audio length * ********************************************************************** MC ***/ HRESULT CBackend::StartNewUnit( ) { SPDBG_FUNC( "CBackend::StartNewUnit" ); long cframeMax = 0, cInEpochs = 0, i; float totalDuration, durationOut, durationMpy = 0; UNITINFO *pCurUnit; HRESULT hr = S_OK; SPEVENT event; ULONGLONG clientInterest; USHORT volumeVal; // Check for VOLUME change if( m_pOutputSite->GetActions() & SPVES_VOLUME ) { hr = m_pOutputSite->GetVolume( &volumeVal ); if ( SUCCEEDED( hr ) ) { if( volumeVal > SPMAX_VOLUME ) { //--- Clip rate to engine maximum volumeVal = SPMAX_VOLUME; } else if ( volumeVal < SPMIN_VOLUME ) { //--- Clip rate to engine minimum volumeVal = SPMIN_VOLUME; } m_MasterVolume = volumeVal; } } //--------------------------------------- // Delete previous unit //--------------------------------------- CleanUpSynth( ); //--------------------------------------- // Get next phon //--------------------------------------- hr = m_pSrcObj->NextData( (void**)&pCurUnit, &m_SpeechState ); if( m_SpeechState == SPEECH_CONTINUE ) { m_HasSpeech = pCurUnit->hasSpeech; m_pOutputSite->GetEventInterest( &clientInterest ); //------------------------------------------------ // Post SENTENCE event //------------------------------------------------ if( (pCurUnit->flags & SENT_START_FLAG) && (clientInterest & SPFEI(SPEI_SENTENCE_BOUNDARY)) ) { event.elParamType = SPET_LPARAM_IS_UNDEFINED; event.eEventId = SPEI_SENTENCE_BOUNDARY; event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample; event.lParam = pCurUnit->sentencePosition; // Input word position event.wParam = pCurUnit->sentenceLen; // Input word length m_pOutputSite->AddEvents( &event, 1 ); } //------------------------------------------------ // Post PHONEME event //------------------------------------------------ if( clientInterest & SPFEI(SPEI_PHONEME) ) { event.elParamType = SPET_LPARAM_IS_UNDEFINED; event.eEventId = SPEI_PHONEME; 
event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample; event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_IPAToAllo[pCurUnit->AlloID]; event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_IPAToAllo[pCurUnit->NextAlloID]; m_pOutputSite->AddEvents( &event, 1 ); } //------------------------------------------------ // Post VISEME event //------------------------------------------------ if( clientInterest & SPFEI(SPEI_VISEME) ) { event.elParamType = SPET_LPARAM_IS_UNDEFINED; event.eEventId = SPEI_VISEME; event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample; event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_AlloToViseme[pCurUnit->AlloID]; event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_AlloToViseme[pCurUnit->NextAlloID]; m_pOutputSite->AddEvents( &event, 1 ); } //------------------------------------------------ // Post any bookmark events //------------------------------------------------ if( pCurUnit->pBMObj != NULL ) { CBookmarkList *pBMObj; BOOKMARK_ITEM* pMarker; //------------------------------------------------- // Retrieve marker strings from Bookmark list and // enter into Event list //------------------------------------------------- pBMObj = (CBookmarkList*)pCurUnit->pBMObj; //cMarkerCount = pBMObj->m_BMList.GetCount(); if( clientInterest & SPFEI(SPEI_TTS_BOOKMARK) ) { //--------------------------------------- // Send event for every bookmark in list //--------------------------------------- SPLISTPOS listPos; listPos = pBMObj->m_BMList.GetHeadPosition(); while( listPos ) { pMarker = (BOOKMARK_ITEM*)pBMObj->m_BMList.GetNext( listPos ); event.eEventId = SPEI_TTS_BOOKMARK; event.elParamType = SPET_LPARAM_IS_STRING; event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample; //--- Copy in bookmark string - has been NULL terminated in source already... event.lParam = pMarker->pBMItem; // Engine must convert string to long for wParam. 
event.wParam = _wtol((WCHAR *)pMarker->pBMItem); m_pOutputSite->AddEvents( &event, 1 ); } } //--------------------------------------------- // We don't need this Bookmark list any more //--------------------------------------------- delete pBMObj; pCurUnit->pBMObj = NULL; } pCurUnit->csamplesOut = 0; //****************************************************** // For SIL, fill buffer with zeros... //****************************************************** if( pCurUnit->UnitID == UNIT_SIL ) { //--------------------------------------------- // Calc SIL length //--------------------------------------------- m_durationTarget = (long)(m_SampleRate * pCurUnit->duration); m_cOutSamples_Phon = 0; m_silMode = true; //--------------------------------------------- // Clear LPC filter storage //--------------------------------------------- memset( m_pHistory, 0, sizeof(float)*(m_cOrder+1) ); //-------------------------------- // Success! //-------------------------------- // Debug macro - output unit data... 
TTSDBG_LOGUNITS; } //****************************************************** // ...otherwise fill buffer with inventory data //****************************************************** else { m_silMode = false; // Get unit data from voice hr = m_pVoiceDataObj->GetUnitData( pCurUnit->UnitID, &m_Synth ); if( SUCCEEDED(hr) ) { durationOut = 0.0f; cInEpochs = m_Synth.cNumEpochs; m_pInEpoch = m_Synth.pEpoch; //cframeMax = PeakValue( m_pInEpoch, cInEpochs ); totalDuration = (float)m_Synth.cNumSamples; //----------------------------------------------- // For debugging: Force duration to unit length //----------------------------------------------- /*float unitDur; unitDur = totalDuration / 22050.0f; if( pCurUnit->duration < unitDur ) { if( pCurUnit->speechRate < 1 ) { pCurUnit->duration = unitDur * pCurUnit->speechRate; } else { pCurUnit->duration = unitDur; } }*/ durationMpy = pCurUnit->duration; cframeMax = (long)pCurUnit->pF0[0]; for( i = 1; i < (long)pCurUnit->nKnots; i++ ) { //----------------------------------------- // Find the longest epoch //----------------------------------------- cframeMax = (long)(MAX(cframeMax,pCurUnit->pF0[i])); } cframeMax *= (long)(durationMpy * MAX_TARGETS_PER_UNIT); durationMpy = (m_SampleRate * durationMpy) / totalDuration; cframeMax += (long)(durationMpy * cInEpochs * MAX_TARGETS_PER_UNIT); // // mplumpe 11/18/97 : added to eliminate chance of crash. // cframeMax *= 2; //--------------------------------------------------- // New epochs adjusted for duration and pitch //--------------------------------------------------- m_pOutEpoch = new float[cframeMax]; if( !m_pOutEpoch ) { //-------------------------------------- // Out of memory! 
//-------------------------------------- hr = E_OUTOFMEMORY; pCurUnit->csamplesOut = 0; CleanUpSynth( ); } } if( SUCCEEDED(hr) ) { //--------------------------------------------------- // Index back to orig epoch //--------------------------------------------------- m_pMap = new long[cframeMax]; if( !m_pMap ) { //-------------------------------------- // Out of memory! //-------------------------------------- hr = E_OUTOFMEMORY; pCurUnit->csamplesOut = 0; CleanUpSynth( ); } } if( SUCCEEDED(hr) ) { //--------------------------------------------------- // TRUE = reverse residual //--------------------------------------------------- m_pRevFlag = new short[cframeMax]; if( !m_pRevFlag ) { //-------------------------------------- // Out of memory! //-------------------------------------- hr = E_OUTOFMEMORY; pCurUnit->csamplesOut = 0; CleanUpSynth( ); } } if( SUCCEEDED(hr) ) { //--------------------------------------------------------------------- // Compute synthesis epochs and corresponding mapping to analysis // fills in: m_pOutEpoch, m_pMap, m_pRevFlag //--------------------------------------------------------------------- m_cOutEpochs = ProsodyMod( pCurUnit, cInEpochs, durationMpy, cframeMax ); //------------------------------------------------ // Now that actual epoch sizes are known, // calculate total audio sample count // @@@@ NO LONGER NEEDED //------------------------------------------------ pCurUnit->csamplesOut = 0; for( i = 0; i < m_cOutEpochs; i++ ) { pCurUnit->csamplesOut += (long)(ABS(m_pOutEpoch[i])); } m_cOutSamples_Phon = 0; m_EpochIndex = 0; m_durationTarget = (long)(pCurUnit->duration * m_SampleRate); m_pInRes = m_Synth.pRes; m_pLPC = m_Synth.pLPC; m_pSynthTime = pCurUnit->pTime; m_pSynthAmp = pCurUnit->pAmp; m_nKnots = pCurUnit->nKnots; // NOTE: Maybe make log volume? 
m_UnitVolume = (float)pCurUnit->user_Volume / 100.0f; //------------------------------------------------ // Post WORD event //------------------------------------------------ if( (pCurUnit->flags & WORD_START_FLAG) && (clientInterest & SPFEI(SPEI_WORD_BOUNDARY)) ) { event.elParamType = SPET_LPARAM_IS_UNDEFINED; event.eEventId = SPEI_WORD_BOUNDARY; event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample; event.lParam = pCurUnit->srcPosition; // Input word position event.wParam = pCurUnit->srcLen; // Input word length m_pOutputSite->AddEvents( &event, 1 ); } //--- Debug macro - output unit data TTSDBG_LOGUNITS; } } } return hr; } /* CBackend::StartNewUnit */ /***************************************************************************** * CBackend::CleanUpSynth * *------------------------* * Description: * ********************************************************************** MC ***/ void CBackend::CleanUpSynth( ) { SPDBG_FUNC( "CBackend::CleanUpSynth" ); if( m_pOutEpoch ) { delete m_pOutEpoch; m_pOutEpoch = NULL; } if( m_pMap ) { delete m_pMap; m_pMap = NULL; } if( m_pRevFlag ) { delete m_pRevFlag; m_pRevFlag = NULL; } // NOTE: make object? FreeSynth( &m_Synth ); } /* CBackend::CleanUpSynth */ /***************************************************************************** * CBackend::RenderFrame * *-----------------------* * Description: * This this the central synthesis loop. Keep filling output audio * buffer until buffer frame is full or speech is done. To render * continous speech, get each unit one at a time from upstream buffer. 
*
*   Each pass of the loop does exactly one of:
*       - write one zero sample (silence unit still running)
*       - synthesize one output epoch (residual stretch + LPC filter)
*       - fetch the next unit with StartNewUnit() (adds no samples)
*   The frame, phoneme and total sample counters advance by the number of
*   samples the pass produced.
*
*   OUTPUT:
*       HRESULT of the last StartNewUnit() or of the Write() to the site.
*       Nothing is written when StartNewUnit() failed.
*
********************************************************************** MC ***/
HRESULT CBackend::RenderFrame( )
{
    SPDBG_FUNC( "CBackend::RenderFrame" );
    long        InSize, OutSize;
    long        iframe;
    float       *pCurInRes, *pCurOutRes;
    long        i, j;
    float       ampMpy;
    HRESULT     hr = S_OK;

    m_cOutSamples_Frame = 0;
    do
    {
        // Samples produced by this pass; stays 0 when a new unit is fetched
        OutSize = 0;
        if( m_silMode )
        {
            //-------------------------------
            // Silence mode
            //-------------------------------
            if( m_cOutSamples_Phon >= m_durationTarget )
            {
                //---------------------------
                // Get next unit
                //---------------------------
                hr = StartNewUnit( );
                if (FAILED(hr))
                {
                    //-----------------------------------
                    // Try to end it gracefully...
                    //-----------------------------------
                    m_SpeechState = SPEECH_DONE;
                }
                TTSDBG_LOGSILEPOCH;
            }
            else
            {
                //---------------------------
                // Continue with current SIL
                // (one zero sample per pass)
                //---------------------------
                m_pSpeechBuf[m_cOutSamples_Frame] = 0;
                OutSize = 1;
            }
        }
        else
        {
            if( m_EpochIndex < m_cOutEpochs )
            {
                //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                //
                // Continue with current phon
                //
                //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                //------------------------------------
                // Find current input residual:
                // skip the epochs that precede the
                // mapped input frame (epoch lengths
                // may be stored negative, hence ABS)
                //------------------------------------
                iframe = m_pMap[m_EpochIndex];
                pCurInRes = m_pInRes;
                for( i = 0; i < iframe; i++)
                {
                    pCurInRes += (long) ABS(m_pInEpoch[i]);
                }

                pCurOutRes  = m_pSpeechBuf + m_cOutSamples_Frame;
                InSize      = (long)(ABS(m_pInEpoch[iframe]));
                OutSize     = (long)(ABS(m_pOutEpoch[m_EpochIndex]));
                //------------------------------------
                // Safety net: an epoch that would run
                // past the end of the audio buffer is
                // shortened so that it fits into the
                // overflow area
                //------------------------------------
                if (m_cOutSamples_Frame + OutSize > SPEECH_FRAME_SIZE + SPEECH_FRAME_OVER)
                {
                    m_pOutEpoch[m_EpochIndex] = SPEECH_FRAME_OVER-1;    // still huge
                    OutSize = (long)(ABS(m_pOutEpoch[m_EpochIndex]));
                }
                //------------------------------------
                // Amplitude for this epoch: linear
                // interpolation between the two knots
                // around the current phoneme position
                //------------------------------------
                j = 1;
                while( (j < m_nKnots - 1) && (m_cOutSamples_Phon > m_pSynthTime[j]) )
                {
                    j++;
                }
                ampMpy = LinInterp( m_pSynthTime[j - 1], (float)m_cOutSamples_Phon, m_pSynthTime[j], m_pSynthAmp[j - 1], m_pSynthAmp[j] );
                //ampMpy = 1;

                //--------------------------------------------
                // Do stretching of residuals
                //--------------------------------------------
                ResRecons( pCurInRes, InSize, pCurOutRes, OutSize, ampMpy );

                //--------------------------------------------
                // Do LPC reconstruction
                // Gain = master volume taper x unit volume
                // taper. The first coefficient of the frame's
                // LPC vector is forced to 1.0 before filtering
                // (this writes into the unit's LPC data).
                //--------------------------------------------
                float       *pCurLPC;
                float       totalGain;

                totalGain = ExpConverter( ((float)m_MasterVolume / (float)SPMAX_VOLUME), m_linearScale )
                            * ExpConverter( m_UnitVolume, m_linearScale );

                pCurLPC = m_pLPC + m_pMap[m_EpochIndex] * (1 + m_cOrder);
                pCurLPC[0] = 1.0f;
                LPCFilter( pCurLPC, &m_pSpeechBuf[m_cOutSamples_Frame], OutSize, totalGain );
                m_EpochIndex++;
            }
            else
            {
                //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                //
                // Get next phon
                //
                //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
                hr = StartNewUnit( );
                if (FAILED(hr))
                {
                    //-----------------------------------
                    // Try to end it gracefully...
                    //-----------------------------------
                    m_SpeechState = SPEECH_DONE;
                }
                TTSDBG_LOGSILEPOCH;
            }
        }
        m_cOutSamples_Frame += OutSize;
        m_cOutSamples_Phon += OutSize;
        m_cOutSamples_Total += OutSize;

        TTSDBG_LOGEPOCHS;
    }
    while( (m_cOutSamples_Frame < SPEECH_FRAME_SIZE) && (m_SpeechState == SPEECH_CONTINUE) );

    if( SUCCEEDED(hr) )
    {
        //----------------------------------------------
        // Convert buffer from FLOAT to SHORT
        //----------------------------------------------
        if( m_pReverb )
        {
            //---------------------------------
            // Add REVERB
            // NOTE(review): no CvtToShort on
            // this path - presumably the reverb
            // object does the conversion itself;
            // confirm in CReverbFX
            //---------------------------------
            m_pReverb->Reverb_Process( m_pSpeechBuf, m_cOutSamples_Frame, 1.0f );
        }
        else
        {
            CvtToShort( m_pSpeechBuf, m_cOutSamples_Frame, m_StereoOut, 1.0f );
        }

        //--- Debug Macro - output wave data to stream
        TTSDBG_LOGWAVE;
    }

    if( SUCCEEDED( hr ) )
    {
        //------------------------------------
        // Send this buffer to SAPI site
        //------------------------------------
        DWORD   cbWritten;

        //------------------------------------------------------------------------------------
        // This was my lame hack to avoid sending buffers when nothing was spoken.
        // It was causing problems (among others) since StartNewUnit() was still sending
        // events - with no corresponding audio buffer!
        //
        // This was too simple of a scheme. Disable this feature for now...
        // ...until I come up with something more robust. (MC)
        //------------------------------------------------------------------------------------

        //if( m_HasSpeech )
        {
            hr = m_pOutputSite->Write( (void*)m_pSpeechBuf,
                                       m_cOutSamples_Frame * m_BytesPerSample,
                                       &cbWritten );
            if( FAILED( hr ) )
            {
                //----------------------------------------
                // Abort! Unable to write audio data
                //----------------------------------------
                m_SpeechState = SPEECH_DONE;
            }
        }
    }

    //------------------------------------
    // Return render state
    //------------------------------------
    return hr;
} /* CBackend::RenderFrame */