Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1327 lines
43 KiB

  1. /*******************************************************************************
  2. * Backend.cpp *
  3. *-------------*
  4. * Description:
  5. * This module is the implementation file for the CBackend class.
  6. *-------------------------------------------------------------------------------
  7. * Created By: mc Date: 03/12/99
  8. * Copyright (C) 1999 Microsoft Corporation
  9. * All Rights Reserved
  10. *
  11. *******************************************************************************/
  12. #include "stdafx.h"
  13. #ifndef __spttseng_h__
  14. #include "spttseng.h"
  15. #endif
  16. #ifndef Backend_H
  17. #include "Backend.h"
  18. #endif
  19. #ifndef FeedChain_H
  20. #include "FeedChain.h"
  21. #endif
  22. #ifndef SPDebug_h
  23. #include <spdebug.h>
  24. #endif
  25. //-----------------------------
  26. // Data.cpp
  27. //-----------------------------
  28. extern const short g_IPAToAllo[];
  29. extern const short g_AlloToViseme[];
  30. //--------------------------------------
  31. // DEBUG: Save utterance WAV file
  32. //--------------------------------------
  33. //#define SAVE_WAVE_FILE 1
  //--------------------------------------------------------------------
  // g_SineWaveTbl
  //
  // 256-entry table of unsigned 8-bit samples (16 rows of 16 values,
  // ranging from 0x09 to 0xf5). CBackend::ProsodyMod indexes it with
  // the upper bits of 'm_vibrato_Phase1' and subtracts 128 from the
  // fetched value to obtain a signed vibrato offset.
  //--------------------------------------------------------------------
  const unsigned char g_SineWaveTbl[] =
  {
      0x7b,0x7e,0x81,0x84,0x87,0x89,0x8c,0x8f,0x92,0x95,0x98,0x9b,0x9d,0xa0,0xa3,0xa6,    // [  0.. 15]
      0xa8,0xab,0xae,0xb0,0xb3,0xb5,0xb8,0xbb,0xbd,0xbf,0xc2,0xc4,0xc7,0xc9,0xcb,0xcd,    // [ 16.. 31]
      0xcf,0xd1,0xd3,0xd5,0xd7,0xd9,0xdb,0xdd,0xdf,0xe0,0xe2,0xe3,0xe5,0xe6,0xe8,0xe9,    // [ 32.. 47]
      0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,0xf3,0xf4,0xf4,0xf4,0xf4,    // [ 48.. 63]
      0xf5,0xf5,0xf5,0xf5,0xf4,0xf4,0xf4,0xf4,0xf3,0xf3,0xf2,0xf1,0xf1,0xf0,0xef,0xee,    // [ 64.. 79]
      0xed,0xec,0xeb,0xea,0xe9,0xe7,0xe6,0xe5,0xe3,0xe1,0xe0,0xde,0xdc,0xdb,0xd9,0xd7,    // [ 80.. 95]
      0xd5,0xd3,0xd1,0xcf,0xcd,0xcb,0xc8,0xc6,0xc4,0xc1,0xbf,0xbc,0xba,0xb7,0xb5,0xb2,    // [ 96..111]
      0xb0,0xad,0xaa,0xa8,0xa5,0xa2,0x9f,0x9d,0x9a,0x97,0x94,0x91,0x8f,0x8c,0x89,0x86,    // [112..127]
      0x83,0x80,0x7d,0x7a,0x77,0x75,0x72,0x6f,0x6c,0x69,0x66,0x64,0x61,0x5e,0x5b,0x58,    // [128..143]
      0x56,0x53,0x50,0x4e,0x4b,0x49,0x46,0x44,0x41,0x3f,0x3c,0x3a,0x38,0x35,0x33,0x31,    // [144..159]
      0x2f,0x2d,0x2b,0x29,0x27,0x25,0x23,0x21,0x1f,0x1e,0x1c,0x1b,0x19,0x18,0x16,0x15,    // [160..175]
      0x14,0x13,0x12,0x11,0x10,0x0f,0x0e,0x0d,0x0c,0x0c,0x0b,0x0b,0x0a,0x0a,0x0a,0x0a,    // [176..191]
      0x09,0x09,0x09,0x09,0x0a,0x0a,0x0a,0x0a,0x0b,0x0b,0x0c,0x0d,0x0d,0x0e,0x0f,0x10,    // [192..207]
      0x11,0x12,0x13,0x14,0x15,0x17,0x18,0x1a,0x1b,0x1d,0x1e,0x20,0x22,0x23,0x25,0x27,    // [208..223]
      0x29,0x2b,0x2d,0x2f,0x31,0x34,0x36,0x38,0x3a,0x3d,0x3f,0x42,0x44,0x47,0x49,0x4c,    // [224..239]
      0x4e,0x51,0x54,0x56,0x59,0x5c,0x5f,0x61,0x64,0x67,0x6a,0x6d,0x6f,0x72,0x75,0x78     // [240..255]
  };
  53. /*void PredictEpochDist( float duration,
  54. long nKnots,
  55. float SampleRate,
  56. float *pTime,
  57. float *pF0)
  58. {
  59. long curSamplesOut, endSample, j;
  60. float epochFreq;
  61. long epochLen, epochCount;
  62. curSamplesOut = 0;
  63. endSample = (long) (SampleRate * duration );
  64. epochCount = 0;
  65. while( curSamplesOut < endSample )
  66. {
  67. j = 1;
  68. //---------------------------------------------------
  69. // Align to appropriate knot bassed on
  70. // current output sample
  71. //---------------------------------------------------
  72. while( (j < nKnots - 1) && (curSamplesOut > pTime[j]) )
  73. j++;
  74. //---------------------------------------------------
  75. // Calculate exact pitch thru linear interpolation
  76. //---------------------------------------------------
  77. epochFreq = LinInterp( pTime[j - 1], curSamplesOut, pTime[j], pF0[j - 1], pF0[j] );
  78. //---------------------------------------------------
  79. // Calc sample count for curent epoch
  80. //---------------------------------------------------
  81. epochLen = (long) (SampleRate / epochFreq);
  82. epochCount++;
  83. curSamplesOut += epochLen;
  84. }
  85. }
  86. */
  87. /*****************************************************************************
  88. * CBackend::CBackend *
  89. *--------------------*
  90. * Description: Constructor
  91. *
  92. ********************************************************************** MC ***/
  93. CBackend::CBackend( )
  94. {
  95. SPDBG_FUNC( "CBackend::CBackend" );
  96. m_pHistory = NULL;
  97. m_pHistory2 = NULL;
  98. m_pFilter = NULL;
  99. m_pReverb = NULL;
  100. m_pOutEpoch = NULL;
  101. m_pMap = NULL;
  102. m_pRevFlag = NULL;
  103. m_pSpeechBuf = NULL;
  104. m_VibratoDepth = 0;
  105. m_UnitVolume = 1.0f;
  106. m_MasterVolume = SPMAX_VOLUME;
  107. memset( &m_Synth, 0, sizeof(MSUNITDATA) );
  108. } /* CBackend::CBackend */
  109. /*****************************************************************************
  110. * CBackend::~CBackend *
  111. *---------------------*
  112. * Description: Destructor
  113. *
  114. ********************************************************************** MC ***/
  115. CBackend::~CBackend( )
  116. {
  117. SPDBG_FUNC( "CBackend::~CBackend" );
  118. Release();
  119. } /* CBackend::~CBackend */
  120. /*****************************************************************************
  121. * CBackend::Release *
  122. *---------------------*
  123. * Description:
  124. * Free memory allocaterd by Backend
  125. *
  126. ********************************************************************** MC ***/
  127. void CBackend::Release( )
  128. {
  129. SPDBG_FUNC( "CBackend::Release" );
  130. CleanUpSynth( );
  131. if( m_pSpeechBuf)
  132. {
  133. delete m_pSpeechBuf;
  134. m_pSpeechBuf = NULL;
  135. }
  136. if( m_pHistory )
  137. {
  138. delete m_pHistory;
  139. m_pHistory = NULL;
  140. }
  141. if( m_pHistory2 )
  142. {
  143. delete m_pHistory2;
  144. m_pHistory2 = NULL;
  145. }
  146. if( m_pReverb )
  147. {
  148. delete m_pReverb;
  149. m_pReverb = NULL;
  150. }
  151. } /* CBackend::Release */
  152. /*****************************************************************************
  153. * CBackend::Init *
  154. *----------------*
  155. * Description:
  156. * Opens a backend instance, keeping a pointer of the acoustic
  157. * inventory.
  158. *
  159. ********************************************************************** MC ***/
  160. HRESULT CBackend::Init( IMSVoiceData* pVoiceDataObj, CFeedChain *pSrcObj, MSVOICEINFO* pVoiceInfo )
  161. {
  162. SPDBG_FUNC( "CBackend::Init" );
  163. long LPCsize = 0;
  164. HRESULT hr = S_OK;
  165. m_pVoiceDataObj = pVoiceDataObj;
  166. m_SampleRate = (float)pVoiceInfo->SampleRate;
  167. m_pSrcObj = pSrcObj;
  168. m_cOrder = pVoiceInfo->LPCOrder;
  169. m_pWindow = pVoiceInfo->pWindow;
  170. m_FFTSize = pVoiceInfo->FFTSize;
  171. m_VibratoDepth = ((float)pVoiceInfo->VibratoDepth) / 100.0f;
  172. m_VibratoDepth = 0; // NOTE: disable vibrato
  173. m_VibratoFreq = pVoiceInfo->VibratoFreq;
  174. if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF )
  175. {
  176. m_StereoOut = true;
  177. m_BytesPerSample = 4;
  178. }
  179. else
  180. {
  181. m_StereoOut = false;
  182. m_BytesPerSample = 2;
  183. }
  184. //---------------------------------------
  185. // Allocate AUDIO buffer
  186. //---------------------------------------
  187. m_pSpeechBuf = new float[SPEECH_FRAME_SIZE + SPEECH_FRAME_OVER];
  188. if( m_pSpeechBuf == NULL )
  189. {
  190. //--------------------------------------
  191. // Out of memory!
  192. //--------------------------------------
  193. hr = E_OUTOFMEMORY;
  194. }
  195. if( SUCCEEDED(hr) )
  196. {
  197. //---------------------------------------
  198. // Allocate HISTORY buffer
  199. //---------------------------------------
  200. LPCsize = m_cOrder + 1;
  201. m_pHistory = new float[LPCsize];
  202. if( m_pHistory == NULL )
  203. {
  204. //--------------------------------------
  205. // Out of memory!
  206. //--------------------------------------
  207. hr = E_OUTOFMEMORY;
  208. }
  209. }
  210. if( SUCCEEDED(hr) )
  211. {
  212. memset( m_pHistory, 0, LPCsize * sizeof(float) );
  213. m_pOutEpoch = NULL;
  214. m_pMap = NULL;
  215. m_pRevFlag = NULL;
  216. m_fModifiers = 0;
  217. m_vibrato_Phase1 = 0;
  218. //--------------------------------
  219. // Reverb Effect
  220. //--------------------------------
  221. //pVoiceInfo->eReverbType = REVERB_TYPE_HALL;
  222. if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF )
  223. {
  224. //--------------------------------
  225. // Create ReverbFX object
  226. //--------------------------------
  227. if( m_pReverb == NULL )
  228. {
  229. m_pReverb = new CReverbFX;
  230. if( m_pReverb )
  231. {
  232. short result;
  233. result = m_pReverb->Reverb_Init( pVoiceInfo->eReverbType, (long)m_SampleRate, m_StereoOut );
  234. if( result != KREVERB_NOERROR )
  235. {
  236. //--------------------------------------------
  237. // Not enough memory to do reverb
  238. // Recover gracefully
  239. //--------------------------------------------
  240. delete m_pReverb;
  241. m_pReverb = NULL;
  242. }
  243. /*else
  244. {
  245. //--------------------------------------------------------
  246. // Init was successful, ready to do reverb now
  247. //--------------------------------------------------------
  248. }*/
  249. }
  250. }
  251. }
  252. //----------------------------
  253. // Linear taper region scale
  254. //----------------------------
  255. m_linearScale = (float) pow( 10.0, (double)((1.0f - LINEAR_BKPT) * LOG_RANGE) / 20.0 );
  256. #ifdef SAVE_WAVE_FILE
  257. m_SaveFile = (PCSaveWAV) new CSaveWAV; // No check needed, if this fails, we simply don't save file.
  258. if( m_SaveFile )
  259. {
  260. m_SaveFile->OpenWavFile( (long)m_SampleRate );
  261. }
  262. #endif
  263. }
  264. else
  265. {
  266. if( m_pSpeechBuf )
  267. {
  268. delete m_pSpeechBuf;
  269. m_pSpeechBuf = NULL;
  270. }
  271. if( m_pHistory )
  272. {
  273. delete m_pHistory;
  274. m_pHistory = NULL;
  275. }
  276. }
  277. return hr;
  278. } /* CBackend::Init */
  279. /*****************************************************************************
  280. * CBackend::FreeSynth *
  281. *---------------------*
  282. * Description:
  283. * Return TRUE if consoants can be clustered.
  284. *
  285. ********************************************************************** MC ***/
  286. void CBackend::FreeSynth( MSUNITDATA* pSynth )
  287. {
  288. SPDBG_FUNC( "CBackend::FreeSynth" );
  289. if( pSynth->pEpoch )
  290. {
  291. delete pSynth->pEpoch;
  292. pSynth->pEpoch = NULL;
  293. }
  294. if( pSynth->pRes )
  295. {
  296. delete pSynth->pRes;
  297. pSynth->pRes = NULL;
  298. }
  299. if( pSynth->pLPC )
  300. {
  301. delete pSynth->pLPC;
  302. pSynth->pLPC = NULL;
  303. }
  304. } /* CBackend::FreeSynth */
  305. /*****************************************************************************
  306. * ExpConverter *
  307. *--------------*
  308. * Description:
  309. * Convert linear to exponential taper
  310. * 'ref' is a linear value between 0.0 to 1.0
  311. *
  312. ********************************************************************** MC ***/
  313. static float ExpConverter( float ref, float linearScale )
  314. {
  315. SPDBG_FUNC( "ExpConverter" );
  316. float audioGain;
  317. if( ref < LINEAR_BKPT)
  318. {
  319. //----------------------------------------
  320. // Linear taper below LINEAR_BKPT
  321. //----------------------------------------
  322. audioGain = linearScale * (ref / LINEAR_BKPT);
  323. }
  324. else
  325. {
  326. //----------------------------------------
  327. // Log taper above LINEAR_BKPT
  328. //----------------------------------------
  329. audioGain = (float) pow( 10.0, (double)((1.0f - ref) * LOG_RANGE) / 20.0 );
  330. }
  331. return audioGain;
  332. } /* ExpConverter */
  333. /*****************************************************************************
  334. * CBackend::CvtToShort *
  335. *----------------------*
  336. * Description:
  337. * Convert (in place) FLOAT audio to SHORT.
  338. *
  339. ********************************************************************** MC ***/
  340. void CBackend::CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain )
  341. {
  342. SPDBG_FUNC( "CBackend::CvtToShort" );
  343. long i;
  344. short *pDest;
  345. float fSamp;
  346. pDest = (short*)pSrc;
  347. for( i = 0; i < blocksize; ++i )
  348. {
  349. //------------------------
  350. // Read float sample...
  351. //------------------------
  352. fSamp = (*pSrc++) * audioGain;
  353. //------------------------
  354. // ...clip to 16-bits...
  355. //------------------------
  356. if( fSamp > 32767 )
  357. {
  358. fSamp = 32767;
  359. }
  360. else if( fSamp < (-32768) )
  361. {
  362. fSamp = (-32768);
  363. }
  364. //------------------------
  365. // ...save as SHORT
  366. //------------------------
  367. *pDest++ = (short)fSamp;
  368. if( stereoOut )
  369. {
  370. *pDest++ = (short)(0 - (int)fSamp);
  371. }
  372. }
  373. } /* CBackend::CvtToShort */
  374. /*****************************************************************************
  375. * CBackend::PSOLA_Stretch *
  376. *-------------------------*
  377. * Description:
  378. * Does PSOLA epoch stretching or compressing
  379. *
  380. ********************************************************************** MC ***/
  381. void CBackend::PSOLA_Stretch( float *pInRes, long InSize,
  382. float *pOutRes, long OutSize,
  383. float *pWindow,
  384. long cWindowSize )
  385. {
  386. SPDBG_FUNC( "CBackend::PSOLA_Stretch" );
  387. long i, lim;
  388. float window, delta, kf;
  389. memset( pOutRes, 0, sizeof(float) * OutSize );
  390. lim = MIN(InSize, OutSize );
  391. delta = (float)cWindowSize / (float)lim;
  392. kf = 0.5f;
  393. pOutRes[0] = pInRes[0];
  394. for( i = 1; i < lim; ++i )
  395. {
  396. kf += delta;
  397. window = pWindow[(long) kf];
  398. pOutRes[i] += pInRes[i] * window;
  399. pOutRes[OutSize - i] += pInRes[InSize - i] * window;
  400. }
  401. } /* CBackend::PSOLA_Stretch */
  402. /*****************************************************************************
  403. * CBackend::PrepareSpeech *
  404. *-------------------------*
  405. * Description:
  406. *
  407. ********************************************************************** MC ***/
  408. void CBackend::PrepareSpeech( ISpTTSEngineSite* outputSite )
  409. {
  410. SPDBG_FUNC( "CBackend::PrepareSpeech" );
  411. //m_pUnits = pUnits;
  412. //m_unitCount = unitCount;
  413. //m_CurUnitIndex = 0;
  414. m_pOutputSite = outputSite;
  415. m_silMode = true;
  416. m_durationTarget = 0;
  417. m_cOutSamples_Phon = 1;
  418. m_cOutEpochs = 0; // Pull model big-bang
  419. m_SpeechState = SPEECH_CONTINUE;
  420. m_cOutSamples_Total = 0;
  421. m_HasSpeech = false;
  422. } /* CBackend::PrepareSpeech */
  423. /*****************************************************************************
  424. * CBackend::ProsodyMod *
  425. *----------------------*
  426. * Description:
  427. * Calculate the epoch sequence for the synthesized speech
  428. *
  429. * INPUT:
  430. *
  431. * OUTPUT:
  432. * FIlls 'pOutEpoch', 'pMap', and 'pRevFlag'
  433. * Returns new epoch count
  434. *
  435. ********************************************************************** MC ***/
  436. long CBackend::ProsodyMod( UNITINFO *pCurUnit,
  437. long cInEpochs,
  438. float durationMpy )
  439. {
  440. SPDBG_FUNC( "CBackend::ProsodyMod" );
  441. long iframe, framesize, framesizeOut, j;
  442. long cntOut, csamplesOut, cOutEpochs;
  443. BOOL fUnvoiced;
  444. short fReverse;
  445. float totalDuration;
  446. float durationIn; // Active accum of IN duration
  447. float durationOut; // Active accum of OUT duration aligned to IN domain
  448. float freqMpy;
  449. BOOL fAdvanceInput;
  450. float vibrato;
  451. unsigned char *SineWavePtr;
  452. float epochFreq;
  453. float *pTime;
  454. float *pF0;
  455. iframe = 0;
  456. durationIn = 0.0f;
  457. durationOut = 0.0f;
  458. csamplesOut = 0;
  459. cntOut = 0;
  460. cOutEpochs = 0;
  461. fReverse = false;
  462. pTime = pCurUnit->pTime;
  463. pF0 = pCurUnit->pF0;
  464. //------------------------------------
  465. // Find total input duration
  466. //------------------------------------
  467. totalDuration = 0;
  468. for( j = 0; j < cInEpochs; ++j )
  469. {
  470. totalDuration += ABS(m_pInEpoch[j]);
  471. }
  472. /*PredictEpochDist( pCurUnit->duration,
  473. pCurUnit->nKnots,
  474. m_SampleRate,
  475. pTime,
  476. pF0 );*/
  477. while( iframe < cInEpochs )
  478. {
  479. //-----------------------------------------
  480. // Compute output frame length
  481. //-----------------------------------------
  482. if( m_pInEpoch[iframe] < 0 )
  483. {
  484. //-------------------------------------------------
  485. // Since we can't change unvoiced pitch,
  486. // do not change frame size for unvoiced frames
  487. //-------------------------------------------------
  488. framesize = (long)((-m_pInEpoch[iframe]) + 0.5f);
  489. framesizeOut = framesize;
  490. fUnvoiced = true;
  491. }
  492. else
  493. {
  494. //---------------------------------------------------
  495. // Modify frame size for voiced epoch
  496. // based on epoch frequency
  497. //---------------------------------------------------
  498. j = 1;
  499. //---------------------------------------------------
  500. // Align to appropriate knot bassed on
  501. // current output sample
  502. //---------------------------------------------------
  503. while( (j < (long)pCurUnit->nKnots - 1) && (csamplesOut > pTime[j]) )
  504. j++;
  505. //---------------------------------------------------
  506. // Calculate exact pitch thru linear interpolation
  507. //---------------------------------------------------
  508. epochFreq = LinInterp( pTime[j - 1], (float)csamplesOut, pTime[j], pF0[j - 1], pF0[j] );
  509. SineWavePtr = (unsigned char*)&g_SineWaveTbl[0];
  510. vibrato = (float)(((unsigned char)(*(SineWavePtr + (m_vibrato_Phase1 >> 16)))) - 128);
  511. vibrato *= m_VibratoDepth;
  512. //---------------------------------------------------
  513. // Scale frame size using in/out ratio
  514. //---------------------------------------------------
  515. epochFreq = epochFreq + vibrato;
  516. if( epochFreq < MIN_VOICE_PITCH )
  517. {
  518. epochFreq = MIN_VOICE_PITCH;
  519. }
  520. framesize = (long)(m_pInEpoch[iframe] + 0.5f);
  521. framesizeOut = (long)(m_SampleRate / epochFreq);
  522. vibrato = ((float)256 / ((float)22050 / m_VibratoFreq)) * (float)framesizeOut; // 3 Hz
  523. //vibrato = ((float)256 / (float)7350) * (float)framesizeOut; // 3 Hz
  524. m_vibrato_Phase1 += (long)(vibrato * (float)65536);
  525. m_vibrato_Phase1 &= 0xFFFFFF;
  526. //---------------------------------------------------
  527. // @@@@ REMOVED 2x LIMIT
  528. //---------------------------------------------------
  529. /*if( framesizeOut > 2*framesize )
  530. {
  531. framesizeOut = 2*framesize;
  532. }
  533. if( framesize > 2*framesizeOut )
  534. {
  535. framesizeOut = framesize/2;
  536. }*/
  537. freqMpy = (float) framesize / framesizeOut;
  538. fUnvoiced = false;
  539. }
  540. //-------------------------------------------
  541. // Generate next output frame
  542. //-------------------------------------------
  543. fAdvanceInput = false;
  544. if( durationOut + (0.5f * framesizeOut/durationMpy) <= durationIn + framesize )
  545. {
  546. //-----------------------------------------
  547. // If UNvoiced and odd frame,
  548. // reverse residual
  549. //-----------------------------------------
  550. if( fUnvoiced && (cntOut & 1) )
  551. {
  552. m_pRevFlag[cOutEpochs] = true;
  553. fReverse = true;
  554. }
  555. else
  556. {
  557. m_pRevFlag[cOutEpochs] = false;
  558. fReverse = false;
  559. }
  560. ++cntOut;
  561. durationOut += framesizeOut/durationMpy;
  562. csamplesOut += framesizeOut;
  563. m_pOutEpoch[cOutEpochs] = (float)framesizeOut;
  564. m_pMap[cOutEpochs] = iframe;
  565. cOutEpochs++;
  566. }
  567. else
  568. {
  569. fAdvanceInput = true;
  570. }
  571. //-------------------------------------------
  572. // Advance to next input frame
  573. //-------------------------------------------
  574. if( ((durationOut + (0.5f * framesizeOut/durationMpy)) > (durationIn + framesize)) ||
  575. //(cntOut >= 3) || @@@@ REMOVED 2x LIMIT
  576. //(fReverse == true) ||
  577. fAdvanceInput )
  578. {
  579. durationIn += framesize;
  580. ++iframe;
  581. cntOut = 0;
  582. }
  583. }
  584. return cOutEpochs;
  585. } /* CBackend::ProsodyMod */
  586. /*****************************************************************************
  587. * CBackend::LPCFilter *
  588. *---------------------*
  589. * Description:
  590. * LPC filter of order cOrder. It filters the residual signal
  591. * pRes, producing output pOutWave. This routine requires that
  592. * pOutWave has the true waveform history from [-cOrder,0] and
  593. * of course it has to be defined.
  594. *
  595. ********************************************************************** MC ***/
  596. void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain )
  597. {
  598. SPDBG_FUNC( "CBackend::LPCFilter" );
  599. INT t, j;
  600. for( t = 0; t < len; t++ )
  601. {
  602. m_pHistory[0] = pCurLPC[0] * pCurRes[t];
  603. for( j = m_cOrder; j > 0; j-- )
  604. {
  605. m_pHistory[0] -= pCurLPC[j] * m_pHistory[j];
  606. m_pHistory[j] = m_pHistory[j - 1];
  607. }
  608. pCurRes[t] = m_pHistory[0] * gain;
  609. }
  610. } /* CBackend::LPCFilter */
  611. /*void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len )
  612. {
  613. long t;
  614. for( t = 0; t < len; t++ )
  615. {
  616. pCurRes[t] = pCurRes[t] * 10;
  617. }
  618. }
  619. */
  620. /*****************************************************************************
  621. * CBackend::ResRecons *
  622. *---------------------*
  623. * Description:
  624. * Obtains output prosody modified residual
  625. *
  626. ********************************************************************** MC ***/
  627. void CBackend::ResRecons( float *pInRes,
  628. long InSize,
  629. float *pOutRes,
  630. long OutSize,
  631. float scale )
  632. {
  633. SPDBG_FUNC( "CBackend::ResRecons" );
  634. long i, j;
  635. if( m_pRevFlag[m_EpochIndex] )
  636. {
  637. //----------------------------------------------------
  638. // Process repeated and reversed UNvoiced residual
  639. //----------------------------------------------------
  640. for( i = 0, j = OutSize-1; i < OutSize; ++i, --j )
  641. {
  642. pOutRes[i] = pInRes[j];
  643. }
  644. }
  645. else if( InSize == OutSize )
  646. {
  647. //----------------------------------------------------
  648. // Unvoiced residual or voiced residual
  649. // with no pitch change
  650. //----------------------------------------------------
  651. memcpy( pOutRes, pInRes, sizeof(float) *OutSize );
  652. }
  653. else
  654. {
  655. //----------------------------------------------------
  656. // Process voiced residual
  657. //----------------------------------------------------
  658. PSOLA_Stretch( pInRes, InSize, pOutRes, OutSize, m_pWindow, m_FFTSize );
  659. }
  660. //----------------------------------
  661. // Amplify frame
  662. //----------------------------------
  663. if( scale != 1.0f )
  664. {
  665. for( i = 0 ; i < OutSize; ++i )
  666. {
  667. pOutRes[i] *= scale;
  668. }
  669. }
  670. } /* CBackend::ResRecons */
  671. /*****************************************************************************
  672. * CBackend::StartNewUnit *
  673. *------------------------*
  674. * Description:
  675. * Synthesize audio samples for a target unit
  676. *
  677. * INPUT:
  678. * pCurUnit - unit ID, F0, duration, etc.
  679. *
  680. * OUTPUT:
  681. * Sets 'pCurUnit->csamplesOut' with audio length
  682. *
  683. ********************************************************************** MC ***/
  684. HRESULT CBackend::StartNewUnit( )
  685. {
  686. SPDBG_FUNC( "CBackend::StartNewUnit" );
  687. long cframeMax = 0, cInEpochs = 0, i;
  688. float totalDuration, durationOut, durationMpy = 0;
  689. UNITINFO *pCurUnit;
  690. HRESULT hr = S_OK;
  691. SPEVENT event;
  692. ULONGLONG clientInterest;
  693. USHORT volumeVal;
  694. // Check for VOLUME change
  695. if( m_pOutputSite->GetActions() & SPVES_VOLUME )
  696. {
  697. hr = m_pOutputSite->GetVolume( &volumeVal );
  698. if ( SUCCEEDED( hr ) )
  699. {
  700. if( volumeVal > SPMAX_VOLUME )
  701. {
  702. //--- Clip rate to engine maximum
  703. volumeVal = SPMAX_VOLUME;
  704. }
  705. else if ( volumeVal < SPMIN_VOLUME )
  706. {
  707. //--- Clip rate to engine minimum
  708. volumeVal = SPMIN_VOLUME;
  709. }
  710. m_MasterVolume = volumeVal;
  711. }
  712. }
  713. //---------------------------------------
  714. // Delete previous unit
  715. //---------------------------------------
  716. CleanUpSynth( );
  717. //---------------------------------------
  718. // Get next phon
  719. //---------------------------------------
  720. hr = m_pSrcObj->NextData( (void**)&pCurUnit, &m_SpeechState );
  721. if( m_SpeechState == SPEECH_CONTINUE )
  722. {
  723. m_HasSpeech = pCurUnit->hasSpeech;
  724. m_pOutputSite->GetEventInterest( &clientInterest );
  725. //------------------------------------------------
  726. // Post SENTENCE event
  727. //------------------------------------------------
  728. if( (pCurUnit->flags & SENT_START_FLAG) && (clientInterest & SPFEI(SPEI_SENTENCE_BOUNDARY)) )
  729. {
  730. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  731. event.eEventId = SPEI_SENTENCE_BOUNDARY;
  732. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  733. event.lParam = pCurUnit->sentencePosition; // Input word position
  734. event.wParam = pCurUnit->sentenceLen; // Input word length
  735. m_pOutputSite->AddEvents( &event, 1 );
  736. }
  737. //------------------------------------------------
  738. // Post PHONEME event
  739. //------------------------------------------------
  740. if( clientInterest & SPFEI(SPEI_PHONEME) )
  741. {
  742. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  743. event.eEventId = SPEI_PHONEME;
  744. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  745. event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_IPAToAllo[pCurUnit->AlloID];
  746. event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_IPAToAllo[pCurUnit->NextAlloID];
  747. m_pOutputSite->AddEvents( &event, 1 );
  748. }
  749. //------------------------------------------------
  750. // Post VISEME event
  751. //------------------------------------------------
  752. if( clientInterest & SPFEI(SPEI_VISEME) )
  753. {
  754. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  755. event.eEventId = SPEI_VISEME;
  756. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  757. event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_AlloToViseme[pCurUnit->AlloID];
  758. event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_AlloToViseme[pCurUnit->NextAlloID];
  759. m_pOutputSite->AddEvents( &event, 1 );
  760. }
  761. //------------------------------------------------
  762. // Post any bookmark events
  763. //------------------------------------------------
  764. if( pCurUnit->pBMObj != NULL )
  765. {
  766. CBookmarkList *pBMObj;
  767. BOOKMARK_ITEM* pMarker;
  768. //-------------------------------------------------
  769. // Retrieve marker strings from Bookmark list and
  770. // enter into Event list
  771. //-------------------------------------------------
  772. pBMObj = (CBookmarkList*)pCurUnit->pBMObj;
  773. //cMarkerCount = pBMObj->m_BMList.GetCount();
  774. if( clientInterest & SPFEI(SPEI_TTS_BOOKMARK) )
  775. {
  776. //---------------------------------------
  777. // Send event for every bookmark in list
  778. //---------------------------------------
  779. SPLISTPOS listPos;
  780. listPos = pBMObj->m_BMList.GetHeadPosition();
  781. while( listPos )
  782. {
  783. pMarker = (BOOKMARK_ITEM*)pBMObj->m_BMList.GetNext( listPos );
  784. event.eEventId = SPEI_TTS_BOOKMARK;
  785. event.elParamType = SPET_LPARAM_IS_STRING;
  786. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  787. //--- Copy in bookmark string - has been NULL terminated in source already...
  788. event.lParam = pMarker->pBMItem;
  789. // Engine must convert string to long for wParam.
  790. event.wParam = _wtol((WCHAR *)pMarker->pBMItem);
  791. m_pOutputSite->AddEvents( &event, 1 );
  792. }
  793. }
  794. //---------------------------------------------
  795. // We don't need this Bookmark list any more
  796. //---------------------------------------------
  797. delete pBMObj;
  798. pCurUnit->pBMObj = NULL;
  799. }
  800. pCurUnit->csamplesOut = 0;
  801. //******************************************************
  802. // For SIL, fill buffer with zeros...
  803. //******************************************************
  804. if( pCurUnit->UnitID == UNIT_SIL )
  805. {
  806. //---------------------------------------------
  807. // Calc SIL length
  808. //---------------------------------------------
  809. m_durationTarget = (long)(m_SampleRate * pCurUnit->duration);
  810. m_cOutSamples_Phon = 0;
  811. m_silMode = true;
  812. //---------------------------------------------
  813. // Clear LPC filter storage
  814. //---------------------------------------------
  815. memset( m_pHistory, 0, sizeof(float)*(m_cOrder+1) );
  816. //--------------------------------
  817. // Success!
  818. //--------------------------------
  819. // Debug macro - output unit data...
  820. TTSDBG_LOGUNITS;
  821. }
  822. //******************************************************
  823. // ...otherwise fill buffer with inventory data
  824. //******************************************************
  825. else
  826. {
  827. m_silMode = false;
  828. // Get unit data from voice
  829. hr = m_pVoiceDataObj->GetUnitData( pCurUnit->UnitID, &m_Synth );
  830. if( SUCCEEDED(hr) )
  831. {
  832. durationOut = 0.0f;
  833. cInEpochs = m_Synth.cNumEpochs;
  834. m_pInEpoch = m_Synth.pEpoch;
  835. //cframeMax = PeakValue( m_pInEpoch, cInEpochs );
  836. totalDuration = (float)m_Synth.cNumSamples;
  837. //-----------------------------------------------
  838. // For debugging: Force duration to unit length
  839. //-----------------------------------------------
  840. /*float unitDur;
  841. unitDur = totalDuration / 22050.0f;
  842. if( pCurUnit->duration < unitDur )
  843. {
  844. if( pCurUnit->speechRate < 1 )
  845. {
  846. pCurUnit->duration = unitDur * pCurUnit->speechRate;
  847. }
  848. else
  849. {
  850. pCurUnit->duration = unitDur;
  851. }
  852. }*/
  853. durationMpy = pCurUnit->duration;
  854. cframeMax = (long)pCurUnit->pF0[0];
  855. for( i = 1; i < (long)pCurUnit->nKnots; i++ )
  856. {
  857. //-----------------------------------------
  858. // Find the longest epoch
  859. //-----------------------------------------
  860. cframeMax = (long)(MAX(cframeMax,pCurUnit->pF0[i]));
  861. }
  862. cframeMax *= (long)(durationMpy * MAX_TARGETS_PER_UNIT);
  863. durationMpy = (m_SampleRate * durationMpy) / totalDuration;
  864. cframeMax += (long)(durationMpy * cInEpochs * MAX_TARGETS_PER_UNIT);
  865. //
  866. // mplumpe 11/18/97 : added to eliminate chance of crash.
  867. //
  868. cframeMax *= 2;
  869. //---------------------------------------------------
  870. // New epochs adjusted for duration and pitch
  871. //---------------------------------------------------
  872. m_pOutEpoch = new float[cframeMax];
  873. if( !m_pOutEpoch )
  874. {
  875. //--------------------------------------
  876. // Out of memory!
  877. //--------------------------------------
  878. hr = E_OUTOFMEMORY;
  879. pCurUnit->csamplesOut = 0;
  880. CleanUpSynth( );
  881. }
  882. }
  883. if( SUCCEEDED(hr) )
  884. {
  885. //---------------------------------------------------
  886. // Index back to orig epoch
  887. //---------------------------------------------------
  888. m_pMap = new long[cframeMax];
  889. if( !m_pMap )
  890. {
  891. //--------------------------------------
  892. // Out of memory!
  893. //--------------------------------------
  894. hr = E_OUTOFMEMORY;
  895. pCurUnit->csamplesOut = 0;
  896. CleanUpSynth( );
  897. }
  898. }
  899. if( SUCCEEDED(hr) )
  900. {
  901. //---------------------------------------------------
  902. // TRUE = reverse residual
  903. //---------------------------------------------------
  904. m_pRevFlag = new short[cframeMax];
  905. if( !m_pRevFlag )
  906. {
  907. //--------------------------------------
  908. // Out of memory!
  909. //--------------------------------------
  910. hr = E_OUTOFMEMORY;
  911. pCurUnit->csamplesOut = 0;
  912. CleanUpSynth( );
  913. }
  914. }
  915. if( SUCCEEDED(hr) )
  916. {
  917. //---------------------------------------------------------------------
  918. // Compute synthesis epochs and corresponding mapping to analysis
  919. // fills in: m_pOutEpoch, m_pMap, m_pRevFlag
  920. //---------------------------------------------------------------------
  921. m_cOutEpochs = ProsodyMod( pCurUnit, cInEpochs, durationMpy );
  922. //------------------------------------------------
  923. // Now that actual epoch sizes are known,
  924. // calculate total audio sample count
  925. // @@@@ NO LONGER NEEDED
  926. //------------------------------------------------
  927. pCurUnit->csamplesOut = 0;
  928. for( i = 0; i < m_cOutEpochs; i++ )
  929. {
  930. pCurUnit->csamplesOut += (long)(ABS(m_pOutEpoch[i]));
  931. }
  932. m_cOutSamples_Phon = 0;
  933. m_EpochIndex = 0;
  934. m_durationTarget = (long)(pCurUnit->duration * m_SampleRate);
  935. m_pInRes = m_Synth.pRes;
  936. m_pLPC = m_Synth.pLPC;
  937. m_pSynthTime = pCurUnit->pTime;
  938. m_pSynthAmp = pCurUnit->pAmp;
  939. m_nKnots = pCurUnit->nKnots;
  940. // NOTE: Maybe make log volume?
  941. m_UnitVolume = (float)pCurUnit->user_Volume / 100.0f;
  942. //------------------------------------------------
  943. // Post WORD event
  944. //------------------------------------------------
  945. if( (pCurUnit->flags & WORD_START_FLAG) && (clientInterest & SPFEI(SPEI_WORD_BOUNDARY)) )
  946. {
  947. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  948. event.eEventId = SPEI_WORD_BOUNDARY;
  949. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  950. event.lParam = pCurUnit->srcPosition; // Input word position
  951. event.wParam = pCurUnit->srcLen; // Input word length
  952. m_pOutputSite->AddEvents( &event, 1 );
  953. }
  954. //--- Debug macro - output unit data
  955. TTSDBG_LOGUNITS;
  956. }
  957. }
  958. }
  959. return hr;
  960. } /* CBackend::StartNewUnit */
  961. /*****************************************************************************
  962. * CBackend::CleanUpSynth *
  963. *------------------------*
  964. * Description:
  965. *
  966. ********************************************************************** MC ***/
  967. void CBackend::CleanUpSynth( )
  968. {
  969. SPDBG_FUNC( "CBackend::CleanUpSynth" );
  970. if( m_pOutEpoch )
  971. {
  972. delete m_pOutEpoch;
  973. m_pOutEpoch = NULL;
  974. }
  975. if( m_pMap )
  976. {
  977. delete m_pMap;
  978. m_pMap = NULL;
  979. }
  980. if( m_pRevFlag )
  981. {
  982. delete m_pRevFlag;
  983. m_pRevFlag = NULL;
  984. }
  985. // NOTE: make object?
  986. FreeSynth( &m_Synth );
  987. } /* CBackend::CleanUpSynth */
  988. /*****************************************************************************
  989. * CBackend::RenderFrame *
  990. *-----------------------*
  991. * Description:
  992. * This is the central synthesis loop. Keep filling the output audio
  993. * buffer until the buffer frame is full or speech is done. To render
  994. * continuous speech, get each unit one at a time from the upstream buffer.
  995. *
       * Each pass of the loop does exactly one of the following:
       *   - silence mode: emits a single zero sample, or calls StartNewUnit()
       *     once the silence has reached m_durationTarget samples;
       *   - otherwise: reconstructs one output epoch with ResRecons() followed
       *     by LPCFilter(), or calls StartNewUnit() once all output epochs of
       *     the current unit have been rendered.
       * After the loop the frame is passed through Reverb_Process() (when
       * m_pReverb is set) or CvtToShort(), and is then written to m_pOutputSite.
       *
       * Returns:
       * A failure HRESULT from StartNewUnit() if fetching a unit failed (in
       * that case the frame is neither converted nor written); otherwise the
       * result of m_pOutputSite->Write(). On either failure m_SpeechState is
       * set to SPEECH_DONE.
       *
  996. ********************************************************************** MC ***/
  997. HRESULT CBackend::RenderFrame( )
  998. {
  999. SPDBG_FUNC( "CBackend::RenderFrame" );
  1000. long InSize, OutSize;
  1001. long iframe;
  1002. float *pCurInRes, *pCurOutRes;
  1003. long i, j;
  1004. float ampMpy;
  1005. HRESULT hr = S_OK;
  1006. m_cOutSamples_Frame = 0; // No samples rendered into this frame yet
  1007. do
  1008. {
  1009. OutSize = 0; // Stays 0 when this pass only fetches a new unit
  1010. if( m_silMode )
  1011. {
  1012. //-------------------------------
  1013. // Silence mode
  1014. //-------------------------------
  1015. if( m_cOutSamples_Phon >= m_durationTarget )
  1016. {
  1017. //---------------------------
  1018. // Silence is complete:
  1019. // get next unit
  1020. hr = StartNewUnit( );
  1021. if (FAILED(hr))
  1022. {
  1023. //-----------------------------------
  1024. // Try to end it gracefully...
  1025. //-----------------------------------
  1026. m_SpeechState = SPEECH_DONE;
  1027. }
  1028. TTSDBG_LOGSILEPOCH;
  1029. }
  1030. else
  1031. {
  1032. //---------------------------
  1033. // Continue with current SIL:
  1034. // emit one zero sample per pass
  1035. m_pSpeechBuf[m_cOutSamples_Frame] = 0;
  1036. OutSize = 1;
  1037. }
  1038. }
  1039. else
  1040. {
  1041. if( m_EpochIndex < m_cOutEpochs )
  1042. {
  1043. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1044. //
  1045. // Continue with current phon
  1046. //
  1047. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1048. //------------------------------------
  1049. // Find current input residual:
  1050. // m_pMap gives the input epoch for this output epoch
  1051. iframe = m_pMap[m_EpochIndex];
  1052. pCurInRes = m_pInRes;
  1053. for( i = 0; i < iframe; i++) // Skip the residuals of every input epoch before iframe
  1054. {
  1055. pCurInRes += (long) ABS(m_pInEpoch[i]);
  1056. }
  1057. pCurOutRes = m_pSpeechBuf + m_cOutSamples_Frame; // Render directly into the frame at the current write position
  1058. InSize = (long)(ABS(m_pInEpoch[iframe]));
  1059. OutSize = (long)(ABS(m_pOutEpoch[m_EpochIndex]));
  1060. j = 1; // Find the knot pair [j-1, j] around the current position in the phon; j is capped at m_nKnots - 1
  1061. while( (j < m_nKnots - 1) && (m_cOutSamples_Phon > m_pSynthTime[j]) )
  1062. {
  1063. j++;
  1064. }
  1065. ampMpy = LinInterp( m_pSynthTime[j - 1], (float)m_cOutSamples_Phon, m_pSynthTime[j], m_pSynthAmp[j - 1], m_pSynthAmp[j] ); // Presumably linear interpolation of the amplitude between knots j-1 and j -- confirm against LinInterp
  1066. //ampMpy = 1;
  1067. //--------------------------------------------
  1068. // Do stretching of residuals
  1069. //--------------------------------------------
  1070. ResRecons( pCurInRes, InSize, pCurOutRes, OutSize, ampMpy );
  1071. //--------------------------------------------
  1072. // Do LPC reconstruction
  1073. //--------------------------------------------
  1074. float *pCurLPC;
  1075. float totalGain;
  1076. totalGain = ExpConverter( ((float)m_MasterVolume / (float)SPMAX_VOLUME), m_linearScale ) // Master volume as a fraction of SPMAX_VOLUME...
  1077. * ExpConverter( m_UnitVolume, m_linearScale ); // ...times the per-unit volume, each mapped through ExpConverter()
  1078. pCurLPC = m_pLPC + m_pMap[m_EpochIndex] * (1 + m_cOrder); // One record of (1 + m_cOrder) floats per input epoch
  1079. pCurLPC[0] = 1.0f; // Force the first entry of the record to 1.0 (note: writes into the m_pLPC data)
  1080. LPCFilter( pCurLPC, &m_pSpeechBuf[m_cOutSamples_Frame], OutSize, totalGain );
  1081. m_EpochIndex++;
  1082. }
  1083. else
  1084. {
  1085. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1086. //
  1087. // Get next phon
  1088. //
  1089. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1090. hr = StartNewUnit( );
  1091. if (FAILED(hr))
  1092. {
  1093. //-----------------------------------
  1094. // Try to end it gracefully...
  1095. //-----------------------------------
  1096. m_SpeechState = SPEECH_DONE;
  1097. }
  1098. TTSDBG_LOGSILEPOCH;
  1099. }
  1100. }
  1101. m_cOutSamples_Frame += OutSize; // Advance the per-frame, per-unit and running total sample counters
  1102. m_cOutSamples_Phon += OutSize;
  1103. m_cOutSamples_Total += OutSize;
  1104. TTSDBG_LOGEPOCHS;
  1105. }
  1106. while( (m_cOutSamples_Frame < SPEECH_FRAME_SIZE) && (m_SpeechState == SPEECH_CONTINUE) ); // Stop when the frame is full or speech has ended
  1107. if( SUCCEEDED(hr) )
  1108. {
  1109. //----------------------------------------------
  1110. // Convert buffer from FLOAT to SHORT
  1111. //----------------------------------------------
  1112. if( m_pReverb )
  1113. {
  1114. //---------------------------------
  1115. // Add REVERB
  1116. // NOTE(review): CvtToShort is skipped on this path, so Reverb_Process presumably does the conversion itself -- confirm
  1117. m_pReverb->Reverb_Process( m_pSpeechBuf, m_cOutSamples_Frame, 1.0f );
  1118. }
  1119. else
  1120. {
  1121. CvtToShort( m_pSpeechBuf, m_cOutSamples_Frame, m_StereoOut, 1.0f );
  1122. }
  1123. //--- Debug Macro - output wave data to stream
  1124. TTSDBG_LOGWAVE;
  1125. }
  1126. if( SUCCEEDED( hr ) )
  1127. {
  1128. //------------------------------------
  1129. // Send this buffer to SAPI site
  1130. //------------------------------------
  1131. DWORD cbWritten; // Filled in by Write() but not examined here
  1132. //------------------------------------------------------------------------------------
  1133. // This was my lame hack to avoid sending buffers when nothing was spoken.
  1134. // It was causing problems (among others) since StartNewUnit() was still sending
  1135. // events - with no corresponding audio buffer!
  1136. //
  1137. // This was too simple of a scheme. Disable this feature for now...
  1138. // ...until I come up with something more robust. (MC)
  1139. //------------------------------------------------------------------------------------
  1140. //if( m_HasSpeech )
  1141. {
  1142. hr = m_pOutputSite->Write( (void*)m_pSpeechBuf,
  1143. m_cOutSamples_Frame * m_BytesPerSample, // Byte count = samples in this frame * bytes per sample
  1144. &cbWritten );
  1145. if( FAILED( hr ) )
  1146. {
  1147. //----------------------------------------
  1148. // Abort! Unable to write audio data
  1149. //----------------------------------------
  1150. m_SpeechState = SPEECH_DONE;
  1151. }
  1152. }
  1153. }
  1154. //------------------------------------
  1155. // Return render state
  1156. //------------------------------------
  1157. return hr;
  1158. } /* CBackend::RenderFrame */