Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1333 lines
44 KiB

  1. /*******************************************************************************
  2. * Backend.cpp *
  3. *-------------*
  4. * Description:
  5. * This module is the implementation file for the CBackend class.
  6. *-------------------------------------------------------------------------------
  7. * Created By: mc Date: 03/12/99
  8. * Copyright (C) 1999 Microsoft Corporation
  9. * All Rights Reserved
  10. *
  11. *******************************************************************************/
  12. #include "stdafx.h"
  13. #ifndef __spttseng_h__
  14. #include "spttseng.h"
  15. #endif
  16. #ifndef Backend_H
  17. #include "Backend.h"
  18. #endif
  19. #ifndef FeedChain_H
  20. #include "FeedChain.h"
  21. #endif
  22. #ifndef SPDebug_h
  23. #include <spdebug.h>
  24. #endif
  25. //-----------------------------
  26. // Data.cpp
  27. //-----------------------------
  28. extern const short g_IPAToAllo[];
  29. extern const short g_AlloToViseme[];
  30. //--------------------------------------
  31. // DEBUG: Save utterance WAV file
  32. //--------------------------------------
  33. //#define SAVE_WAVE_FILE 1
  //--------------------------------------------------------------------
  // Vibrato lookup table: 256 unsigned 8-bit entries (16 rows of 16).
  // The values rise from 0x7b to a peak of 0xf5, fall to a trough of
  // 0x09 and climb back to 0x78, i.e. one full cycle.
  // CBackend::ProsodyMod indexes it with (m_vibrato_Phase1 >> 16),
  // where the phase is masked to 24 bits (index 0..255), and subtracts
  // 128 from the entry to get a signed offset.
  //--------------------------------------------------------------------
  const unsigned char g_SineWaveTbl[] =
  {
  0x7b,0x7e,0x81,0x84,0x87,0x89,0x8c,0x8f,0x92,0x95,0x98,0x9b,0x9d,0xa0,0xa3,0xa6,
  0xa8,0xab,0xae,0xb0,0xb3,0xb5,0xb8,0xbb,0xbd,0xbf,0xc2,0xc4,0xc7,0xc9,0xcb,0xcd,
  0xcf,0xd1,0xd3,0xd5,0xd7,0xd9,0xdb,0xdd,0xdf,0xe0,0xe2,0xe3,0xe5,0xe6,0xe8,0xe9,
  0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,0xf1,0xf2,0xf2,0xf3,0xf3,0xf4,0xf4,0xf4,0xf4,
  0xf5,0xf5,0xf5,0xf5,0xf4,0xf4,0xf4,0xf4,0xf3,0xf3,0xf2,0xf1,0xf1,0xf0,0xef,0xee,
  0xed,0xec,0xeb,0xea,0xe9,0xe7,0xe6,0xe5,0xe3,0xe1,0xe0,0xde,0xdc,0xdb,0xd9,0xd7,
  0xd5,0xd3,0xd1,0xcf,0xcd,0xcb,0xc8,0xc6,0xc4,0xc1,0xbf,0xbc,0xba,0xb7,0xb5,0xb2,
  0xb0,0xad,0xaa,0xa8,0xa5,0xa2,0x9f,0x9d,0x9a,0x97,0x94,0x91,0x8f,0x8c,0x89,0x86,
  0x83,0x80,0x7d,0x7a,0x77,0x75,0x72,0x6f,0x6c,0x69,0x66,0x64,0x61,0x5e,0x5b,0x58,
  0x56,0x53,0x50,0x4e,0x4b,0x49,0x46,0x44,0x41,0x3f,0x3c,0x3a,0x38,0x35,0x33,0x31,
  0x2f,0x2d,0x2b,0x29,0x27,0x25,0x23,0x21,0x1f,0x1e,0x1c,0x1b,0x19,0x18,0x16,0x15,
  0x14,0x13,0x12,0x11,0x10,0x0f,0x0e,0x0d,0x0c,0x0c,0x0b,0x0b,0x0a,0x0a,0x0a,0x0a,
  0x09,0x09,0x09,0x09,0x0a,0x0a,0x0a,0x0a,0x0b,0x0b,0x0c,0x0d,0x0d,0x0e,0x0f,0x10,
  0x11,0x12,0x13,0x14,0x15,0x17,0x18,0x1a,0x1b,0x1d,0x1e,0x20,0x22,0x23,0x25,0x27,
  0x29,0x2b,0x2d,0x2f,0x31,0x34,0x36,0x38,0x3a,0x3d,0x3f,0x42,0x44,0x47,0x49,0x4c,
  0x4e,0x51,0x54,0x56,0x59,0x5c,0x5f,0x61,0x64,0x67,0x6a,0x6d,0x6f,0x72,0x75,0x78
  };
  53. /*void PredictEpochDist( float duration,
  54. long nKnots,
  55. float SampleRate,
  56. float *pTime,
  57. float *pF0)
  58. {
  59. long curSamplesOut, endSample, j;
  60. float epochFreq;
  61. long epochLen, epochCount;
  62. curSamplesOut = 0;
  63. endSample = (long) (SampleRate * duration );
  64. epochCount = 0;
  65. while( curSamplesOut < endSample )
  66. {
  67. j = 1;
  68. //---------------------------------------------------
  69. // Align to appropriate knot based on
  70. // current output sample
  71. //---------------------------------------------------
  72. while( (j < nKnots - 1) && (curSamplesOut > pTime[j]) )
  73. j++;
  74. //---------------------------------------------------
  75. // Calculate exact pitch thru linear interpolation
  76. //---------------------------------------------------
  77. epochFreq = LinInterp( pTime[j - 1], curSamplesOut, pTime[j], pF0[j - 1], pF0[j] );
  78. //---------------------------------------------------
  79. // Calc sample count for current epoch
  80. //---------------------------------------------------
  81. epochLen = (long) (SampleRate / epochFreq);
  82. epochCount++;
  83. curSamplesOut += epochLen;
  84. }
  85. }
  86. */
  87. /*****************************************************************************
  88. * CBackend::CBackend *
  89. *--------------------*
  90. * Description: Constructor
  91. *
  92. ********************************************************************** MC ***/
  93. CBackend::CBackend( )
  94. {
  95. SPDBG_FUNC( "CBackend::CBackend" );
  96. m_pHistory = NULL;
  97. m_pHistory2 = NULL;
  98. m_pFilter = NULL;
  99. m_pReverb = NULL;
  100. m_pOutEpoch = NULL;
  101. m_pMap = NULL;
  102. m_pRevFlag = NULL;
  103. m_pSpeechBuf = NULL;
  104. m_VibratoDepth = 0;
  105. m_UnitVolume = 1.0f;
  106. m_MasterVolume = SPMAX_VOLUME;
  107. memset( &m_Synth, 0, sizeof(MSUNITDATA) );
  108. } /* CBackend::CBackend */
  109. /*****************************************************************************
  110. * CBackend::~CBackend *
  111. *---------------------*
  112. * Description: Destructor
  113. *
  114. ********************************************************************** MC ***/
  115. CBackend::~CBackend( )
  116. {
  117. SPDBG_FUNC( "CBackend::~CBackend" );
  118. Release();
  119. } /* CBackend::~CBackend */
  120. /*****************************************************************************
  121. * CBackend::Release *
  122. *---------------------*
  123. * Description:
  124. * Free memory allocaterd by Backend
  125. *
  126. ********************************************************************** MC ***/
  127. void CBackend::Release( )
  128. {
  129. SPDBG_FUNC( "CBackend::Release" );
  130. CleanUpSynth( );
  131. if( m_pSpeechBuf)
  132. {
  133. delete m_pSpeechBuf;
  134. m_pSpeechBuf = NULL;
  135. }
  136. if( m_pHistory )
  137. {
  138. delete m_pHistory;
  139. m_pHistory = NULL;
  140. }
  141. if( m_pHistory2 )
  142. {
  143. delete m_pHistory2;
  144. m_pHistory2 = NULL;
  145. }
  146. if( m_pReverb )
  147. {
  148. delete m_pReverb;
  149. m_pReverb = NULL;
  150. }
  151. } /* CBackend::Release */
  152. /*****************************************************************************
  153. * CBackend::Init *
  154. *----------------*
  155. * Description:
  156. * Opens a backend instance, keeping a pointer of the acoustic
  157. * inventory.
  158. *
  159. ********************************************************************** MC ***/
  160. HRESULT CBackend::Init( IMSVoiceData* pVoiceDataObj, CFeedChain *pSrcObj, MSVOICEINFO* pVoiceInfo )
  161. {
  162. SPDBG_FUNC( "CBackend::Init" );
  163. long LPCsize = 0;
  164. HRESULT hr = S_OK;
  165. m_pVoiceDataObj = pVoiceDataObj;
  166. m_SampleRate = (float)pVoiceInfo->SampleRate;
  167. m_pSrcObj = pSrcObj;
  168. m_cOrder = pVoiceInfo->LPCOrder;
  169. m_pWindow = pVoiceInfo->pWindow;
  170. m_FFTSize = pVoiceInfo->FFTSize;
  171. m_VibratoDepth = ((float)pVoiceInfo->VibratoDepth) / 100.0f;
  172. m_VibratoDepth = 0; // NOTE: disable vibrato
  173. m_VibratoFreq = pVoiceInfo->VibratoFreq;
  174. if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF )
  175. {
  176. m_StereoOut = true;
  177. m_BytesPerSample = 4;
  178. }
  179. else
  180. {
  181. m_StereoOut = false;
  182. m_BytesPerSample = 2;
  183. }
  184. //---------------------------------------
  185. // Allocate AUDIO buffer
  186. //---------------------------------------
  187. m_pSpeechBuf = new float[SPEECH_FRAME_SIZE + SPEECH_FRAME_OVER];
  188. if( m_pSpeechBuf == NULL )
  189. {
  190. //--------------------------------------
  191. // Out of memory!
  192. //--------------------------------------
  193. hr = E_OUTOFMEMORY;
  194. }
  195. if( SUCCEEDED(hr) )
  196. {
  197. //---------------------------------------
  198. // Allocate HISTORY buffer
  199. //---------------------------------------
  200. LPCsize = m_cOrder + 1;
  201. m_pHistory = new float[LPCsize];
  202. if( m_pHistory == NULL )
  203. {
  204. //--------------------------------------
  205. // Out of memory!
  206. //--------------------------------------
  207. hr = E_OUTOFMEMORY;
  208. }
  209. }
  210. if( SUCCEEDED(hr) )
  211. {
  212. memset( m_pHistory, 0, LPCsize * sizeof(float) );
  213. m_pOutEpoch = NULL;
  214. m_pMap = NULL;
  215. m_pRevFlag = NULL;
  216. m_fModifiers = 0;
  217. m_vibrato_Phase1 = 0;
  218. //--------------------------------
  219. // Reverb Effect
  220. //--------------------------------
  221. //pVoiceInfo->eReverbType = REVERB_TYPE_HALL;
  222. if( pVoiceInfo->eReverbType > REVERB_TYPE_OFF )
  223. {
  224. //--------------------------------
  225. // Create ReverbFX object
  226. //--------------------------------
  227. if( m_pReverb == NULL )
  228. {
  229. m_pReverb = new CReverbFX;
  230. if( m_pReverb )
  231. {
  232. short result;
  233. result = m_pReverb->Reverb_Init( pVoiceInfo->eReverbType, (long)m_SampleRate, m_StereoOut );
  234. if( result != KREVERB_NOERROR )
  235. {
  236. //--------------------------------------------
  237. // Not enough memory to do reverb
  238. // Recover gracefully
  239. //--------------------------------------------
  240. delete m_pReverb;
  241. m_pReverb = NULL;
  242. }
  243. /*else
  244. {
  245. //--------------------------------------------------------
  246. // Init was successful, ready to do reverb now
  247. //--------------------------------------------------------
  248. }*/
  249. }
  250. }
  251. }
  252. //----------------------------
  253. // Linear taper region scale
  254. //----------------------------
  255. m_linearScale = (float) pow( 10.0, (double)((1.0f - LINEAR_BKPT) * LOG_RANGE) / 20.0 );
  256. #ifdef SAVE_WAVE_FILE
  257. m_SaveFile = (PCSaveWAV) new CSaveWAV; // No check needed, if this fails, we simply don't save file.
  258. if( m_SaveFile )
  259. {
  260. m_SaveFile->OpenWavFile( (long)m_SampleRate );
  261. }
  262. #endif
  263. }
  264. else
  265. {
  266. if( m_pSpeechBuf )
  267. {
  268. delete m_pSpeechBuf;
  269. m_pSpeechBuf = NULL;
  270. }
  271. if( m_pHistory )
  272. {
  273. delete m_pHistory;
  274. m_pHistory = NULL;
  275. }
  276. }
  277. return hr;
  278. } /* CBackend::Init */
  279. /*****************************************************************************
  280. * CBackend::FreeSynth *
  281. *---------------------*
  282. * Description:
  283. * Return TRUE if consoants can be clustered.
  284. *
  285. ********************************************************************** MC ***/
  286. void CBackend::FreeSynth( MSUNITDATA* pSynth )
  287. {
  288. SPDBG_FUNC( "CBackend::FreeSynth" );
  289. if( pSynth->pEpoch )
  290. {
  291. delete pSynth->pEpoch;
  292. pSynth->pEpoch = NULL;
  293. }
  294. if( pSynth->pRes )
  295. {
  296. delete pSynth->pRes;
  297. pSynth->pRes = NULL;
  298. }
  299. if( pSynth->pLPC )
  300. {
  301. delete pSynth->pLPC;
  302. pSynth->pLPC = NULL;
  303. }
  304. } /* CBackend::FreeSynth */
  305. /*****************************************************************************
  306. * ExpConverter *
  307. *--------------*
  308. * Description:
  309. * Convert linear to exponential taper
  310. * 'ref' is a linear value between 0.0 to 1.0
  311. *
  312. ********************************************************************** MC ***/
  313. static float ExpConverter( float ref, float linearScale )
  314. {
  315. SPDBG_FUNC( "ExpConverter" );
  316. float audioGain;
  317. if( ref < LINEAR_BKPT)
  318. {
  319. //----------------------------------------
  320. // Linear taper below LINEAR_BKPT
  321. //----------------------------------------
  322. audioGain = linearScale * (ref / LINEAR_BKPT);
  323. }
  324. else
  325. {
  326. //----------------------------------------
  327. // Log taper above LINEAR_BKPT
  328. //----------------------------------------
  329. audioGain = (float) pow( 10.0, (double)((1.0f - ref) * LOG_RANGE) / 20.0 );
  330. }
  331. return audioGain;
  332. } /* ExpConverter */
  333. /*****************************************************************************
  334. * CBackend::CvtToShort *
  335. *----------------------*
  336. * Description:
  337. * Convert (in place) FLOAT audio to SHORT.
  338. *
  339. ********************************************************************** MC ***/
  340. void CBackend::CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain )
  341. {
  342. SPDBG_FUNC( "CBackend::CvtToShort" );
  343. long i;
  344. short *pDest;
  345. float fSamp;
  346. pDest = (short*)pSrc;
  347. for( i = 0; i < blocksize; ++i )
  348. {
  349. //------------------------
  350. // Read float sample...
  351. //------------------------
  352. fSamp = (*pSrc++) * audioGain;
  353. //------------------------
  354. // ...clip to 16-bits...
  355. //------------------------
  356. if( fSamp > 32767 )
  357. {
  358. fSamp = 32767;
  359. }
  360. else if( fSamp < (-32768) )
  361. {
  362. fSamp = (-32768);
  363. }
  364. //------------------------
  365. // ...save as SHORT
  366. //------------------------
  367. *pDest++ = (short)fSamp;
  368. if( stereoOut )
  369. {
  370. *pDest++ = (short)(0 - (int)fSamp);
  371. }
  372. }
  373. } /* CBackend::CvtToShort */
  374. /*****************************************************************************
  375. * CBackend::PSOLA_Stretch *
  376. *-------------------------*
  377. * Description:
  378. * Does PSOLA epoch stretching or compressing
  379. *
  380. ********************************************************************** MC ***/
  381. void CBackend::PSOLA_Stretch( float *pInRes, long InSize,
  382. float *pOutRes, long OutSize,
  383. float *pWindow,
  384. long cWindowSize )
  385. {
  386. SPDBG_FUNC( "CBackend::PSOLA_Stretch" );
  387. long i, lim;
  388. float window, delta, kf;
  389. memset( pOutRes, 0, sizeof(float) * OutSize );
  390. lim = MIN(InSize, OutSize );
  391. delta = (float)cWindowSize / (float)lim;
  392. kf = 0.5f;
  393. pOutRes[0] = pInRes[0];
  394. for( i = 1; i < lim; ++i )
  395. {
  396. kf += delta;
  397. window = pWindow[(long) kf];
  398. pOutRes[i] += pInRes[i] * window;
  399. pOutRes[OutSize - i] += pInRes[InSize - i] * window;
  400. }
  401. } /* CBackend::PSOLA_Stretch */
  402. /*****************************************************************************
  403. * CBackend::PrepareSpeech *
  404. *-------------------------*
  405. * Description:
  406. *
  407. ********************************************************************** MC ***/
  408. void CBackend::PrepareSpeech( ISpTTSEngineSite* outputSite )
  409. {
  410. SPDBG_FUNC( "CBackend::PrepareSpeech" );
  411. //m_pUnits = pUnits;
  412. //m_unitCount = unitCount;
  413. //m_CurUnitIndex = 0;
  414. m_pOutputSite = outputSite;
  415. m_silMode = true;
  416. m_durationTarget = 0;
  417. m_cOutSamples_Phon = 1;
  418. m_cOutEpochs = 0; // Pull model big-bang
  419. m_SpeechState = SPEECH_CONTINUE;
  420. m_cOutSamples_Total = 0;
  421. m_HasSpeech = false;
  422. } /* CBackend::PrepareSpeech */
  423. /*****************************************************************************
  424. * CBackend::ProsodyMod *
  425. *----------------------*
  426. * Description:
  427. * Calculate the epoch sequence for the synthesized speech
  428. *
  429. * INPUT:
  430. *
  431. * OUTPUT:
  432. * FIlls 'pOutEpoch', 'pMap', and 'pRevFlag'
  433. * Returns new epoch count
  434. *
  435. ********************************************************************** MC ***/
  436. long CBackend::ProsodyMod( UNITINFO *pCurUnit,
  437. long cInEpochs,
  438. float durationMpy,
  439. long cMaxOutEpochs )
  440. {
  441. SPDBG_FUNC( "CBackend::ProsodyMod" );
  442. long iframe, framesize, framesizeOut, j;
  443. long cntOut, csamplesOut, cOutEpochs;
  444. BOOL fUnvoiced;
  445. short fReverse;
  446. float totalDuration;
  447. float durationIn; // Active accum of IN duration
  448. float durationOut; // Active accum of OUT duration aligned to IN domain
  449. float freqMpy;
  450. BOOL fAdvanceInput;
  451. float vibrato;
  452. unsigned char *SineWavePtr;
  453. float epochFreq;
  454. float *pTime;
  455. float *pF0;
  456. iframe = 0;
  457. durationIn = 0.0f;
  458. durationOut = 0.0f;
  459. csamplesOut = 0;
  460. cntOut = 0;
  461. cOutEpochs = 0;
  462. fReverse = false;
  463. pTime = pCurUnit->pTime;
  464. pF0 = pCurUnit->pF0;
  465. //------------------------------------
  466. // Find total input duration
  467. //------------------------------------
  468. totalDuration = 0;
  469. for( j = 0; j < cInEpochs; ++j )
  470. {
  471. totalDuration += ABS(m_pInEpoch[j]);
  472. }
  473. /*PredictEpochDist( pCurUnit->duration,
  474. pCurUnit->nKnots,
  475. m_SampleRate,
  476. pTime,
  477. pF0 );*/
  478. while( iframe < cInEpochs && cOutEpochs < cMaxOutEpochs)
  479. {
  480. //-----------------------------------------
  481. // Compute output frame length
  482. //-----------------------------------------
  483. if( m_pInEpoch[iframe] < 0 )
  484. {
  485. //-------------------------------------------------
  486. // Since we can't change unvoiced pitch,
  487. // do not change frame size for unvoiced frames
  488. //-------------------------------------------------
  489. framesize = (long)((-m_pInEpoch[iframe]) + 0.5f);
  490. framesizeOut = framesize;
  491. fUnvoiced = true;
  492. }
  493. else
  494. {
  495. //---------------------------------------------------
  496. // Modify frame size for voiced epoch
  497. // based on epoch frequency
  498. //---------------------------------------------------
  499. j = 1;
  500. //---------------------------------------------------
  501. // Align to appropriate knot bassed on
  502. // current output sample
  503. //---------------------------------------------------
  504. while( (j < (long)pCurUnit->nKnots - 1) && (csamplesOut > pTime[j]) )
  505. j++;
  506. //---------------------------------------------------
  507. // Calculate exact pitch thru linear interpolation
  508. //---------------------------------------------------
  509. epochFreq = LinInterp( pTime[j - 1], (float)csamplesOut, pTime[j], pF0[j - 1], pF0[j] );
  510. SineWavePtr = (unsigned char*)&g_SineWaveTbl[0];
  511. vibrato = (float)(((unsigned char)(*(SineWavePtr + (m_vibrato_Phase1 >> 16)))) - 128);
  512. vibrato *= m_VibratoDepth;
  513. //---------------------------------------------------
  514. // Scale frame size using in/out ratio
  515. //---------------------------------------------------
  516. epochFreq = epochFreq + vibrato;
  517. if( epochFreq < MIN_VOICE_PITCH )
  518. {
  519. epochFreq = MIN_VOICE_PITCH;
  520. }
  521. framesize = (long)(m_pInEpoch[iframe] + 0.5f);
  522. framesizeOut = (long)(m_SampleRate / epochFreq);
  523. vibrato = ((float)256 / ((float)22050 / m_VibratoFreq)) * (float)framesizeOut; // 3 Hz
  524. //vibrato = ((float)256 / (float)7350) * (float)framesizeOut; // 3 Hz
  525. m_vibrato_Phase1 += (long)(vibrato * (float)65536);
  526. m_vibrato_Phase1 &= 0xFFFFFF;
  527. //---------------------------------------------------
  528. // @@@@ REMOVED 2x LIMIT
  529. //---------------------------------------------------
  530. /*if( framesizeOut > 2*framesize )
  531. {
  532. framesizeOut = 2*framesize;
  533. }
  534. if( framesize > 2*framesizeOut )
  535. {
  536. framesizeOut = framesize/2;
  537. }*/
  538. freqMpy = (float) framesize / framesizeOut;
  539. fUnvoiced = false;
  540. }
  541. //-------------------------------------------
  542. // Generate next output frame
  543. //-------------------------------------------
  544. fAdvanceInput = false;
  545. if( durationOut + (0.5f * framesizeOut/durationMpy) <= durationIn + framesize )
  546. {
  547. //-----------------------------------------
  548. // If UNvoiced and odd frame,
  549. // reverse residual
  550. //-----------------------------------------
  551. if( fUnvoiced && (cntOut & 1) )
  552. {
  553. m_pRevFlag[cOutEpochs] = true;
  554. fReverse = true;
  555. }
  556. else
  557. {
  558. m_pRevFlag[cOutEpochs] = false;
  559. fReverse = false;
  560. }
  561. ++cntOut;
  562. durationOut += framesizeOut/durationMpy;
  563. csamplesOut += framesizeOut;
  564. m_pOutEpoch[cOutEpochs] = (float)framesizeOut;
  565. m_pMap[cOutEpochs] = iframe;
  566. cOutEpochs++;
  567. }
  568. else
  569. {
  570. fAdvanceInput = true;
  571. }
  572. //-------------------------------------------
  573. // Advance to next input frame
  574. //-------------------------------------------
  575. if( ((durationOut + (0.5f * framesizeOut/durationMpy)) > (durationIn + framesize)) ||
  576. //(cntOut >= 3) || @@@@ REMOVED 2x LIMIT
  577. //(fReverse == true) ||
  578. fAdvanceInput )
  579. {
  580. durationIn += framesize;
  581. ++iframe;
  582. cntOut = 0;
  583. }
  584. }
  585. return cOutEpochs;
  586. } /* CBackend::ProsodyMod */
  587. /*****************************************************************************
  588. * CBackend::LPCFilter *
  589. *---------------------*
  590. * Description:
  591. * LPC filter of order cOrder. It filters the residual signal
  592. * pRes, producing output pOutWave. This routine requires that
  593. * pOutWave has the true waveform history from [-cOrder,0] and
  594. * of course it has to be defined.
  595. *
  596. ********************************************************************** MC ***/
  597. void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain )
  598. {
  599. SPDBG_FUNC( "CBackend::LPCFilter" );
  600. INT t, j;
  601. for( t = 0; t < len; t++ )
  602. {
  603. m_pHistory[0] = pCurLPC[0] * pCurRes[t];
  604. for( j = m_cOrder; j > 0; j-- )
  605. {
  606. m_pHistory[0] -= pCurLPC[j] * m_pHistory[j];
  607. m_pHistory[j] = m_pHistory[j - 1];
  608. }
  609. pCurRes[t] = m_pHistory[0] * gain;
  610. }
  611. } /* CBackend::LPCFilter */
  612. /*void CBackend::LPCFilter( float *pCurLPC, float *pCurRes, long len )
  613. {
  614. long t;
  615. for( t = 0; t < len; t++ )
  616. {
  617. pCurRes[t] = pCurRes[t] * 10;
  618. }
  619. }
  620. */
  621. /*****************************************************************************
  622. * CBackend::ResRecons *
  623. *---------------------*
  624. * Description:
  625. * Obtains output prosody modified residual
  626. *
  627. ********************************************************************** MC ***/
  628. void CBackend::ResRecons( float *pInRes,
  629. long InSize,
  630. float *pOutRes,
  631. long OutSize,
  632. float scale )
  633. {
  634. SPDBG_FUNC( "CBackend::ResRecons" );
  635. long i, j;
  636. if( m_pRevFlag[m_EpochIndex] )
  637. {
  638. //----------------------------------------------------
  639. // Process repeated and reversed UNvoiced residual
  640. //----------------------------------------------------
  641. for( i = 0, j = OutSize-1; i < OutSize; ++i, --j )
  642. {
  643. pOutRes[i] = pInRes[j];
  644. }
  645. }
  646. else if( InSize == OutSize )
  647. {
  648. //----------------------------------------------------
  649. // Unvoiced residual or voiced residual
  650. // with no pitch change
  651. //----------------------------------------------------
  652. memcpy( pOutRes, pInRes, sizeof(float) *OutSize );
  653. }
  654. else
  655. {
  656. //----------------------------------------------------
  657. // Process voiced residual
  658. //----------------------------------------------------
  659. PSOLA_Stretch( pInRes, InSize, pOutRes, OutSize, m_pWindow, m_FFTSize );
  660. }
  661. //----------------------------------
  662. // Amplify frame
  663. //----------------------------------
  664. if( scale != 1.0f )
  665. {
  666. for( i = 0 ; i < OutSize; ++i )
  667. {
  668. pOutRes[i] *= scale;
  669. }
  670. }
  671. } /* CBackend::ResRecons */
  672. /*****************************************************************************
  673. * CBackend::StartNewUnit *
  674. *------------------------*
  675. * Description:
  676. * Synthesize audio samples for a target unit
  677. *
  678. * INPUT:
  679. * pCurUnit - unit ID, F0, duration, etc.
  680. *
  681. * OUTPUT:
  682. * Sets 'pCurUnit->csamplesOut' with audio length
  683. *
  684. ********************************************************************** MC ***/
  685. HRESULT CBackend::StartNewUnit( )
  686. {
  687. SPDBG_FUNC( "CBackend::StartNewUnit" );
  688. long cframeMax = 0, cInEpochs = 0, i;
  689. float totalDuration, durationOut, durationMpy = 0;
  690. UNITINFO *pCurUnit;
  691. HRESULT hr = S_OK;
  692. SPEVENT event;
  693. ULONGLONG clientInterest;
  694. USHORT volumeVal;
  695. // Check for VOLUME change
  696. if( m_pOutputSite->GetActions() & SPVES_VOLUME )
  697. {
  698. hr = m_pOutputSite->GetVolume( &volumeVal );
  699. if ( SUCCEEDED( hr ) )
  700. {
  701. if( volumeVal > SPMAX_VOLUME )
  702. {
  703. //--- Clip rate to engine maximum
  704. volumeVal = SPMAX_VOLUME;
  705. }
  706. else if ( volumeVal < SPMIN_VOLUME )
  707. {
  708. //--- Clip rate to engine minimum
  709. volumeVal = SPMIN_VOLUME;
  710. }
  711. m_MasterVolume = volumeVal;
  712. }
  713. }
  714. //---------------------------------------
  715. // Delete previous unit
  716. //---------------------------------------
  717. CleanUpSynth( );
  718. //---------------------------------------
  719. // Get next phon
  720. //---------------------------------------
  721. hr = m_pSrcObj->NextData( (void**)&pCurUnit, &m_SpeechState );
  722. if( m_SpeechState == SPEECH_CONTINUE )
  723. {
  724. m_HasSpeech = pCurUnit->hasSpeech;
  725. m_pOutputSite->GetEventInterest( &clientInterest );
  726. //------------------------------------------------
  727. // Post SENTENCE event
  728. //------------------------------------------------
  729. if( (pCurUnit->flags & SENT_START_FLAG) && (clientInterest & SPFEI(SPEI_SENTENCE_BOUNDARY)) )
  730. {
  731. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  732. event.eEventId = SPEI_SENTENCE_BOUNDARY;
  733. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  734. event.lParam = pCurUnit->sentencePosition; // Input word position
  735. event.wParam = pCurUnit->sentenceLen; // Input word length
  736. m_pOutputSite->AddEvents( &event, 1 );
  737. }
  738. //------------------------------------------------
  739. // Post PHONEME event
  740. //------------------------------------------------
  741. if( clientInterest & SPFEI(SPEI_PHONEME) )
  742. {
  743. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  744. event.eEventId = SPEI_PHONEME;
  745. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  746. event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_IPAToAllo[pCurUnit->AlloID];
  747. event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_IPAToAllo[pCurUnit->NextAlloID];
  748. m_pOutputSite->AddEvents( &event, 1 );
  749. }
  750. //------------------------------------------------
  751. // Post VISEME event
  752. //------------------------------------------------
  753. if( clientInterest & SPFEI(SPEI_VISEME) )
  754. {
  755. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  756. event.eEventId = SPEI_VISEME;
  757. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  758. event.lParam = ((ULONG)pCurUnit->AlloFeatures << 16) + g_AlloToViseme[pCurUnit->AlloID];
  759. event.wParam = ((ULONG)(pCurUnit->duration * 1000.0f) << 16) + g_AlloToViseme[pCurUnit->NextAlloID];
  760. m_pOutputSite->AddEvents( &event, 1 );
  761. }
  762. //------------------------------------------------
  763. // Post any bookmark events
  764. //------------------------------------------------
  765. if( pCurUnit->pBMObj != NULL )
  766. {
  767. CBookmarkList *pBMObj;
  768. BOOKMARK_ITEM* pMarker;
  769. //-------------------------------------------------
  770. // Retrieve marker strings from Bookmark list and
  771. // enter into Event list
  772. //-------------------------------------------------
  773. pBMObj = (CBookmarkList*)pCurUnit->pBMObj;
  774. //cMarkerCount = pBMObj->m_BMList.GetCount();
  775. if( clientInterest & SPFEI(SPEI_TTS_BOOKMARK) )
  776. {
  777. //---------------------------------------
  778. // Send event for every bookmark in list
  779. //---------------------------------------
  780. SPLISTPOS listPos;
  781. listPos = pBMObj->m_BMList.GetHeadPosition();
  782. while( listPos )
  783. {
  784. pMarker = (BOOKMARK_ITEM*)pBMObj->m_BMList.GetNext( listPos );
  785. event.eEventId = SPEI_TTS_BOOKMARK;
  786. event.elParamType = SPET_LPARAM_IS_STRING;
  787. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  788. //--- Copy in bookmark string - has been NULL terminated in source already...
  789. event.lParam = pMarker->pBMItem;
  790. // Engine must convert string to long for wParam.
  791. event.wParam = _wtol((WCHAR *)pMarker->pBMItem);
  792. m_pOutputSite->AddEvents( &event, 1 );
  793. }
  794. }
  795. //---------------------------------------------
  796. // We don't need this Bookmark list any more
  797. //---------------------------------------------
  798. delete pBMObj;
  799. pCurUnit->pBMObj = NULL;
  800. }
  801. pCurUnit->csamplesOut = 0;
  802. //******************************************************
  803. // For SIL, fill buffer with zeros...
  804. //******************************************************
  805. if( pCurUnit->UnitID == UNIT_SIL )
  806. {
  807. //---------------------------------------------
  808. // Calc SIL length
  809. //---------------------------------------------
  810. m_durationTarget = (long)(m_SampleRate * pCurUnit->duration);
  811. m_cOutSamples_Phon = 0;
  812. m_silMode = true;
  813. //---------------------------------------------
  814. // Clear LPC filter storage
  815. //---------------------------------------------
  816. memset( m_pHistory, 0, sizeof(float)*(m_cOrder+1) );
  817. //--------------------------------
  818. // Success!
  819. //--------------------------------
  820. // Debug macro - output unit data...
  821. TTSDBG_LOGUNITS;
  822. }
  823. //******************************************************
  824. // ...otherwise fill buffer with inventory data
  825. //******************************************************
  826. else
  827. {
  828. m_silMode = false;
  829. // Get unit data from voice
  830. hr = m_pVoiceDataObj->GetUnitData( pCurUnit->UnitID, &m_Synth );
  831. if( SUCCEEDED(hr) )
  832. {
  833. durationOut = 0.0f;
  834. cInEpochs = m_Synth.cNumEpochs;
  835. m_pInEpoch = m_Synth.pEpoch;
  836. //cframeMax = PeakValue( m_pInEpoch, cInEpochs );
  837. totalDuration = (float)m_Synth.cNumSamples;
  838. //-----------------------------------------------
  839. // For debugging: Force duration to unit length
  840. //-----------------------------------------------
  841. /*float unitDur;
  842. unitDur = totalDuration / 22050.0f;
  843. if( pCurUnit->duration < unitDur )
  844. {
  845. if( pCurUnit->speechRate < 1 )
  846. {
  847. pCurUnit->duration = unitDur * pCurUnit->speechRate;
  848. }
  849. else
  850. {
  851. pCurUnit->duration = unitDur;
  852. }
  853. }*/
  854. durationMpy = pCurUnit->duration;
  855. cframeMax = (long)pCurUnit->pF0[0];
  856. for( i = 1; i < (long)pCurUnit->nKnots; i++ )
  857. {
  858. //-----------------------------------------
  859. // Find the longest epoch
  860. //-----------------------------------------
  861. cframeMax = (long)(MAX(cframeMax,pCurUnit->pF0[i]));
  862. }
  863. cframeMax *= (long)(durationMpy * MAX_TARGETS_PER_UNIT);
  864. durationMpy = (m_SampleRate * durationMpy) / totalDuration;
  865. cframeMax += (long)(durationMpy * cInEpochs * MAX_TARGETS_PER_UNIT);
  866. //
  867. // mplumpe 11/18/97 : added to eliminate chance of crash.
  868. //
  869. cframeMax *= 2;
  870. //---------------------------------------------------
  871. // New epochs adjusted for duration and pitch
  872. //---------------------------------------------------
  873. m_pOutEpoch = new float[cframeMax];
  874. if( !m_pOutEpoch )
  875. {
  876. //--------------------------------------
  877. // Out of memory!
  878. //--------------------------------------
  879. hr = E_OUTOFMEMORY;
  880. pCurUnit->csamplesOut = 0;
  881. CleanUpSynth( );
  882. }
  883. }
  884. if( SUCCEEDED(hr) )
  885. {
  886. //---------------------------------------------------
  887. // Index back to orig epoch
  888. //---------------------------------------------------
  889. m_pMap = new long[cframeMax];
  890. if( !m_pMap )
  891. {
  892. //--------------------------------------
  893. // Out of memory!
  894. //--------------------------------------
  895. hr = E_OUTOFMEMORY;
  896. pCurUnit->csamplesOut = 0;
  897. CleanUpSynth( );
  898. }
  899. }
  900. if( SUCCEEDED(hr) )
  901. {
  902. //---------------------------------------------------
  903. // TRUE = reverse residual
  904. //---------------------------------------------------
  905. m_pRevFlag = new short[cframeMax];
  906. if( !m_pRevFlag )
  907. {
  908. //--------------------------------------
  909. // Out of memory!
  910. //--------------------------------------
  911. hr = E_OUTOFMEMORY;
  912. pCurUnit->csamplesOut = 0;
  913. CleanUpSynth( );
  914. }
  915. }
  916. if( SUCCEEDED(hr) )
  917. {
  918. //---------------------------------------------------------------------
  919. // Compute synthesis epochs and corresponding mapping to analysis
  920. // fills in: m_pOutEpoch, m_pMap, m_pRevFlag
  921. //---------------------------------------------------------------------
  922. m_cOutEpochs = ProsodyMod( pCurUnit, cInEpochs, durationMpy, cframeMax );
  923. //------------------------------------------------
  924. // Now that actual epoch sizes are known,
  925. // calculate total audio sample count
  926. // @@@@ NO LONGER NEEDED
  927. //------------------------------------------------
  928. pCurUnit->csamplesOut = 0;
  929. for( i = 0; i < m_cOutEpochs; i++ )
  930. {
  931. pCurUnit->csamplesOut += (long)(ABS(m_pOutEpoch[i]));
  932. }
  933. m_cOutSamples_Phon = 0;
  934. m_EpochIndex = 0;
  935. m_durationTarget = (long)(pCurUnit->duration * m_SampleRate);
  936. m_pInRes = m_Synth.pRes;
  937. m_pLPC = m_Synth.pLPC;
  938. m_pSynthTime = pCurUnit->pTime;
  939. m_pSynthAmp = pCurUnit->pAmp;
  940. m_nKnots = pCurUnit->nKnots;
  941. // NOTE: Maybe make log volume?
  942. m_UnitVolume = (float)pCurUnit->user_Volume / 100.0f;
  943. //------------------------------------------------
  944. // Post WORD event
  945. //------------------------------------------------
  946. if( (pCurUnit->flags & WORD_START_FLAG) && (clientInterest & SPFEI(SPEI_WORD_BOUNDARY)) )
  947. {
  948. event.elParamType = SPET_LPARAM_IS_UNDEFINED;
  949. event.eEventId = SPEI_WORD_BOUNDARY;
  950. event.ullAudioStreamOffset = m_cOutSamples_Total * m_BytesPerSample;
  951. event.lParam = pCurUnit->srcPosition; // Input word position
  952. event.wParam = pCurUnit->srcLen; // Input word length
  953. m_pOutputSite->AddEvents( &event, 1 );
  954. }
  955. //--- Debug macro - output unit data
  956. TTSDBG_LOGUNITS;
  957. }
  958. }
  959. }
  960. return hr;
  961. } /* CBackend::StartNewUnit */
  962. /*****************************************************************************
  963. * CBackend::CleanUpSynth *
  964. *------------------------*
  965. * Description:
  966. *
  967. ********************************************************************** MC ***/
  968. void CBackend::CleanUpSynth( )
  969. {
  970. SPDBG_FUNC( "CBackend::CleanUpSynth" );
  971. if( m_pOutEpoch )
  972. {
  973. delete m_pOutEpoch;
  974. m_pOutEpoch = NULL;
  975. }
  976. if( m_pMap )
  977. {
  978. delete m_pMap;
  979. m_pMap = NULL;
  980. }
  981. if( m_pRevFlag )
  982. {
  983. delete m_pRevFlag;
  984. m_pRevFlag = NULL;
  985. }
  986. // NOTE: make object?
  987. FreeSynth( &m_Synth );
  988. } /* CBackend::CleanUpSynth */
  989. /*****************************************************************************
  990. * CBackend::RenderFrame *
  991. *-----------------------*
  992. * Description:
  993. * This this the central synthesis loop. Keep filling output audio
  994. * buffer until buffer frame is full or speech is done. To render
  995. * continous speech, get each unit one at a time from upstream buffer.
  996. *
  997. ********************************************************************** MC ***/
  998. HRESULT CBackend::RenderFrame( )
  999. {
  1000. SPDBG_FUNC( "CBackend::RenderFrame" );
  1001. long InSize, OutSize;
  1002. long iframe;
  1003. float *pCurInRes, *pCurOutRes;
  1004. long i, j;
  1005. float ampMpy;
  1006. HRESULT hr = S_OK;
  1007. m_cOutSamples_Frame = 0;
  1008. do
  1009. {
  1010. OutSize = 0;
  1011. if( m_silMode )
  1012. {
  1013. //-------------------------------
  1014. // Silence mode
  1015. //-------------------------------
  1016. if( m_cOutSamples_Phon >= m_durationTarget )
  1017. {
  1018. //---------------------------
  1019. // Get next unit
  1020. //---------------------------
  1021. hr = StartNewUnit( );
  1022. if (FAILED(hr))
  1023. {
  1024. //-----------------------------------
  1025. // Try to end it gracefully...
  1026. //-----------------------------------
  1027. m_SpeechState = SPEECH_DONE;
  1028. }
  1029. TTSDBG_LOGSILEPOCH;
  1030. }
  1031. else
  1032. {
  1033. //---------------------------
  1034. // Continue with current SIL
  1035. //---------------------------
  1036. m_pSpeechBuf[m_cOutSamples_Frame] = 0;
  1037. OutSize = 1;
  1038. }
  1039. }
  1040. else
  1041. {
  1042. if( m_EpochIndex < m_cOutEpochs )
  1043. {
  1044. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1045. //
  1046. // Continue with current phon
  1047. //
  1048. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1049. //------------------------------------
  1050. // Find current input residual
  1051. //------------------------------------
  1052. iframe = m_pMap[m_EpochIndex];
  1053. pCurInRes = m_pInRes;
  1054. for( i = 0; i < iframe; i++)
  1055. {
  1056. pCurInRes += (long) ABS(m_pInEpoch[i]);
  1057. }
  1058. pCurOutRes = m_pSpeechBuf + m_cOutSamples_Frame;
  1059. InSize = (long)(ABS(m_pInEpoch[iframe]));
  1060. OutSize = (long)(ABS(m_pOutEpoch[m_EpochIndex]));
  1061. if (m_cOutSamples_Frame + OutSize > SPEECH_FRAME_SIZE + SPEECH_FRAME_OVER)
  1062. {
  1063. m_pOutEpoch[m_EpochIndex] = SPEECH_FRAME_OVER-1; // still huge
  1064. OutSize = (long)(ABS(m_pOutEpoch[m_EpochIndex]));
  1065. }
  1066. j = 1;
  1067. while( (j < m_nKnots - 1) && (m_cOutSamples_Phon > m_pSynthTime[j]) )
  1068. {
  1069. j++;
  1070. }
  1071. ampMpy = LinInterp( m_pSynthTime[j - 1], (float)m_cOutSamples_Phon, m_pSynthTime[j], m_pSynthAmp[j - 1], m_pSynthAmp[j] );
  1072. //ampMpy = 1;
  1073. //--------------------------------------------
  1074. // Do stretching of residuals
  1075. //--------------------------------------------
  1076. ResRecons( pCurInRes, InSize, pCurOutRes, OutSize, ampMpy );
  1077. //--------------------------------------------
  1078. // Do LPC reconstruction
  1079. //--------------------------------------------
  1080. float *pCurLPC;
  1081. float totalGain;
  1082. totalGain = ExpConverter( ((float)m_MasterVolume / (float)SPMAX_VOLUME), m_linearScale )
  1083. * ExpConverter( m_UnitVolume, m_linearScale );
  1084. pCurLPC = m_pLPC + m_pMap[m_EpochIndex] * (1 + m_cOrder);
  1085. pCurLPC[0] = 1.0f;
  1086. LPCFilter( pCurLPC, &m_pSpeechBuf[m_cOutSamples_Frame], OutSize, totalGain );
  1087. m_EpochIndex++;
  1088. }
  1089. else
  1090. {
  1091. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1092. //
  1093. // Get next phon
  1094. //
  1095. //-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1096. hr = StartNewUnit( );
  1097. if (FAILED(hr))
  1098. {
  1099. //-----------------------------------
  1100. // Try to end it gracefully...
  1101. //-----------------------------------
  1102. m_SpeechState = SPEECH_DONE;
  1103. }
  1104. TTSDBG_LOGSILEPOCH;
  1105. }
  1106. }
  1107. m_cOutSamples_Frame += OutSize;
  1108. m_cOutSamples_Phon += OutSize;
  1109. m_cOutSamples_Total += OutSize;
  1110. TTSDBG_LOGEPOCHS;
  1111. }
  1112. while( (m_cOutSamples_Frame < SPEECH_FRAME_SIZE) && (m_SpeechState == SPEECH_CONTINUE) );
  1113. if( SUCCEEDED(hr) )
  1114. {
  1115. //----------------------------------------------
  1116. // Convert buffer from FLOAT to SHORT
  1117. //----------------------------------------------
  1118. if( m_pReverb )
  1119. {
  1120. //---------------------------------
  1121. // Add REVERB
  1122. //---------------------------------
  1123. m_pReverb->Reverb_Process( m_pSpeechBuf, m_cOutSamples_Frame, 1.0f );
  1124. }
  1125. else
  1126. {
  1127. CvtToShort( m_pSpeechBuf, m_cOutSamples_Frame, m_StereoOut, 1.0f );
  1128. }
  1129. //--- Debug Macro - output wave data to stream
  1130. TTSDBG_LOGWAVE;
  1131. }
  1132. if( SUCCEEDED( hr ) )
  1133. {
  1134. //------------------------------------
  1135. // Send this buffer to SAPI site
  1136. //------------------------------------
  1137. DWORD cbWritten;
  1138. //------------------------------------------------------------------------------------
  1139. // This was my lame hack to avoid sending buffers when nothing was spoken.
  1140. // It was causing problems (among others) since StartNewUnit() was still sending
  1141. // events - with no corresponding audio buffer!
  1142. //
  1143. // This was too simple of a scheme. Disable this feature for now...
  1144. // ...until I come up with something more robust. (MC)
  1145. //------------------------------------------------------------------------------------
  1146. //if( m_HasSpeech )
  1147. {
  1148. hr = m_pOutputSite->Write( (void*)m_pSpeechBuf,
  1149. m_cOutSamples_Frame * m_BytesPerSample,
  1150. &cbWritten );
  1151. if( FAILED( hr ) )
  1152. {
  1153. //----------------------------------------
  1154. // Abort! Unable to write audio data
  1155. //----------------------------------------
  1156. m_SpeechState = SPEECH_DONE;
  1157. }
  1158. }
  1159. }
  1160. //------------------------------------
  1161. // Return render state
  1162. //------------------------------------
  1163. return hr;
  1164. } /* CBackend::RenderFrame */