Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

885 lines
28 KiB

  1. /*********************************************************************
  2. Silence.Cpp - Code for detecting silence on an incoming audio stream
  3. begun 5/14/94 by Mike Rozak
  4. Modified 12/10/96 by John Merrill to fix up alignment problems
  5. */
  6. #include "stdafx.h"
  7. #include <malloc.h>
  8. #include "silence.h"
  9. // temporary
  10. #pragma warning(disable: 4100 4244)
  11. /*********************************************************************
  12. LowPassFilter - This low-pass filters 16-bit mono PCM data from one
  13. buffer into another.
  14. inputs
  15. short *lpSrc - Source buffer
  16. DWORD dwNumSamples - Number of samples in the source buffer
  17. short *lpDst - Destination buffer. This will be filled in
  18. with a low-passed version. It will have about an 8
  19. sample lag. This must be as large as lpSrc.
  20. short *psMax - Filled in with the new maximum.
  21. If NULL then nothing is copied.
  22. short *psMin - Filled in with the new minimum
  23. If NULL then nothing is copied.
  24. short *psAvg - Filled in with the new average
  25. If NULL then nothing is copied.
  26. DWORD dwSamplesPerSec
  27. returns
  28. DWORD - Number of samples returned. This will be <= dwNumSamples,
  29. possible dwNumSamples - 7.
  30. */
  31. DWORD LowPassFilter (short *lpSrc, DWORD dwNumSamples, short *lpDst,
  32. short *psMax, short *psMin, short *psAvg, DWORD dwSamplesPerSec)
  33. {
  34. SPDBG_FUNC( "LowPassFilter" );
  35. DWORD i;
  36. long lSum;
  37. short sSum, sMax, sMin;
  38. short *lpLag;
  39. BOOL fLow = (dwSamplesPerSec < 13000);
  40. #define SHIFTRIGHT (fLow ? 3 : 4) // # bits to shift right.
  41. #define WINDOWSIZE (1 << SHIFTRIGHT) // # samples
  42. if (dwNumSamples < (DWORD) (WINDOWSIZE+1))
  43. return 0;
  44. // take the first 8 samples and average them together.
  45. lSum = 0;
  46. for (i = 0; i < (DWORD) WINDOWSIZE; i++)
  47. lSum += lpSrc[i];
  48. sSum = (short) (lSum >> SHIFTRIGHT);
  49. //loop through the rest of the samples
  50. lpLag = lpSrc;
  51. lpSrc += WINDOWSIZE;
  52. dwNumSamples -= WINDOWSIZE;
  53. lSum = 0; // total
  54. sMax = -32768;
  55. sMin = 32767;
  56. for (i = 0;dwNumSamples; lpSrc++, lpDst++, lpLag++, i++, dwNumSamples--) {
  57. sSum = sSum - (*lpLag >> SHIFTRIGHT) + (*lpSrc >> SHIFTRIGHT);
  58. // sSum = *lpSrc; // Dont do any filtering at all
  59. *lpDst = sSum;
  60. lSum += sSum;
  61. if (sSum > sMax)
  62. sMax = sSum;
  63. if (sSum < sMin)
  64. sMin = sSum;
  65. };
  66. // whow much did we do
  67. if (psMax)
  68. *psMax = sMax;
  69. if (psMin)
  70. *psMin = sMin;
  71. if (psAvg && i)
  72. *psAvg = (short) (lSum / (long) i);
  73. return i;
  74. }
  75. /*********************************************************************
  76. QuantSamples - This quantizes the samples to +1, 0, or -1 (in place),
  77. depedning if the given value is:
  78. > sPositive then +1
  79. < sNegative then -1
  80. else 0
  81. inputs
  82. short *pSamples - Samples
  83. DWORD dwNumSamples - Number of samples
  84. short sPositive - Positive threshhold
  85. short sNegative - Negative threshhold
  86. returns
  87. none
  88. */
  89. void QuantSamples (short *pSamples, DWORD dwNumSamples,
  90. short sPositive, short sNegative)
  91. {
  92. SPDBG_FUNC( "QuantSamples" );
  93. while (dwNumSamples) {
  94. if (*pSamples > sPositive)
  95. *pSamples = 1;
  96. else if (*pSamples < sNegative)
  97. *pSamples = -1;
  98. else
  99. *pSamples = 0;
  100. pSamples++;
  101. dwNumSamples--;
  102. };
  103. }
  104. /*********************************************************************
  105. FindZC - This searches through the samples for the first zero crossing.
  106. The returned point will have its previous sample at <= 0, and the
  107. new one at >0.
  108. inputs
  109. short *pSamples - Samples;
  110. DWORD dwNumSamples - Number of samples
  111. returns
  112. DWORD - first sampe number which is positive, or 0 if cant find
  113. */
  114. DWORD FindZC (short *pSamples, DWORD dwNumSamples)
  115. {
  116. SPDBG_FUNC( "FindZC" );
  117. DWORD i;
  118. for (i = 1; i < dwNumSamples; i++)
  119. if ((pSamples[i] > 0) && (pSamples[i-1] <= 0))
  120. return i;
  121. // else cant find
  122. return 0;
  123. }
  124. /*********************************************************************
  125. CompareSegments - This compares two wave segments and sees how much
  126. alike they are, returning a confidence that they are the same.
  127. inputs
  128. short *pA - Samples. This assumes that the samples
  129. are -1, 0, or +1.
  130. short *pB - Samples for B. Should be -1, 0, or +1
  131. DWORD dwNumSamples - Number of samples in each of them
  132. returns
  133. WORD - Confidence from 0 to 0xffff (highest confidence)
  134. Notes about the algo: Each sample will score a "similarity point"
  135. for like signs, or if one of the values is a 0.
  136. */
  137. WORD CompareSegments (short *pA, short *pB, DWORD dwNumSamples)
  138. {
  139. SPDBG_FUNC( "CompareSegments" );
  140. DWORD dwSimilar = 0;
  141. DWORD dwLeft;
  142. for (dwLeft = dwNumSamples; dwLeft; pA++, pB++, dwLeft--)
  143. if ((*pA == *pB) || (*pA == 0) || (*pB == 0))
  144. dwSimilar++;
  145. return (WORD) ((dwSimilar * 0xffff) / dwNumSamples);
  146. }
  147. /*********************************************************************
  148. FindMostLikelyWaveLen - This Searches through wave data and finds the
  149. most likeley wavelength for voiced audio. it returns a condifence
  150. score from 0 to ffff (ffff is 100% positive).
  151. inputs
  152. short *pSamples - Samples
  153. DWORD dwNumSamples - Number of samples
  154. DWORD dwMinWaveLen - Minimum accepatble wavelength
  155. DWORD dwMaxWaveLen - Maximum acceptable wavelength
  156. WORD *pwConfidence - Filled in with confidence rating.
  157. returns
  158. DWORD - Wavelength found. 0 if can't deteermine anything
  159. */
  160. DWORD FindMostLikelyWaveLen (short *pSamples, DWORD dwNumSamples,
  161. DWORD dwMinWaveLen, DWORD dwMaxWaveLen, WORD *pwConfidence)
  162. {
  163. SPDBG_FUNC( "FindMostLikelyWaveLen" );
  164. #define NUMCOMP (3)
  165. DWORD dwFirstZC, i;
  166. DWORD dwBestWaveLen;
  167. WORD wBestConfidence;
  168. DWORD dwCurZC, dwCurWaveLen, dwTemp;
  169. WORD wConf, wTemp;
  170. // Step one, find the first zero crossing
  171. dwFirstZC = FindZC (pSamples, dwNumSamples);
  172. if (!dwFirstZC)
  173. return 0; // error
  174. // Start at a minimum-wavelength away and start finding a wave
  175. // which repeats three times and compares well.
  176. dwBestWaveLen = 0; // best wavelength found so far
  177. wBestConfidence = 0; // confidence of the best wavelength
  178. dwCurWaveLen = dwMinWaveLen;
  179. while (dwCurWaveLen <= dwMaxWaveLen) {
  180. // Try the first comparison
  181. dwCurZC = dwFirstZC + dwCurWaveLen;
  182. if (dwCurZC >= dwNumSamples)
  183. break; // no more samples left
  184. // find first zero crossing from the current wavelen
  185. dwTemp = FindZC (pSamples + dwCurZC, dwNumSamples - dwCurZC);
  186. if (!dwTemp)
  187. break; // no more samples left
  188. dwCurZC += dwTemp;
  189. dwCurWaveLen += dwTemp;
  190. // Make sure that we have three wavelength's worth
  191. if ((dwFirstZC + (NUMCOMP+1)*dwCurWaveLen) >= dwNumSamples)
  192. break; // cant compare this
  193. // Do two confidence tests and multiply them toegther to
  194. // get the confidence for this wavelength
  195. wConf = 0xffff;
  196. for (i = 0; i < NUMCOMP; i++) {
  197. wTemp = CompareSegments (pSamples + dwFirstZC /* + i * dwCurWaveLen */,
  198. pSamples + (dwFirstZC + (i+1) * dwCurWaveLen), dwCurWaveLen);
  199. wConf = (WORD) (((DWORD) wConf * (DWORD) wTemp) >> 16);
  200. };
  201. // If we're more confident about this one than others then use it
  202. if (wConf >= wBestConfidence) {
  203. wBestConfidence = wConf;
  204. dwBestWaveLen = dwCurWaveLen;
  205. };
  206. // Up the current wavelength just a tad
  207. dwCurWaveLen++;
  208. };
  209. *pwConfidence = wBestConfidence;
  210. return dwBestWaveLen;
  211. }
  212. /*********************************************************************
  213. IsSegmentVoiced - This detects if the segment if voiced or not.
  214. inputs
  215. short *pSamples - Sample data
  216. DWORD dwNumSamples - number of samples
  217. DWORD dwSamplesPerSec - Number of sample sper second
  218. WORD wMinConfidence - Minimum condifence
  219. returns
  220. BOOL - TRUE if its definately voiced, FALSE if not or cant tell
  221. */
  222. BOOL CSilence::IsSegmentVoiced (short *pSamples, DWORD dwNumSamples,
  223. DWORD dwSamplesPerSec, WORD wMinConfidence, short *asFiltered)
  224. {
  225. SPDBG_FUNC( "CSilence::IsSegmentVoiced" );
  226. //#define FILTERNUM (1024) // max # samples i nthe filter
  227. //#define MAXVOICEHZ (300) // maximum voicce pitchm in hz
  228. //#define MINVOICEHZ (50) // minimum voice pitch in hz
  229. // #define MINCONFIDENCE (0x6000) // minimum confidence
  230. // This means that 70% of the samples line up from one wavelength
  231. // to another
  232. DWORD dwNumFilter;
  233. //short asFiltered[FILTERNUM];
  234. short sMax, sMin, sAvg;
  235. DWORD dwWaveLen;
  236. WORD wConfidence;
  237. short sPositive, sNegative;
  238. // Filter it first so we just get the voiced audio range
  239. if (dwNumSamples > FILTERNUM)
  240. dwNumSamples = FILTERNUM;
  241. dwNumFilter = LowPassFilter (pSamples, dwNumSamples, asFiltered,
  242. &sMax, &sMin, &sAvg, m_dwSamplesPerSec);
  243. // Truncate the wave samples to +1, 0, -1
  244. sPositive = sAvg;
  245. sNegative = sAvg;
  246. QuantSamples (asFiltered, dwNumFilter, sPositive, sNegative);
  247. // look through the voiced wavelengths for a frequency
  248. dwWaveLen = FindMostLikelyWaveLen (asFiltered, dwNumFilter,
  249. dwSamplesPerSec / m_dwHighFreq, dwSamplesPerSec / MINVOICEHZ,
  250. &wConfidence);
  251. return (dwWaveLen && (wConfidence >= wMinConfidence));
  252. }
  253. /*********************************************************************
  254. TrimMaxAmp - This extracts the maximum amplitude range of the wave file
  255. segment.
  256. inputs
  257. short * lpS - samples to look through
  258. WORD dwNum - number of samples
  259. returns
  260. WORD - maximum amplitude range
  261. */
  262. WORD NEAR PASCAL TrimMaxAmp (short * lpS, DWORD dwNum)
  263. {
  264. SPDBG_FUNC( "TrimMaxAmp" );
  265. DWORD i;
  266. short sMin, sMax, sTemp;
  267. sMin = 32767;
  268. sMax = (short) -32768;
  269. for (i = dwNum; i; i--) {
  270. sTemp = *(lpS++);
  271. if (sTemp < sMin)
  272. sMin = sTemp;
  273. if (sTemp > sMax)
  274. sMax = sTemp;
  275. };
  276. // If we're clipping at all then claim that we've maxed out.
  277. // Some sound cards have bad DC offsets
  278. if ((sMax >= 0x7f00) || (sMin <= -0x7f00))
  279. return 0xffff;
  280. return (WORD) (sMax - sMin);
  281. }
  282. /********************************************************************
  283. TrimMaxAmpDelta - This extracts the maximum amplitude range and
  284. calculates the maximum delta of the wave file
  285. segment.
  286. inputs
  287. PBLOCKCHAR pBlockChar - Pointer to a block characteristic
  288. structure which is filled in.
  289. short * lpS - deltas to look through
  290. WORD dwNum - number of samples
  291. returns
  292. nothing
  293. */
  294. void TrimMaxAmpDelta(PBLOCKCHAR pBlockChar, short *lpS, DWORD dwNum)
  295. {
  296. SPDBG_FUNC( "TrimMaxAmpDelta" );
  297. DWORD i;
  298. WORD wMax = 0;
  299. WORD wTemp;
  300. short sMin, sMax, sCur, sLast;
  301. // BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta
  302. sLast = sMin = sMax = *(lpS++);
  303. for (i = dwNum - 1; i; i--, sLast = sCur) {
  304. sCur = *(lpS++);
  305. // TrimMaxAmp
  306. if (sCur < sMin)
  307. sMin = sCur;
  308. if (sCur > sMax)
  309. sMax = sCur;
  310. // TrimMaxDelta
  311. wTemp = sCur > sLast ? (WORD) (sCur - sLast) : (WORD) (sLast - sCur);
  312. if (wTemp > wMax)
  313. wMax = wTemp;
  314. }
  315. // If we're clipping at all then claim that we've maxed out.
  316. // Some sound cards have bad DC offsets
  317. pBlockChar->wMaxLevel = ((sMax >= 0x7F00) || (sMin <= -0x7F00)) ? 0xFFFF : (WORD) (sMax - sMin);
  318. pBlockChar->wMaxDelta = wMax;
  319. } /* End of TrimMaxAmpDelta() */
  320. /*********************************************************************
  321. GetBlockChar - This gets the characteristics of a block of audio.
  322. This characteristics can then be used to determine if the block
  323. is silent or not.
  324. inputs
  325. short *lpS - sample data
  326. DWORD dwNum - number of samples
  327. PBLOCKCHAR pBlockChar - Pointer to a block characteristic
  328. structure which is filled in.
  329. BOOL fTestVoiced - Voicce testing will only be done if
  330. this is TTRUE (in order to save processor).
  331. returns
  332. none
  333. */
  334. void GetBlockChar(short *lpS, DWORD dwNum, PBLOCKCHAR pBlockChar, BOOL fTestVoiced)
  335. {
  336. SPDBG_FUNC( "GetBlockChar" );
  337. // BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta
  338. TrimMaxAmpDelta(pBlockChar, lpS, dwNum);
  339. pBlockChar->bIsVoiced = pBlockChar->bHighLevel =
  340. pBlockChar->bHighDelta = SIL_UNKNOWN;
  341. }
  342. /*********************************************************************
  343. IsBlockSound - This detects whether the block is silent or not.
  344. inputs
  345. PBLOCKCHAR pBlockInQuestion - Block in question. This has the
  346. bHighLevel and bHighDelta flags modified
  347. PBLOCKCHAR pBlockSilence - Silent block
  348. BOOL fInUtterance - TRUE if we're in an utterance (which
  349. means be more sensative), FALSE if we're not
  350. returns
  351. BOOL - TTRUE if has sound, FALSE if it is silent
  352. */
  353. BOOL IsBlockSound (PBLOCKCHAR pBlockInQuestion, PBLOCKCHAR pBlockSilence,
  354. BOOL fInUtterance)
  355. {
  356. SPDBG_FUNC( "IsBlockSound" );
  357. #ifdef SOFTEND // Use so that catches a soft ending to phrases
  358. #define SENSINV_THRESHHOLD_LEVEL(x) (((x)/4)*3)
  359. #define SENSINV_THRESHHOLD_DELTA(x) (((x)/4)*3)
  360. #else
  361. #define SENSINV_THRESHHOLD_LEVEL(x) ((x)/2)
  362. #define SENSINV_THRESHHOLD_DELTA(x) ((x)/2)
  363. #endif
  364. #define NORMINV_THRESHHOLD_LEVEL(x) ((x)/2)
  365. #define NORMINV_THRESHHOLD_DELTA(x) ((x)/2)
  366. if (fInUtterance) {
  367. pBlockInQuestion->bHighLevel =
  368. SENSINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel;
  369. pBlockInQuestion->bHighDelta =
  370. SENSINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta;
  371. }
  372. else {
  373. pBlockInQuestion->bHighLevel =
  374. NORMINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel;
  375. pBlockInQuestion->bHighDelta =
  376. NORMINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta;
  377. };
  378. return pBlockInQuestion->bHighLevel || pBlockInQuestion->bHighDelta;
  379. }
  380. /*********************************************************************
  381. ReEvaluateSilence - This takes the values used for silence and re-evaluates
  382. them based upon new data which indicates what silence is. It
  383. automatically adjusts to the noise level in the room over a few seconds.
  384. NOTE: This should not be called when an utterance is happening, or
  385. when it might be starting.
  386. inputs
  387. PBLOCKCHAR pSilence - This is the silence block, and should
  388. start out with values in it. It will be modified
  389. so to incorporate the new silence information.
  390. PBLOCKCHAR pNew - New block which is known to be silence.
  391. BYTE bWeight - This is the weighting of the new block
  392. in influencing the old block, in a value from 0 to 255.
  393. 256 means that the value of the new silence completely
  394. overpowers the old one, 0 means that it doesnt have
  395. any affect.
  396. returns
  397. none
  398. */
  399. void ReEvaluateSilence (PBLOCKCHAR pSilence, PBLOCKCHAR pNew,
  400. BYTE bWeight)
  401. {
  402. SPDBG_FUNC( "ReEvaluateSilence" );
  403. #define ADJUST(wOrig,wNew,bWt) \
  404. (WORD) (( \
  405. ((DWORD) (wOrig) * (DWORD) (256 - (bWt))) + \
  406. ((DWORD) (wNew) * (DWORD) (bWt)) \
  407. ) >> 8);
  408. pSilence->wMaxLevel = ADJUST (pSilence->wMaxLevel,
  409. pNew->wMaxLevel, bWeight);
  410. pSilence->wMaxDelta = ADJUST (pSilence->wMaxDelta,
  411. pNew->wMaxDelta, bWeight);
  412. // If it's way too silence (and too good to be true) then assume
  413. // a default silece
  414. // if (!pNew->wMaxLevel && !pNew->wMaxDelta) {
  415. // if (pSilence->wMaxLevel < 2500)
  416. // pSilence->wMaxLevel = 2500;
  417. // if (pSilence->wMaxDelta < 400)
  418. // pSilence->wMaxDelta = 400;
  419. // }
  420. }
  421. /*********************************************************************
  422. WhatsTheNewState - This takes in a stream of bit-field indicating which
  423. of the last 32 blocks were detected as having sound, and what our
  424. state was the last time this was called (utterance or not). It then
  425. figureous out if we're still in an utterance, or we just entered one.
  426. It also says how many buffers ago that was.
  427. inputs
  428. DWORD dwSoundBits - This is a bit-field of the last 32
  429. audio blocks. A 1 in the field indicates that there was
  430. sound there, a 0 indicates no sound. The low bit
  431. corresponds to the most recent block, and high bit
  432. the oldest.
  433. DWORD dwVoicedBits - Just like sound bits except that it indicates
  434. voiced sections of sound.
  435. BOOL fWasInUtterance - This is true is we had an utterance
  436. the last time this called, FALSE if there was silence
  437. BOOL fLongUtterance - If this is a long utterance then dont
  438. react for 1/4 second, otherwise use 1/8 second for
  439. short utterance
  440. WORD wBlocksPerSec - How many of the above-mentioned blocks
  441. fit into a second.
  442. WORD *wStarted - If a transition occurs from no utterance to
  443. an utterance, then this fills in the number of of blocks
  444. ago that the utterance started, into *wStarted. Otherwise
  445. it is not changed.
  446. WORD wReaction - Reaction time (in blocks) after an utterance is
  447. finished
  448. returns
  449. BOOL - TRUE if we're in an utterance now, FALSE if we're in silence
  450. */
  451. BOOL CSilence::WhatsTheNewState (DWORD dwSoundBits, DWORD dwVoicedBits,
  452. BOOL fWasInUtterance, BOOL fLongUtterance,
  453. WORD wBlocksPerSec, WORD *wStarted, WORD wReaction)
  454. {
  455. SPDBG_FUNC( "CSilence::WhatsTheNewState" );
  456. WORD wCount, wOneBits;
  457. WORD wTimeToCheck;
  458. DWORD dwTemp, dwMask;
  459. if (fWasInUtterance)
  460. wTimeToCheck = wReaction;
  461. else
  462. wTimeToCheck = (wBlocksPerSec/4); // 1/4 second
  463. if (!wTimeToCheck)
  464. wTimeToCheck = 1;
  465. for (wOneBits = 0, wCount = wTimeToCheck, dwTemp = dwSoundBits;
  466. wCount;
  467. dwTemp /= 2, wCount--)
  468. if (dwTemp & 0x01)
  469. wOneBits++;
  470. if (fWasInUtterance) {
  471. // If we were in an utterance, then we still are in an utterance
  472. // UNLESS the number of bits which are turned on for the last
  473. // 0.5 seconds is less that 1/4 of what should be turned on.
  474. if ( (wOneBits >= 1))
  475. return TRUE;
  476. else
  477. return FALSE;
  478. }
  479. else {
  480. // We are in silence. We cannot possible go into an utterance
  481. // until the current block is voicced
  482. if (!(dwVoicedBits & 0x01))
  483. return FALSE;
  484. // If we were in silence then we're still in silence
  485. // UNLESS the number of bits which are turned on for the last
  486. // 0.5 seconds is more than 1/2 of what should be turned on.
  487. // If so, then start the utterance 0.75 seconds ago.
  488. if (wOneBits >= (wTimeToCheck / 2)) {
  489. // we're not in an utterance
  490. // Look back until get 1/8 second of silence, and include
  491. // that in the data returned
  492. dwTemp = dwSoundBits;
  493. // dwMask = (1 << (wBlocksPerSec / 8)) - 1;
  494. // for (wCount = wBlocksPerSec/8; dwTemp & dwMask; dwTemp >>= 1, wCount++);
  495. dwMask = (1 << (wBlocksPerSec / m_wAddSilenceDiv)) - 1;
  496. for (wCount = wBlocksPerSec/m_wAddSilenceDiv; dwTemp & dwMask; dwTemp >>= 1, wCount++);
  497. *wStarted = wCount;
  498. return TRUE;
  499. }
  500. else
  501. return FALSE;
  502. };
  503. }
  504. /*********************************************************************
  505. CSilence::CSilence - This creates the silence class.
  506. inputs
  507. WORD wBlocksPerSec - Number of blocks per second. The blocks
  508. will be passed down through AddBlock().
  509. returns
  510. class
  511. */
  512. CSilence::CSilence (WORD wBlocksPerSec)
  513. {
  514. SPDBG_FUNC( "CSilence::CSilence" );
  515. m_wBlocksPerSec = min(wBlocksPerSec, 32); // no more than the # bits in a DWORD
  516. m_wBlocksInQueue = m_wBlocksPerSec; // 1 second worth.
  517. m_wLatestBlock = 0;
  518. m_paBlockInfo = NULL;
  519. m_dwSoundBits = m_dwVoicedBits = 0;
  520. m_fFirstBlock = TRUE;
  521. m_fInUtterance = FALSE;
  522. m_dwUtteranceLength = 0;
  523. m_dwSamplesPerSec = 11025;
  524. }
  525. /*********************************************************************
  526. CSilence::~CSilence - Free up everything.
  527. */
  528. CSilence::~CSilence (void)
  529. {
  530. SPDBG_FUNC( "CSilence::~CSilence" );
  531. WORD i;
  532. if (m_paBlockInfo) {
  533. for (i = 0; i < m_wBlocksInQueue; i++)
  534. if (m_paBlockInfo[i].pSamples)
  535. free(m_paBlockInfo[i].pSamples);
  536. free(m_paBlockInfo);
  537. }
  538. if (m_pASFiltered)
  539. free(m_pASFiltered);
  540. }
  541. /*********************************************************************
  542. CSilence::Init - This initializes the silence code. It basically
  543. allocates memory. It should be called immediately after the object
  544. is created and then not again.
  545. inputs
  546. none
  547. returns
  548. BOOL - TRUE if succeded, else out of memory
  549. */
  550. BOOL CSilence::Init(BOOL fPhoneOptimized, DWORD dwSamplesPerSec)
  551. {
  552. SPDBG_FUNC( "CSilence::Init" );
  553. m_dwSamplesPerSec = dwSamplesPerSec;
  554. if (fPhoneOptimized) {
  555. m_wAddSilenceDiv = (WORD) PHADD_BEGIN_SILENCE;
  556. m_dwHighFreq = PHMAXVOICEHZ;
  557. }
  558. else {
  559. m_wAddSilenceDiv = (WORD) PCADD_BEGIN_SILENCE;
  560. m_dwHighFreq = PCMAXVOICEHZ;
  561. }
  562. if ((m_pASFiltered = (short *) malloc((sizeof(short)) * FILTERNUM)) == NULL)
  563. return (FALSE);
  564. // Initialize memory for the blocks and clear it.
  565. if (m_paBlockInfo)
  566. return (TRUE);
  567. m_paBlockInfo = (PBINFO) malloc(m_wBlocksInQueue * sizeof(BINFO));
  568. if (!m_paBlockInfo)
  569. return (FALSE);
  570. if (m_wBlocksInQueue && m_paBlockInfo)
  571. memset(m_paBlockInfo, 0, m_wBlocksInQueue * sizeof(BINFO));
  572. return (TRUE);
  573. } /* End of Init() */
  574. /*********************************************************************
  575. CSilence::AddBlock - This does the following:
  576. - Add the block the the queue. Free up an old block if needed.
  577. The block should be 1/wBlocksPerSec long (about).
  578. - Analyze the block to see if its got sound or is quiet.
  579. - Fill in *wVU with a VU level.
  580. - Return TRUE if we're in an utterance, FALSE if its silence now.
  581. If TRUE then app should call GetBlock() until no more blocks left,
  582. and pass them to the SR engine.
  583. inputs
  584. short *pSamples - Pointer to samples. This memory should
  585. be allocaed with malloc(), and may be freed by the
  586. object.
  587. DWORD dwNumSamples - Number of samples
  588. WORD *wVU - This is fille in with the VU meter for the block
  589. QWORD qwTimeStamp - Time stamp for this buffer.
  590. returns
  591. BOOL - TRUE if an utterance is taking place, FALSE if its silent
  592. */
  593. BOOL CSilence::AddBlock (short *pSamples, DWORD dwNumSamples,
  594. WORD *wVU, QWORD qwTimeStamp)
  595. {
  596. SPDBG_FUNC( "CSilence::AddBlock" );
  597. BLOCKCHAR bcNew;
  598. BOOL fSound, fUtt;
  599. PBINFO pbInfo;
  600. WORD wUttStart, i;
  601. // Dont add empty blocks
  602. if (!dwNumSamples) {
  603. if (pSamples)
  604. free (pSamples);
  605. return m_fInUtterance;
  606. };
  607. // Analyze the block for characteristics.
  608. GetBlockChar (pSamples, dwNumSamples, &bcNew, !m_fInUtterance);
  609. // fill in the vu
  610. *wVU = bcNew.wMaxLevel;
  611. // see if it's silent or not
  612. if (m_fFirstBlock) {
  613. // first block, so of course its silent
  614. m_bcSilence = bcNew;
  615. m_fFirstBlock = FALSE;
  616. fSound = FALSE;
  617. // BUGFIX 2466 - If it's way too silence (and too good to be true) then assume
  618. // a default silece
  619. if ((m_bcSilence.wMaxLevel < 500) || (m_bcSilence.wMaxDelta < 100)) {
  620. m_bcSilence.wMaxLevel = 2500;
  621. m_bcSilence.wMaxDelta = 400;
  622. };
  623. // If it's way too loud then cut down
  624. if ((m_bcSilence.wMaxLevel > 2500) || (m_bcSilence.wMaxDelta > 1500)) {
  625. m_bcSilence.wMaxLevel = min (m_bcSilence.wMaxLevel, 2500);
  626. m_bcSilence.wMaxDelta = min (m_bcSilence.wMaxDelta, 1500);
  627. };
  628. }
  629. else {
  630. fSound = IsBlockSound (&bcNew, &m_bcSilence, m_fInUtterance);
  631. };
  632. // Test to see if the block is voiced if:
  633. // - The amplitude level is more than background sound
  634. // - We're not yet in an utterance (to save processor)
  635. if (bcNew.bHighLevel && !m_fInUtterance) {
  636. WORD wNoise;
  637. wNoise = (m_dwSamplesPerSec <= 13000) ?
  638. m_wNoiseThresh :
  639. ((m_wNoiseThresh / 3) * 2);
  640. bcNew.bIsVoiced = this->IsSegmentVoiced (pSamples, dwNumSamples, m_dwSamplesPerSec, wNoise, m_pASFiltered) ?
  641. SIL_YES : SIL_NO;
  642. }
  643. // add the block
  644. m_dwVoicedBits = (m_dwVoicedBits << 1) |
  645. ( (bcNew.bIsVoiced == SIL_YES) ? 1 : 0 );
  646. m_dwSoundBits = (m_dwSoundBits << 1) | (fSound ? 1 : 0);
  647. m_wLatestBlock++;
  648. if (m_wLatestBlock >= m_wBlocksInQueue)
  649. m_wLatestBlock = 0;
  650. pbInfo = m_paBlockInfo + m_wLatestBlock;
  651. if (pbInfo->pSamples)
  652. free (pbInfo->pSamples);
  653. pbInfo->pSamples = pSamples;
  654. pbInfo->dwNumSamples = dwNumSamples;
  655. // BUGFIX: Alignment code. We need to store the timestamp for
  656. // the BEGINNING of the block, not the end!
  657. pbInfo->qwTimeStamp = qwTimeStamp - dwNumSamples * sizeof(WORD);
  658. // What's our utterance state?
  659. fUtt = this->WhatsTheNewState (m_dwSoundBits, m_dwVoicedBits, m_fInUtterance,
  660. m_dwUtteranceLength >= m_wBlocksPerSec,
  661. m_wBlocksPerSec, &wUttStart, m_wReaction);
  662. if (fUtt && !m_fInUtterance) {
  663. // We just entered an utterance, so wUttStart has a valid teerm
  664. // in it. Go through the buffer queue and free all buffers which
  665. // are older than wUttStart. Remembeer, this is a circular buffer
  666. for (i = 0; i < (m_wBlocksInQueue - wUttStart); i++) {
  667. pbInfo = m_paBlockInfo +
  668. ( (m_wLatestBlock + i + 1) % m_wBlocksInQueue);
  669. if (pbInfo->pSamples)
  670. free (pbInfo->pSamples);
  671. pbInfo->pSamples = NULL;
  672. };
  673. // Since we just entered an utterance clear the utterance length counter
  674. m_dwUtteranceLength = 0;
  675. };
  676. m_fInUtterance = fUtt;
  677. // Remember how long this utterance has done on. Long utterances
  678. // deserve more patience as far as silence goes
  679. m_dwUtteranceLength++;
  680. // Adjust the silence level if we're not in an utterance
  681. // Requiring !fSound so that we dont accidentally indclude any
  682. // utterance sections in the sound calculations
  683. if (!m_fInUtterance /* && !fSound */) {
  684. ReEvaluateSilence (&m_bcSilence, &bcNew,
  685. 255 / m_wBlocksPerSec);
  686. }
  687. else if (m_dwUtteranceLength >= ((DWORD)m_wBlocksPerSec * 30))
  688. // if we have a very long utterance (> 30 second) then it's not
  689. ReEvaluateSilence (&m_bcSilence, &bcNew, 255 / m_wBlocksPerSec);
  690. // done
  691. return m_fInUtterance;
  692. }
  693. /*********************************************************************
  694. CSilence::ExpectNoiseChange - Sent to the silence detection algorithm
  695. when it should expect the noise floor to go up/down.
  696. inputs
  697. WORD wValue - Amount that noise floor should change.
  698. 0x100 = no change. > 0x100 => louder, < 0x100 => quieter
  699. returns
  700. */
  701. void CSilence::ExpectNoiseChange (WORD wValue)
  702. {
  703. SPDBG_FUNC( "CSilence::ExpectNoiseChange" );
  704. DWORD dwTemp;
  705. dwTemp = ((DWORD) m_bcSilence.wMaxLevel * wValue) >> 8;
  706. if (dwTemp > 0xffff)
  707. dwTemp = 0xffff;
  708. m_bcSilence.wMaxLevel = (WORD) dwTemp;
  709. dwTemp = ((DWORD) m_bcSilence.wMaxDelta * wValue) >> 8;
  710. if (dwTemp > 0xffff)
  711. dwTemp = 0xffff;
  712. m_bcSilence.wMaxDelta = (WORD) dwTemp;
  713. }
  714. /*********************************************************************
  715. CSilence::GetBlock - This gets a block from the queue. This will fail
  716. if there are no more blocks left to get OR if there's not utterance.
  717. inputs
  718. DWORD *pdwNumSamples - If a block is returned then this
  719. will be filled in with the number of samples in the block.
  720. QWORD *pqwTimeStamp - Filled in woth the time-stamp for the
  721. buffer.
  722. returns
  723. short * - Pointer to a block of samples. This memory is the
  724. caller's property and can be freed with free().
  725. */
  726. short * CSilence::GetBlock (DWORD *pdwNumSamples, QWORD * pqwTimeStamp)
  727. {
  728. SPDBG_FUNC( "CSilence::GetBlock" );
  729. PBINFO pbInfo;
  730. WORD i, wCount;
  731. short *pSamples;
  732. if (!m_fInUtterance)
  733. return NULL;
  734. // find the first occurance
  735. i = (m_wLatestBlock + 1) % m_wBlocksInQueue;
  736. for (wCount = m_wBlocksInQueue; wCount;
  737. i = ((i < (m_wBlocksInQueue-1)) ? (i+1) : 0), wCount-- ) {
  738. pbInfo = m_paBlockInfo + i;
  739. if (pbInfo->pSamples) {
  740. *pdwNumSamples = pbInfo->dwNumSamples;
  741. *pqwTimeStamp = pbInfo->qwTimeStamp;
  742. pSamples = pbInfo->pSamples;
  743. pbInfo->pSamples = NULL;
  744. return pSamples;
  745. };
  746. };
  747. // if got here then couldnt find anything
  748. return NULL;
  749. }
  750. /*********************************************************************
  751. CSilence::KillUtterance - Kills an exitsing utterance.
  752. inputs
  753. none
  754. returns
  755. none
  756. */
  757. void CSilence::KillUtterance (void)
  758. {
  759. SPDBG_FUNC( "CSilence::KillUtterance" );
  760. m_fInUtterance = FALSE;
  761. m_dwSoundBits = 0;
  762. m_dwVoicedBits = 0;
  763. }