Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1121 lines
41 KiB

  1. #include "basetypes.h"
  2. #include "mathlib/ssemath.h"
  3. #include "soundsystem/lowlevel.h"
  4. #include "mix.h"
  5. #include "tier0/vprof.h"
  6. // simple inline to test alignemnt of a value
  7. inline bool IsAlign4( uint nAlign )
  8. {
  9. return ( nAlign & 3 ) == 0;
  10. }
  11. inline bool IsAligned16Bytes( void *p )
  12. {
  13. return ( uintp( p ) & 0xF ) ? false : true;
  14. }
// this processes the low-level mix command list and produces pResults
// Interprets mixSetup.m_commands in order.  Commands read and write the float
// mix buffers in pResults->m_pOutput, addressed by index (cmd.m_nOutput /
// cmd.m_nInput0 / cmd.m_nInput1).
//   pResults - output mix buffers, debug meters and output-level array (resized here)
//   mixState - per-voice input sources plus global DSP state
//   mixSetup - the command list, processor table and buffer counts
void ProcessAudioMix( CAudioMixResults *pResults, const CAudioMixState &mixState, CAudioMixDescription &mixSetup )
{
	// set up with current counts
	pResults->m_pOutput.RemoveAll();
	pResults->m_pOutput.SetCount( mixSetup.m_nMixBufferMax );
	pResults->m_debugOutputs.SetCount( mixSetup.m_nDebugOutputCount );
	pResults->m_flOutputLevels.SetCount( mixSetup.m_nOutputLevelCount );
	// now run the commands
	VPROF("IAudioMix::Process");
	for ( int i = 0; i < mixSetup.m_commands.Count(); i++ )
	{
		audio_mix_command_t &cmd = mixSetup.m_commands[i];
		switch( cmd.m_nCommandId )
		{
		case AUDIO_MIX_CLEAR:
			// fill the target buffer with silence
			SilenceBuffer( pResults->m_pOutput[ cmd.m_nOutput ].m_flData );
			break;
		case AUDIO_MIX_EXTRACT_SOURCE:
			// convert source (input0) samples to float into the target buffer;
			// m_flParam1 matches the rate parameter of AUDIO_MIX_ADVANCE_SOURCE below
			ConvertSourceToFloat( *mixState.GetInput( cmd.m_nInput0 ), cmd.m_flParam1, pResults->m_pOutput[cmd.m_nOutput].m_flData, mixState.GetOutput( cmd.m_nInput0 ) );
			break;
		case AUDIO_MIX_ADVANCE_SOURCE:
			// advance the source's read position without extracting samples
			AdvanceSource( *mixState.GetInput( cmd.m_nInput0 ), cmd.m_flParam1, mixState.GetOutput( cmd.m_nInput0 ) );
			break;
		case AUDIO_MIX_MULTIPLY:
			// output buffer = input0 buffer scaled by m_flParam0
			ScaleBuffer( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0 );
			break;
		case AUDIO_MIX_PROCESS:
			{
				// run a DSP processor (input1 indexes the processor table) reading
				// input0, writing output; m_flParam0 carries an int parameter
				CAudioProcessor *pProc = mixSetup.m_processors[cmd.m_nInput1];
				pProc->Process( &pResults->m_pOutput[cmd.m_nInput0], &pResults->m_pOutput[cmd.m_nOutput], int(cmd.m_flParam0), mixState.DSPGlobals() );
			}
			break;
		case AUDIO_MIX_ACCUMULATE:
			// mix input0 into output at gain m_flParam0
			MixBuffer( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0 );
			break;
		case AUDIO_MIX_ACCUMULATE_RAMP:
			// mix input0 into output with gain ramping m_flParam0 -> m_flParam1
			MixBufferRamp( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0, cmd.m_flParam1 );
			break;
		case AUDIO_MIX_SUM:
			// output = input0 * m_flParam0 + input1 * m_flParam1
			SumBuffer2x1( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0, pResults->m_pOutput[cmd.m_nInput1].m_flData, cmd.m_flParam1 );
			break;
		case AUDIO_MIX_SWAP:
			// exchange the contents of the two buffers
			SwapBuffersInPlace( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData );
			break;
		case AUDIO_MIX_MEASURE_DEBUG_LEVEL:
			{
				// measure peak level of nChannelCount consecutive buffers starting
				// at input0, normalized from 16-bit full scale to [0,1]
				int nChannelCount = cmd.m_nInput1;
				mix_debug_outputs_t &debugOut = pResults->m_debugOutputs[cmd.m_nOutput];
				debugOut.m_flLevel = 0.0f;
				const float flScale = 1.0f / 32768.0f;
				for ( int nChan = 0; nChan < nChannelCount; nChan++ )
				{
					debugOut.m_flChannelLevels[nChan] = flScale * BufferLevel( pResults->m_pOutput[cmd.m_nInput0 + nChan].m_flData );
					// overall level is the max across channels
					debugOut.m_flLevel = Max( debugOut.m_flLevel, debugOut.m_flChannelLevels[nChan] );
				}
				debugOut.m_nChannelCount = nChannelCount;
			}
			break;
		case AUDIO_MIX_OUTPUT_LEVEL:
			{
				// measure average amplitude across nChannelCount buffers, keep the
				// loudest channel, clamp to [0,1] and store in the level array
				int nChannelCount = cmd.m_nInput1;
				float flLevel = 0.0f;
				const float flScale = 1.0f / 32768.0f;
				for ( int nChan = 0; nChan < nChannelCount; nChan++ )
				{
					float flOut = flScale * AvergeBufferAmplitude( pResults->m_pOutput[cmd.m_nInput0 + nChan].m_flData );
					flLevel = Max( flLevel, flOut );
				}
				pResults->m_flOutputLevels[cmd.m_nOutput] = clamp( flLevel, 0.0f, 1.0f );
			}
			break;
		default:
			// unhandled command id
			Assert( 0 );
			//AssertMsg( 0, "Unknown mix command %d\n", int(cmd.m_nCommandId) );
			break;
		}
	}
}
  94. void CAudioMixCommandList::ClearMultichannel( uint16 nTarget, int nCount )
  95. {
  96. for ( int i = 0; i < nCount; i++ )
  97. {
  98. audio_mix_command_t cmd;
  99. cmd.Init( AUDIO_MIX_CLEAR, nTarget + i );
  100. m_commands.AddToTail( cmd );
  101. }
  102. }
  103. void CAudioMixCommandList::ScaleMultichannel( uint16 nOutput, uint16 nInput, int nCount, float flVolume )
  104. {
  105. for ( int i = 0; i < nCount; i++ )
  106. {
  107. audio_mix_command_t cmd;
  108. cmd.Init( AUDIO_MIX_MULTIPLY, nOutput + i, nInput + i, flVolume );
  109. m_commands.AddToTail( cmd );
  110. }
  111. }
// Emit accumulate commands mixing an nInputChannels-wide buffer group into an
// nOutputChannels-wide group at gain flInputVolume, up/downmixing when the
// channel counts differ.  Channel order follows the inline comments below
// (L, R, C, ?, rear-L, rear-R); channel 3 — presumably the LFE — is never
// mixed in either direction (TODO confirm against the engine's channel layout).
void CAudioMixCommandList::AccumulateMultichannel( uint16 nOutput, int nOutputChannels, uint16 nInput, int nInputChannels, float flInputVolume )
{
	if ( nOutputChannels == nInputChannels )
	{
		// same layout: straight channel-for-channel accumulate
		for ( int i = 0; i < nInputChannels; i++ )
		{
			AccumulateToBuffer( nOutput + i, nInput + i, flInputVolume );
		}
	}
	else
	{
		// need to downmix or expand channels
		if ( nOutputChannels == 2 )
		{
			// downmix 6 ch to 2 ch
			Assert( nInputChannels == 6 ); // other cases should have been handled above or there's more code to write
			// out.left += 0.5 * (in.left + in.center*0.5) + 0.5 * in.rear_left
			AccumulateToBuffer( nOutput + 0, nInput + 0, flInputVolume * 0.5f );
			AccumulateToBuffer( nOutput + 0, nInput + 2, flInputVolume * 0.25f );
			AccumulateToBuffer( nOutput + 0, nInput + 4, flInputVolume * 0.5f );
			// out.right += 0.5 * (in.right + in.center*0.5) + 0.5 * in.rear_right
			AccumulateToBuffer( nOutput + 1, nInput + 1, flInputVolume * 0.5f );
			AccumulateToBuffer( nOutput + 1, nInput + 2, flInputVolume * 0.25f );
			AccumulateToBuffer( nOutput + 1, nInput + 5, flInputVolume * 0.5f );
		}
		else if ( nOutputChannels == 6 )
		{
			// expand 2ch to 6 ch
			Assert( nInputChannels == 2 );
			// out.left += in.left
			AccumulateToBuffer( nOutput + 0, nInput + 0, flInputVolume );
			// out.right += in.right
			AccumulateToBuffer( nOutput + 1, nInput + 1, flInputVolume );
			// out.center += 0.5f * (in.left + in.right)
			AccumulateToBuffer( nOutput + 2, nInput + 0, flInputVolume * 0.5f );
			AccumulateToBuffer( nOutput + 2, nInput + 1, flInputVolume * 0.5f );
			// out.rear_left += in.left
			AccumulateToBuffer( nOutput + 4, nInput + 0, flInputVolume );
			// out.rear_right += in.right
			AccumulateToBuffer( nOutput + 5, nInput + 1, flInputVolume );
		}
		else if ( nOutputChannels == 8 && (nInputChannels == 2 || nInputChannels == 6) )
		{
			// right now we just use this for solo/debug, copy
			for ( int i = 0; i < nInputChannels; i++ )
			{
				AccumulateToBuffer( nOutput + i, nInput + i, flInputVolume );
			}
		}
		else
		{
			// some other case we haven't implemented
			Assert(0);
		}
	}
}
// Shift each of the 8 16-bit lanes of inputValue right by the count in the
// low 64 bits of shiftBitCount.  NOTE: _mm_srl_epi16 is a LOGICAL shift
// (zero-filling), not an arithmetic one — negative lanes do not keep sign.
FORCEINLINE shortx8 ShiftRightShortSIMD( const shortx8 &inputValue, const shortx8 &shiftBitCount )
{
	return _mm_srl_epi16( inputValue, shiftBitCount );
}
// Sign-extend the LOW 4 of the 8 signed 16-bit lanes to 4 signed 32-bit lanes.
// The compare produces 0xFFFF for negative lanes / 0x0000 otherwise, which
// becomes the high half of each 32-bit result when interleaved.
FORCEINLINE shortx8 SignedExtractLowAsInt32( const shortx8 &a )
{
	shortx8 signExtend = _mm_cmplt_epi16( a, _mm_setzero_si128() );
	return _mm_unpacklo_epi16( a, signExtend );
}
// Sign-extend the HIGH 4 of the 8 signed 16-bit lanes to 4 signed 32-bit lanes
// (companion to SignedExtractLowAsInt32).
FORCEINLINE shortx8 SignedExtractHighAsInt32( const shortx8 &a )
{
	shortx8 signExtend = _mm_cmplt_epi16( a, _mm_setzero_si128() );
	return _mm_unpackhi_epi16( a, signExtend );
}
// Convert 4 floats to 4 signed 32-bit ints with rounding per the current
// MXCSR mode (round-to-nearest-even by default), unlike scalar (int) casts
// which truncate.  NOTE: the "Roundt" typo is kept — callers use this name.
FORCEINLINE shortx8 RoundtFloatToInt32( const fltx4 &input )
{
	return _mm_cvtps_epi32( input );
}
// Pack two registers of 4 signed 32-bit ints into 8 signed 16-bit lanes,
// SATURATING each value to [-32768, 32767] (this is what clamps the audio
// samples in the SIMD conversion paths below).
FORCEINLINE shortx8 PackInt32x2ToShortx8( const shortx8 &input0, const shortx8 &input1 )
{
	return _mm_packs_epi32( input0, input1 );
}
// Load 8 16-bit words (128 bits) into a SIMD register; pSIMD must be
// 16-byte aligned.
FORCEINLINE shortx8 LoadAlignedShortx8SIMD( const void * RESTRICT pSIMD )
{
	return _mm_load_si128( reinterpret_cast<const __m128i *>( pSIMD ) );
}
// Load 8 16-bit words (128 bits) into a SIMD register from an address with
// no alignment requirement.
FORCEINLINE shortx8 LoadUnalignedShortx8SIMD( const void * RESTRICT pSIMD )
{
	return _mm_loadu_si128( reinterpret_cast<const __m128i *>( pSIMD ) );
}
// create a stereo interleaved signed-16 buffer from two float-32 buffers
// pOut may be unaligned (stores use StoreUnalignedSIMD), but the two input
// buffers must be 16-byte aligned (LoadAlignedSIMD).
// SIMD path clamps via the saturating pack and rounds per MXCSR; the scalar
// tail clamps explicitly but truncates with (int) — NOTE(review): rounding
// differs by up to 1 LSB between the two paths.
void ConvertFloat32Int16_Clamp_Interleave2_Unaligned( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
{
	// counts below 8 are handled entirely by the scalar loop
	if ( nSampleCount >= 8 )
	{
		int nSampleQuads = nSampleCount >> 2;
		// truncate sample count to remainder after 4-bundles
		nSampleCount &= 3;
		short *pWrite = pOut;
		for ( int i = 0; i < nSampleQuads; i++ )
		{
			// load 4 samples from left and four from right
			fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
			pflInputLeft += 4;
			fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
			pflInputRight += 4;
			shortx8 nLeft = RoundtFloatToInt32( leftSamples );
			shortx8 nRight = RoundtFloatToInt32( rightSamples );
			// interleave into L/R pairs
			shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
			shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
			// pack (saturates to [-32768,32767])
			shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
			StoreUnalignedSIMD( pWrite, nOut );
			pWrite += 8;
		}
	}
	// now convert and clamp any remaining samples (not in SIMD 4-bundles)
	// NOTE: the input pointers were advanced above, so pOut here continues
	// where the SIMD loop left off only because pOut itself was not advanced —
	// the scalar loop writes through pOut starting at the original position
	// when the SIMD loop did not run, or pOut must not alias pWrite's tail.
	for ( int i = 0; i < nSampleCount; i++ )
	{
		int l = (int)pflInputLeft[i];
		if ( l < -32768 ) l = -32768;
		if ( l > 32767 ) l = 32767;
		int r = (int)pflInputRight[i];
		if ( r < -32768 ) r = -32768;
		if ( r > 32767 ) r = 32767;
		pOut[0] = l;
		pOut[1] = r;
		pOut += 2;
	}
}
// Stereo interleave + float->int16 conversion, aligned-output fast path.
// Falls back to the _Unaligned variant when pOut is not on a 16-byte
// boundary; otherwise identical logic with aligned stores.
// Inputs must be 16-byte aligned (LoadAlignedSIMD).
void ConvertFloat32Int16_Clamp_Interleave2( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
{
	if ( !IsAligned16Bytes(pOut) )
	{
		ConvertFloat32Int16_Clamp_Interleave2_Unaligned( pOut, pflInputLeft, pflInputRight, nSampleCount );
		return;
	}
	// counts below 8 are handled entirely by the scalar loop
	if ( nSampleCount >= 8 )
	{
		int nSampleQuads = nSampleCount >> 2;
		// truncate sample count to remainder after 4-bundles
		nSampleCount &= 3;
		short *pWrite = pOut;
		for ( int i = 0; i < nSampleQuads; i++ )
		{
			// load 4 samples from left and four from right
			fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
			pflInputLeft += 4;
			fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
			pflInputRight += 4;
			shortx8 nLeft = RoundtFloatToInt32( leftSamples );
			shortx8 nRight = RoundtFloatToInt32( rightSamples );
			// interleave into L/R pairs, then pack with saturation
			shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
			shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
			shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
			StoreAlignedSIMD( pWrite, nOut );
			pWrite += 8;
		}
	}
	// now convert and clamp any remaining samples (not in SIMD 4-bundles)
	// (scalar tail truncates with (int) rather than rounding — see the
	// _Unaligned variant's note)
	for ( int i = 0; i < nSampleCount; i++ )
	{
		int l = (int)pflInputLeft[i];
		if ( l < -32768 ) l = -32768;
		if ( l > 32767 ) l = 32767;
		int r = (int)pflInputRight[i];
		if ( r < -32768 ) r = -32768;
		if ( r > 32767 ) r = 32767;
		pOut[0] = l;
		pOut[1] = r;
		pOut += 2;
	}
}
// Faster SIMD version for 6-in, 6-out
// Interleaves 6 planar float channels (each nChannelStrideFloats apart,
// starting at pflChannel0) into contiguous 6-channel int16 frames.
// Requires nSampleCount to be a multiple of 4 (caller checks IsAlign4) and
// pflChannel0 16-byte aligned.  Clamping comes from the saturating pack.
void ConvertFloat32Int16_Clamp_Interleave6( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
{
	Assert( nOutputChannelCount == 6 && nInputChannelCount == 6 && IsAligned16Bytes( pflChannel0 ) );
	const float *pInput0 = pflChannel0;
	const float *pInput1 = pflChannel0 + nChannelStrideFloats;
	const float *pInput2 = pflChannel0 + 2*nChannelStrideFloats;
	const float *pInput3 = pflChannel0 + 3*nChannelStrideFloats;
	const float *pInput4 = pflChannel0 + 4*nChannelStrideFloats;
	const float *pInput5 = pflChannel0 + 5*nChannelStrideFloats;
	short *pWrite = pOut;
	// process 24 samples per loop, grab 6 bundles of 4, write out 3 bundles of 8
	// (the numbers in the comments are positions in the interleaved output)
	for ( int i = 0; i < nSampleCount; i += 4 )
	{
		// grab 6 bundles of 4 samples
		fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 6 12 18
		fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 7 13 19
		fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 8 14 20
		fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 9 15 21
		fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 10 16 22
		fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 11 17 23
		// interleave into pairs (partial 6x4 -> 4x6 transpose, step 1)
		fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 6 1 7
		fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 12 18 13 19
		fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 8 3 9
		fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 14 20 15 21
		fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 10 5 11
		fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 22 17 23
		// now put in final order (transpose, step 2)
		fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
		fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair0, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 4 5 6 7
		fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair2, fl4Pair4, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
		fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 12 13 14 15
		fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair5, fl4Pair1, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 16 17 18 19
		fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair3, fl4Pair5, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 20 21 22 23
		// pack into 3 bundles of 8 (saturating to int16 range)
		shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
		shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
		shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
		// NOTE: Optimize alignment?
		StoreUnalignedSIMD( pWrite, nOut0 );
		StoreUnalignedSIMD( pWrite + 8, nOut1 );
		StoreUnalignedSIMD( pWrite + 16, nOut2 );
		pWrite += 24;
	}
}
// Faster SIMD version for 8-in, 8-out
// Interleaves 8 planar float channels into contiguous 8-channel int16 frames.
// Same requirements as the 6-channel version: nSampleCount a multiple of 4,
// pflChannel0 16-byte aligned; saturating pack provides the clamp.
void ConvertFloat32Int16_Clamp_Interleave8( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
{
	Assert( nOutputChannelCount == 8 && nInputChannelCount == 8 && IsAligned16Bytes( pflChannel0 ) );
	const float *pInput0 = pflChannel0;
	const float *pInput1 = pflChannel0 + nChannelStrideFloats;
	const float *pInput2 = pflChannel0 + 2 * nChannelStrideFloats;
	const float *pInput3 = pflChannel0 + 3 * nChannelStrideFloats;
	const float *pInput4 = pflChannel0 + 4 * nChannelStrideFloats;
	const float *pInput5 = pflChannel0 + 5 * nChannelStrideFloats;
	const float *pInput6 = pflChannel0 + 6 * nChannelStrideFloats;
	const float *pInput7 = pflChannel0 + 7 * nChannelStrideFloats;
	short *pWrite = pOut;
	// process 32 samples per loop, grab 8 bundles of 4, write out 4 bundles of 8
	for ( int i = 0; i < nSampleCount; i += 4 )
	{
		// grab 8 bundles of 4 samples
		fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 8 16 24
		fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 9 17 25
		fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 10 18 26
		fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 11 19 27
		fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 12 20 28
		fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 13 21 29
		fltx4 fl4Samples6 = LoadAlignedSIMD( pInput6 + i ); // 6 14 22 30
		fltx4 fl4Samples7 = LoadAlignedSIMD( pInput7 + i ); // 7 15 23 31
		// interleave into pairs (8x4 -> 4x8 transpose, step 1)
		fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 8 1 9
		fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 24 17 25
		fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 10 3 11
		fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 18 26 19 27
		fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 12 5 13
		fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 20 28 21 29
		fltx4 fl4Pair6 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 6 14 7 15
		fltx4 fl4Pair7 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 22 30 23 31
		// now put in final order (transpose, step 2)
		fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
		fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 4 5 6 7
		fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
		fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 12 13 14 15
		fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 16 17 18 19
		fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 20 21 22 23
		fltx4 fl4Out6 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 24 25 26 27
		fltx4 fl4Out7 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 28 29 30 31
		// pack into 4 bundles of 8 (saturating to int16 range)
		shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
		shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
		shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
		shortx8 nOut3 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out6 ), RoundtFloatToInt32( fl4Out7 ) );
		// NOTE: Optimize alignment?
		StoreUnalignedSIMD( pWrite, nOut0 );
		StoreUnalignedSIMD( pWrite + 8, nOut1 );
		StoreUnalignedSIMD( pWrite + 16, nOut2 );
		StoreUnalignedSIMD( pWrite + 24, nOut3 );
		pWrite += 32;
	}
}
  386. // slow version to support 4/6/8 channel devices
  387. void ConvertFloat32Int16_Clamp_InterleaveStride( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
  388. {
  389. // detect optimizable cases and call fast code
  390. if ( nInputChannelCount == 6 && nOutputChannelCount == 6 && IsAlign4( nSampleCount ) )
  391. {
  392. ConvertFloat32Int16_Clamp_Interleave6( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
  393. return;
  394. }
  395. if ( nInputChannelCount == 8 && nOutputChannelCount == 8 && IsAlign4( nSampleCount ) )
  396. {
  397. ConvertFloat32Int16_Clamp_Interleave8( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
  398. return;
  399. }
  400. // run the slower code in this case
  401. if ( nOutputChannelCount > nInputChannelCount )
  402. {
  403. for ( int i = 0; i < nSampleCount; i++ )
  404. {
  405. float *pIn = pflChannel0 + i;
  406. for ( int j = 0; j < nInputChannelCount; j++ )
  407. {
  408. int nOut = int( pIn[0] );
  409. nOut = clamp( nOut, -32768, 32767 );
  410. *pOut++ = nOut;
  411. pIn += nChannelStrideFloats;
  412. }
  413. for ( int j = nInputChannelCount; j < nOutputChannelCount; j++ )
  414. {
  415. *pOut++ = 0;
  416. }
  417. }
  418. }
  419. else
  420. {
  421. int nCopyChannels = MIN(nOutputChannelCount, nInputChannelCount);
  422. for ( int i = 0; i < nSampleCount; i++ )
  423. {
  424. float *pIn = pflChannel0 + i;
  425. for ( int j = 0; j < nCopyChannels; j++ )
  426. {
  427. int nOut = int( pIn[0] );
  428. nOut = clamp( nOut, -32768, 32767 );
  429. *pOut++ = nOut;
  430. pIn += nChannelStrideFloats;
  431. }
  432. }
  433. }
  434. Assert( nOutputChannelCount >= nInputChannelCount );
  435. }
// Convert MIX_BUFFER_SIZE signed 16-bit samples to float, 8 per iteration.
// No rescaling: output values stay in [-32768, 32767].
// flOutput must be 16-byte aligned (StoreAlignedSIMD); pIn may be unaligned.
static void ConvertShortToFloatx8( float flOutput[MIX_BUFFER_SIZE], const short *pIn )
{
	fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
	const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pIn);
	for ( int i = 0; i < (MIX_BUFFER_SIZE/8); i++ )
	{
		shortx8 samples = LoadUnalignedShortSIMD( pInput );
		pInput++;
		// sign-extend the 8 shorts to two int32x4 groups, then convert to float
		fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
		fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
		StoreAlignedSIMD( (float *)pOutput, lo );
		pOutput++;
		StoreAlignedSIMD( (float *)pOutput, hi );
		pOutput++;
	}
}
// use 15-bit fixed point fractions for resampling
#define FIX_BITS 15
#define FIX_MASK ((1ul<<FIX_BITS)-1)
// Convert a float ratio to fixed point with FIX_BITS fractional bits
// (truncates toward zero).
FORCEINLINE int FLOAT_TO_FIXED( float flVal )
{
	return int( flVal * float( 1ul << FIX_BITS ) );
}
  459. // UNDONE: This can be trivially optimized to not loop
  460. static int CalcAdvanceSamples( int nOutCount, float sampleRatio, uint *pInputOffsetFrac )
  461. {
  462. uint nRateScaleFix = FLOAT_TO_FIXED( sampleRatio );
  463. uint nSampleFrac = *pInputOffsetFrac;
  464. uint nSampleIndex = 0;
  465. for ( int i = 0; i < nOutCount; i++ )
  466. {
  467. nSampleFrac += nRateScaleFix;
  468. nSampleIndex += nSampleFrac >> FIX_BITS;
  469. nSampleFrac = nSampleFrac & FIX_MASK;
  470. }
  471. *pInputOffsetFrac = nSampleFrac;
  472. return nSampleIndex;
  473. }
// resample 16-bit audio data at the given ratio using linear interpolation
// output is 32-bits per sample float
// Produces MIX_BUFFER_SIZE output samples; returns the number of input
// samples consumed and updates *pInputOffsetFrac with the new fractional
// position.  NOTE: reads pWaveData[nSampleIndex + 1], i.e. one sample past
// the last consumed index — the caller must provide that guard sample.
static uint Resample16to32( float *pOut, const short *pWaveData, float sampleRatio, uint *pInputOffsetFrac )
{
	uint nRateScaleFix = FLOAT_TO_FIXED( sampleRatio );
	uint nSampleFrac = *pInputOffsetFrac;
	Assert( nSampleFrac < ( 1ul << FIX_BITS ) );
	uint nSampleIndex = 0;
	int nFirst, nSecond, nInterp;
	for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
	{
		// the two neighboring input samples to interpolate between
		nFirst = (int)( pWaveData[nSampleIndex] );
		nSecond = (int)( pWaveData[nSampleIndex + 1] );
#if 0
		// this expression doesn't truncate the value to 16-bits and preserves fractional samples in the float
		// output. It is a bit slower and the improved precision won't be audible unless the sample is amplified
		// or processed in some way because the output stage will simply round these back to 16-bit values
		// so disable this until we find a reason that we need it
		nInterp = ( nFirst << FIX_BITS ) + ( ( ( nSecond - nFirst ) * int( nSampleFrac ) ) );
		pOut[i] = float( nInterp ) * ( 1.0f / float( 1ul << FIX_BITS ) );
#else
		// integer lerp: first + (second-first) * frac, truncated to a whole sample
		nInterp = nFirst + ( ( ( nSecond - nFirst ) * int( nSampleFrac ) ) >> FIX_BITS );
		pOut[i] = float( nInterp );
#endif
		// step the fixed-point input cursor; carry the whole part into the index
		nSampleFrac += nRateScaleFix;
		nSampleIndex += nSampleFrac >> FIX_BITS;
		nSampleFrac = nSampleFrac & FIX_MASK;
	}
	*pInputOffsetFrac = nSampleFrac;
	return nSampleIndex;
}
// Interpolation weights for exact 2x upsampling: each output pair is
// (1.0*s[n] + 0.0*s[n+1], 0.5*s[n] + 0.5*s[n+1]).  ("Liner" is a legacy
// typo for "Linear", kept because the names are referenced below.)
const fltx4 g_fl4LinerInterp2x_lo={1.0,0.5,1.0,0.5};
const fltx4 g_fl4LinerInterp2x_hi={0.0,0.5,0.0,0.5};
// Specialized 2x-upsample resampler: fills MIX_BUFFER_SIZE float outputs from
// MIX_BUFFER_SIZE/2 input shorts using linear interpolation, entirely in SIMD.
// pInputOffsetFrac is unused (the ratio is a fixed 0.5); returns samples consumed.
// Reads one extra SIMD bundle past the consumed input for the final interp value.
static uint Resample16to32_2x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
{
	fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
	const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
	// masks selecting the first two / last two lanes of a fltx4
	fltx4 flAllOne = LoadAlignedSIMD( (float *)g_SIMD_AllOnesMask );
	fltx4 fl4FirstTwo = LoadAlignedSIMD( (float *)&g_SIMD_SkipTailMask[2] );
	fltx4 fl4LastTwo = AndNotSIMD( fl4FirstTwo, flAllOne );
	// each iteration consumes 8 input samples and produces 16 outputs
	for ( int i = 0; i < (MIX_BUFFER_SIZE/16); i++ )
	{
		shortx8 samples = LoadUnalignedShortSIMD( pInput );
		pInput++;
		fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
		fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
		shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
		// LAME: Only need one value for this but I can't be bothered to unroll this yet
		fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
		// build sAABB registers so lo*weights_lo + hi*weights_hi yields
		// ( a, (a+b)/2, b, (b+c)/2 ) pairs
		fltx4 samp0 = SplatXSIMD( lo );
		fltx4 samp1 = SplatYSIMD( lo );
		fltx4 samp0011 = OrSIMD( AndSIMD( fl4FirstTwo, samp0 ), AndSIMD( fl4LastTwo, samp1 ) );
		fltx4 samp2 = SplatZSIMD( lo );
		fltx4 samp1122 = OrSIMD( AndSIMD( fl4FirstTwo, samp1 ), AndSIMD( fl4LastTwo, samp2 ) );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp0011, MulSIMD( g_fl4LinerInterp2x_hi, samp1122 ) ) ); // 4
		pOutput++;
		fltx4 samp3 = SplatWSIMD( lo );
		fltx4 samp2233 = OrSIMD( AndSIMD( fl4FirstTwo, samp2 ), AndSIMD( fl4LastTwo, samp3 ) );
		fltx4 samp4 = SplatXSIMD( hi );
		fltx4 samp3344 = OrSIMD( AndSIMD( fl4FirstTwo, samp3 ), AndSIMD( fl4LastTwo, samp4 ) );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp2233, MulSIMD( g_fl4LinerInterp2x_hi, samp3344 ) ) ); // 8
		pOutput++;
		fltx4 samp5 = SplatYSIMD( hi );
		fltx4 samp4455 = OrSIMD( AndSIMD( fl4FirstTwo, samp4 ), AndSIMD( fl4LastTwo, samp5 ) );
		fltx4 samp6 = SplatZSIMD( hi );
		fltx4 samp5566 = OrSIMD( AndSIMD( fl4FirstTwo, samp5 ), AndSIMD( fl4LastTwo, samp6 ) );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp4455, MulSIMD( g_fl4LinerInterp2x_hi, samp5566 ) ) ); // 12
		pOutput++;
		fltx4 samp7 = SplatWSIMD( hi );
		fltx4 samp6677 = OrSIMD( AndSIMD( fl4FirstTwo, samp6 ), AndSIMD( fl4LastTwo, samp7 ) );
		fltx4 samp8 = SplatXSIMD( hi4 );
		fltx4 samp7788 = OrSIMD( AndSIMD( fl4FirstTwo, samp7 ), AndSIMD( fl4LastTwo, samp8 ) );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp6677, MulSIMD( g_fl4LinerInterp2x_hi, samp7788 ) ) ); // 16
		pOutput++;
	}
	return MIX_BUFFER_SIZE / 2;
}
// Interpolation weights for exact 4x upsampling: four outputs per input step
// at fractions 0, 1/4, 1/2, 3/4 between s[n] and s[n+1].
const fltx4 g_fl4LinerInterp4x_lo={1.0,0.75,0.5,0.25};
const fltx4 g_fl4LinerInterp4x_hi={0.0,0.25,0.5,0.75};
// Specialized 4x-upsample resampler: fills MIX_BUFFER_SIZE float outputs from
// MIX_BUFFER_SIZE/4 input shorts using linear interpolation, entirely in SIMD.
// pInputOffsetFrac is unused (the ratio is a fixed 0.25); returns samples consumed.
// Reads one extra SIMD bundle past the consumed input for the final interp value.
static uint Resample16to32_4x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
{
	fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
	const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
	// each iteration consumes 8 input samples and produces 32 outputs
	for ( int i = 0; i < (MIX_BUFFER_SIZE/32); i++ )
	{
		shortx8 samples = LoadUnalignedShortSIMD( pInput );
		pInput++;
		fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
		fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
		shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
		// LAME: Only need one value for this but I can't be bothered to unroll this yet
		fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
		// each store blends splat(s[n]) and splat(s[n+1]) with the weight tables
		fltx4 samp0 = SplatXSIMD( lo );
		fltx4 samp1 = SplatYSIMD( lo );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp0, MulSIMD( g_fl4LinerInterp4x_hi, samp1 ) ) ); // 4
		pOutput++;
		fltx4 samp2 = SplatZSIMD( lo );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp1, MulSIMD( g_fl4LinerInterp4x_hi, samp2 ) ) ); // 8
		pOutput++;
		fltx4 samp3 = SplatWSIMD( lo );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp2, MulSIMD( g_fl4LinerInterp4x_hi, samp3 ) ) ); // 12
		pOutput++;
		fltx4 samp4 = SplatXSIMD( hi );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp3, MulSIMD( g_fl4LinerInterp4x_hi, samp4 ) ) ); // 16
		pOutput++;
		fltx4 samp5 = SplatYSIMD( hi );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp4, MulSIMD( g_fl4LinerInterp4x_hi, samp5 ) ) ); // 20
		pOutput++;
		fltx4 samp6 = SplatZSIMD( hi );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp5, MulSIMD( g_fl4LinerInterp4x_hi, samp6 ) ) ); // 24
		pOutput++;
		fltx4 samp7 = SplatWSIMD( hi );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp6, MulSIMD( g_fl4LinerInterp4x_hi, samp7 ) ) ); // 28
		pOutput++;
		fltx4 samp8 = SplatXSIMD( hi4 );
		StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp7, MulSIMD( g_fl4LinerInterp4x_hi, samp8 ) ) ); // 32
		pOutput++;
	}
	return MIX_BUFFER_SIZE / 4;
}
// Convert MIX_BUFFER_SIZE signed 32-bit integer samples to float, 4 per
// iteration.  Both pIn and flOutput must be 16-byte aligned (aligned
// load/store).  shortx8 is used here simply as the 128-bit integer register
// type; the lanes are int32, not int16.
static void Convert32ToFloatx4( float flOutput[MIX_BUFFER_SIZE], int *pIn )
{
	fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
	const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pIn);
	for ( int i = 0; i < (MIX_BUFFER_SIZE/4); i++ )
	{
		shortx8 n4Samples = LoadAlignedShortx8SIMD( pInput );
		pInput++;
		fltx4 fl4Output = SignedIntConvertToFltSIMD( n4Samples );
		StoreAlignedSIMD( (float *)pOutput, fl4Output );
		pOutput++;
	}
}
  607. inline void ZeroFill( short *pBuffer, int nCount )
  608. {
  609. short *pLast = pBuffer + nCount;
  610. while ( pBuffer < pLast )
  611. {
  612. *pBuffer++ = 0;
  613. }
  614. }
// Join buffer list into a contiguous sample list
// Copies up to nSamplesNeeded mono 8-bit samples into pTemp, walking packet
// boundaries starting at (pState->m_nPacketIndex, m_nBufferSampleOffset),
// widening each 8-bit sample to 16 bits and zero-filling if the source runs
// out.  Returns pTemp.
const short *GetContiguousSamples_8Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
{
	Assert( nSamplesNeeded < nTempSampleCount );
	int nSampleIndex = pState->m_nBufferSampleOffset;
	uint nPacketIndex = pState->m_nPacketIndex;
	int nOutIndex = 0;
	for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
	{
		// byte offset == sample offset for 8-bit mono
		const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + nSampleIndex;
		int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
		Assert( nSamplesAvailable > 0 );
		int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
		for ( int i = 0; i < nCopy; i++ )
		{
			// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
			// widen to 16 bits by replicating the byte into both halves
			// (maps 0x00..0xFF onto the full 16-bit range)
			uint32 nSample = (uint8)((int32) pSourceData[i]);
			pTemp[nOutIndex+i] = (nSample<<8) | nSample;
		}
		nSamplesNeeded -= nCopy;
		nOutIndex += nCopy;
		Assert(nSamplesNeeded >= 0);
		if ( nSamplesNeeded <= 0 )
			break;
		// subsequent packets are read from their start
		nSampleIndex = 0;
	}
	// source exhausted before the request was satisfied: pad with silence
	if ( nSamplesNeeded )
	{
		ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
	}
	return pTemp;
}
// Stereo variant of GetContiguousSamples_8Mono: extracts one channel
// (nChannel = 0 or 1) from interleaved 8-bit stereo packets into pTemp,
// widening to 16 bits and zero-filling past the end of the source.
const short *GetContiguousSamples_8Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
{
	Assert( nSamplesNeeded < nTempSampleCount );
	uint nSampleIndex = pState->m_nBufferSampleOffset;
	uint nPacketIndex = pState->m_nPacketIndex;
	int nOutIndex = 0;
	for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
	{
		// frames are 2 bytes wide; select the requested channel within the frame
		const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + (nSampleIndex<<1) + nChannel;
		int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
		Assert( nSamplesAvailable > 0 );
		int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
		for ( int i = 0; i < nCopy; i++ )
		{
			// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
			// stride 2 over the interleaved frames; widen by byte replication
			uint32 nSample = (uint8)( (int32)pSourceData[i << 1] );
			pTemp[nOutIndex+i] = (nSample<<8) | nSample;
		}
		nSamplesNeeded -= nCopy;
		nOutIndex += nCopy;
		Assert(nSamplesNeeded >= 0);
		if ( nSamplesNeeded <= 0 )
			break;
		// subsequent packets are read from their start
		nSampleIndex = 0;
	}
	// source exhausted before the request was satisfied: pad with silence
	if ( nSamplesNeeded )
	{
		ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
	}
	return pTemp;
}
  678. const short *GetContiguousSamples_16Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
  679. {
  680. Assert( nSamplesNeeded <= nTempSampleCount );
  681. uint nSampleIndex = pState->m_nBufferSampleOffset;
  682. uint nPacketIndex = pState->m_nPacketIndex;
  683. if ( nPacketIndex < source.m_nPacketCount )
  684. {
  685. int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
  686. // optimization: if the entire request can be satisfied by the current packet, just point to that (don't copy)
  687. if ( nSamplesAvailable >= nSamplesNeeded )
  688. {
  689. Assert( source.m_pPackets[nPacketIndex].m_pSamples != NULL );
  690. return source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
  691. }
  692. int nOutIndex = 0;
  693. for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
  694. {
  695. const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
  696. nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
  697. Assert( nSamplesAvailable > 0 );
  698. int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
  699. V_memcpy( &pTemp[nOutIndex], pSourceData, nCopy * sizeof(short) );
  700. nSamplesNeeded -= nCopy;
  701. nOutIndex += nCopy;
  702. Assert(nSamplesNeeded >= 0);
  703. if ( nSamplesNeeded <= 0 )
  704. break;
  705. nSampleIndex = 0;
  706. }
  707. if ( nSamplesNeeded )
  708. {
  709. // pad with zeros
  710. ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
  711. }
  712. return pTemp;
  713. }
  714. return NULL;
  715. }
  716. const short *GetContiguousSamples_16Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
  717. {
  718. Assert( nSamplesNeeded < nTempSampleCount );
  719. uint nSampleIndex = pState->m_nBufferSampleOffset;
  720. uint nPacketIndex = pState->m_nPacketIndex;
  721. int nOutIndex = 0;
  722. for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
  723. {
  724. const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + (nSampleIndex<<1) + nChannel;
  725. int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
  726. Assert( nSamplesAvailable > 0 );
  727. int nCopy = MIN(nSamplesAvailable, nSamplesNeeded);
  728. for ( int i = 0; i < nCopy; i++ )
  729. {
  730. // copy every other sample to drop one channel. Note that pSourceData is already offset to the appropriate channel
  731. pTemp[nOutIndex + i] = pSourceData[ i<<1 ];
  732. }
  733. nSamplesNeeded -= nCopy;
  734. nOutIndex += nCopy;
  735. Assert(nSamplesNeeded >= 0);
  736. if ( nSamplesNeeded <= 0 )
  737. break;
  738. nSampleIndex = 0;
  739. }
  740. if ( nSamplesNeeded )
  741. {
  742. // pad with zeros
  743. ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
  744. }
  745. return pTemp;
  746. }
  747. // has this source finished playing its sample data
  748. bool IsFinished( const audio_source_input_t &source, const audio_source_indexstate_t *pCurrentState )
  749. {
  750. return pCurrentState->m_nPacketIndex >= source.m_nPacketCount ? true : false;
  751. }
// Move the source offset by some number of samples
// If necessary also advance the packet index
// Returns 0 when the new position lands inside a packet; otherwise returns the number
// of samples the advance overshot past the end of the source.
uint AdvanceSourceIndex( audio_source_indexstate_t *pOut, const audio_source_input_t &source, uint nAdvance )
{
	for ( ; pOut->m_nPacketIndex < source.m_nPacketCount; pOut->m_nPacketIndex++ )
	{
		// fold the current intra-packet offset into the remaining advance, then store the
		// combined value back as the tentative new offset within this packet
		nAdvance += pOut->m_nBufferSampleOffset;
		pOut->m_nBufferSampleOffset = nAdvance;
		// We can skip entirely within this packet by adjusting the offset, so return
		if ( nAdvance < source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount )
			return 0;
		// consume this entire packet and continue into the next one
		nAdvance -= source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount;
		pOut->m_nBufferSampleOffset = 0;
	}
	return nAdvance;
}
// Extract one mix buffer's worth (MIX_BUFFER_SIZE samples) of audio from the source,
// convert it to float output, resampling when the pitch-adjusted source rate differs
// from the mix rate, then advance the source's read position.
// Returns 1 on success, 0 if no source data could be obtained.
int ConvertSourceToFloat( const audio_source_input_t &source, float flPitch, float flOutput[MIX_BUFFER_SIZE], audio_source_indexstate_t *pOut )
{
	//TestResample();
	VPROF("ConvertSourceToFloat");
	// if float
	// join, resample
	// return;
	// if 8 bit
	// if stereo - extract/join/updepth
	// if mono - join/updepth
	// if 16 bit
	// if stereo - extract/join
	// if mono - join
	// now we have 16-bit joined mono data
	// resample and convert to float
	// for now assume 16-bit mono, joined
	// scratch buffer sized for the worst case: ratio clamped to 2.0 plus rounding/alignment slack
	short nJoinedData[MIX_BUFFER_SIZE*2 + 8];
	float flSampleRatio = 1.0f;
	int nSamplesNeeded = MIX_BUFFER_SIZE;
	// effective source rate after pitch shift
	float flSampleRate = float(source.m_nSamplingRate) * flPitch;
	bool bResample = flSampleRate != MIX_DEFAULT_SAMPLING_RATE ? true : false;
	if ( bResample )
	{
		// ratio of source samples consumed per output sample, clamped to the supported range
		flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
		flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);
		nSamplesNeeded = int( (MIX_BUFFER_SIZE * flSampleRatio) + 0.5f ) + 2; // add 2 for rounding, interpolate to next neighbor
		// some of the resampling code processes in blocks of 8 samples with SSE2 instructions, so align to nearest 8
		nSamplesNeeded = AlignValue( nSamplesNeeded, 8 );
#if _DEBUG
		// verify the fixed-point resampler cannot reference past the samples we are about to fetch
		uint64 nSampleRefCount = ( ( ( MIX_BUFFER_SIZE * FLOAT_TO_FIXED( flSampleRatio ) ) + pOut->m_nSampleFracOffset ) >> FIX_BITS ) + 1;
		Assert( nSampleRefCount <= nSamplesNeeded );
#endif
	}
	const short *pSourceData = NULL;
	// Grab a pointer to a joined set of sample data at the right length
	if ( source.m_nSampleFormat == SAMPLE_INT8_MONO )
	{
		pSourceData = GetContiguousSamples_8Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_MONO )
	{
		pSourceData = GetContiguousSamples_16Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_STEREO_L )
	{
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_STEREO_R )
	{
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT8_STEREO_L )
	{
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT8_STEREO_R )
	{
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
	}
	if ( pSourceData )
	{
		if ( bResample )
		{
			// fast paths for the common power-of-two rates relative to the mix rate
			if ( flSampleRate == 11025.0f )
			{
				nSamplesNeeded = Resample16to32_4x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
			}
			else if ( flSampleRate == 22050.0f )
			{
				nSamplesNeeded = Resample16to32_2x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
			}
			else
			{
				// slow path, resample arbitrary ratio
				VPROF("Resample_Ratio");
				nSamplesNeeded = Resample16to32( flOutput, pSourceData, flSampleRatio, &pOut->m_nSampleFracOffset );
			}
		}
		else
		{
			// 1:1 rate, just widen to float
			ConvertShortToFloatx8( flOutput, pSourceData );
		}
		// update the index state
		AdvanceSourceIndex( pOut, source, nSamplesNeeded );
		return 1;
	}
	return 0;
}
  856. int AdvanceSource( const audio_source_input_t &source, float flPitch, audio_source_indexstate_t *pOut )
  857. {
  858. float flSampleRatio = 1.0f;
  859. int nSamplesNeeded = MIX_BUFFER_SIZE;
  860. float flSampleRate = float(source.m_nSamplingRate) * flPitch;
  861. if ( flSampleRate != MIX_DEFAULT_SAMPLING_RATE )
  862. {
  863. flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
  864. flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);
  865. nSamplesNeeded = CalcAdvanceSamples( nSamplesNeeded, flSampleRatio, &pOut->m_nSampleFracOffset );
  866. }
  867. // update the index state
  868. AdvanceSourceIndex( pOut, source, nSamplesNeeded );
  869. return nSamplesNeeded;
  870. }
// constants for linear ramping
// reciprocal of the buffer length, used to turn a total gain delta into a per-sample step
const float flMixBufferSizeInv = 1.0f / MIX_BUFFER_SIZE;
// per-sample step scale replicated into all 4 SIMD lanes
const fltx4 g_fl4_MixBufferSizeInv = { flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv };
// lane offsets {1,2,3,4} used to phase each SIMD lane along the ramp
const fltx4 g_fl4_Sequence1234 = { 1.0, 2.0, 3.0, 4.0 };
  875. void ScaleBuffer( float flOutput[MIX_BUFFER_SIZE], const float input[MIX_BUFFER_SIZE], float scale )
  876. {
  877. fltx4 volume = ReplicateX4(scale);
  878. fltx4 * RESTRICT pOut = (fltx4 *)&flOutput[0];
  879. fltx4 * RESTRICT pIn = (fltx4 *)&input[0];
  880. for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
  881. {
  882. fltx4 sample = LoadAlignedSIMD( pIn );
  883. StoreAlignedSIMD( (float *)pOut, MulSIMD( volume, sample ) );
  884. pOut++;
  885. pIn++;
  886. }
  887. }
// Scale flInput into flOutput with a gain that ramps linearly from flScaleStart toward
// flScaleEnd across the buffer (sample i is scaled by flScaleStart + (i+1) * step, so
// the final sample lands exactly on flScaleEnd).
void ScaleBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Volume = ReplicateX4( flScaleStart );
	// per-sample gain increment, replicated into all 4 lanes
	fltx4 fl4VolumeStep = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Volume ) );
	// offset volume by first ramp steps
	fl4Volume = AddSIMD( fl4Volume, MulSIMD( fl4VolumeStep, g_fl4_Sequence1234 ) );
	// each loop iteration covers 4 samples, so the gain advances by 4 steps at a time
	fltx4 fl4VolumeInc = MulSIMD( fl4VolumeStep, Four_Fours );
	fltx4 * RESTRICT pOut = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pIn = (fltx4 *)&flInput[0];
	for ( int i = 0; i < MIX_BUFFER_SIZE / 4; i++ )
	{
		fltx4 fl4Sample = LoadAlignedSIMD( pIn );
		StoreAlignedSIMD( (float *)pOut, MulSIMD( fl4Volume, fl4Sample ) );
		pOut++;
		pIn++;
		fl4Volume = AddSIMD( fl4VolumeInc, fl4Volume );
	}
}
  906. void SilenceBuffer( float flBuffer[MIX_BUFFER_SIZE] )
  907. {
  908. fltx4 * RESTRICT pOut = (fltx4 *)&flBuffer[0];
  909. fltx4 fl4Zero = LoadZeroSIMD();
  910. for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
  911. {
  912. StoreAlignedSIMD( (float *)pOut, fl4Zero );
  913. pOut++;
  914. }
  915. }
  916. void SilenceBuffers( CAudioMixBuffer *pBuffers, int nBufferCount )
  917. {
  918. for ( int i = 0; i < nBufferCount; i++ )
  919. {
  920. SilenceBuffer( pBuffers[i].m_flData );
  921. }
  922. }
  923. void MixBuffer( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float scale )
  924. {
  925. fltx4 fl4Volume = ReplicateX4(scale);
  926. fltx4 * RESTRICT pOut = (fltx4 *)&flOutput[0];
  927. fltx4 * RESTRICT pIn = (fltx4 *)&flInput[0];
  928. for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
  929. {
  930. fltx4 fl4Sample = LoadAlignedSIMD( pIn );
  931. fltx4 fl4Mix = LoadAlignedSIMD( pOut );
  932. StoreAlignedSIMD( (float *)pOut, MaddSIMD( fl4Volume, fl4Sample, fl4Mix ) );
  933. pOut++;
  934. pIn++;
  935. }
  936. }
// Accumulate flInput into flOutput with a gain that ramps linearly from flScaleStart
// toward flScaleEnd across the buffer (sample i is scaled by flScaleStart + (i+1) * step,
// so the final sample lands exactly on flScaleEnd).
void MixBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Volume = ReplicateX4( flScaleStart );
	// per-sample gain increment, replicated into all 4 lanes
	fltx4 fl4VolumeStep = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Volume ) );
	// offset volume by first ramp steps
	fl4Volume = AddSIMD( fl4Volume, MulSIMD( fl4VolumeStep, g_fl4_Sequence1234 ) );
	// each loop iteration covers 4 samples, so the gain advances by 4 steps at a time
	fltx4 fl4VolumeInc = MulSIMD( fl4VolumeStep, Four_Fours );
	fltx4 * RESTRICT pOut = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pIn = (fltx4 *)&flInput[0];
	for ( int i = 0; i < MIX_BUFFER_SIZE / 4; i++ )
	{
		fltx4 fl4Sample = LoadAlignedSIMD( pIn );
		// load the running mix and multiply-accumulate the ramped sample into it
		fltx4 fl4Mix = LoadAlignedSIMD( pOut );
		StoreAlignedSIMD( (float *)pOut, MaddSIMD( fl4Volume, fl4Sample, fl4Mix ) );
		pOut++;
		pIn++;
		fl4Volume = AddSIMD( fl4VolumeInc, fl4Volume );
	}
}
  956. void SumBuffer2x1( float flOutput[MIX_BUFFER_SIZE], float flInput0[MIX_BUFFER_SIZE], float flScale0, float flInput1[MIX_BUFFER_SIZE], float flScale1 )
  957. {
  958. fltx4 fl4Scale0 = ReplicateX4(flScale0);
  959. fltx4 fl4Scale1 = ReplicateX4(flScale1);
  960. fltx4 * RESTRICT pOut = (fltx4 *)&flOutput[0];
  961. fltx4 * RESTRICT pIn0 = (fltx4 *)&flInput0[0];
  962. fltx4 * RESTRICT pIn1 = (fltx4 *)&flInput1[0];
  963. for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
  964. {
  965. fltx4 fl4Sample0 = LoadAlignedSIMD( pIn0 );
  966. fltx4 fl4Sample1 = LoadAlignedSIMD( pIn1 );
  967. StoreAlignedSIMD( (float *)pOut, MaddSIMD( fl4Scale0, fl4Sample0, MulSIMD( fl4Scale1, fl4Sample1 ) ) );
  968. pOut++;
  969. pIn0++;
  970. pIn1++;
  971. }
  972. }
  973. void SwapBuffersInPlace( float flInput0[MIX_BUFFER_SIZE], float flInput1[MIX_BUFFER_SIZE] )
  974. {
  975. fltx4 * RESTRICT pIn0 = (fltx4 *)&flInput0[0];
  976. fltx4 * RESTRICT pIn1 = (fltx4 *)&flInput1[0];
  977. for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
  978. {
  979. fltx4 fl4Sample0 = LoadAlignedSIMD( pIn0 );
  980. fltx4 fl4Sample1 = LoadAlignedSIMD( pIn1 );
  981. StoreAlignedSIMD( (float *)pIn0, fl4Sample1 );
  982. StoreAlignedSIMD( (float *)pIn1, fl4Sample0 );
  983. pIn0++;
  984. pIn1++;
  985. }
  986. }
  987. // UNDONE: OPTIMIZE: SIMD implementation
  988. float BufferLevel( float flInput0[MIX_BUFFER_SIZE] )
  989. {
  990. float flAbsMax = 0.0f;
  991. for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
  992. {
  993. flAbsMax = Max( flAbsMax, (float)fabs(flInput0[i]) );
  994. }
  995. return flAbsMax;
  996. }
  997. float AvergeBufferAmplitude( float flInput0[MIX_BUFFER_SIZE] )
  998. {
  999. float flTotal = 0;
  1000. for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
  1001. {
  1002. flTotal += fabs( flInput0[i] );
  1003. }
  1004. return flTotal * ( 1.0f / MIX_BUFFER_SIZE );
  1005. }