Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1121 lines
41 KiB

#include "basetypes.h"
#include "mathlib/ssemath.h"
#include "soundsystem/lowlevel.h"
#include "mix.h"
#include "tier0/vprof.h"
// Simple inline to test the alignment of a value: true when nAlign is a multiple of 4.
inline bool IsAlign4( uint nAlign )
{
	return ( nAlign % 4u ) == 0;
}
// True when the address p has its low four bits clear, i.e. it sits on a 16-byte boundary.
inline bool IsAligned16Bytes( void *p )
{
	return ( uintp( p ) & 0xF ) == 0;
}
// Runs every command in mixSetup.m_commands in order and stores what they produce in pResults.
// The output, debug-output and output-level arrays of pResults are resized from the counts in
// mixSetup before any command runs.
void ProcessAudioMix( CAudioMixResults *pResults, const CAudioMixState &mixState, CAudioMixDescription &mixSetup )
{
	// size the result arrays for this mix description
	pResults->m_pOutput.RemoveAll();
	pResults->m_pOutput.SetCount( mixSetup.m_nMixBufferMax );
	pResults->m_debugOutputs.SetCount( mixSetup.m_nDebugOutputCount );
	pResults->m_flOutputLevels.SetCount( mixSetup.m_nOutputLevelCount );

	// execute the command list
	VPROF("IAudioMix::Process");
	for ( int nCommand = 0; nCommand < mixSetup.m_commands.Count(); nCommand++ )
	{
		audio_mix_command_t &command = mixSetup.m_commands[nCommand];
		switch( command.m_nCommandId )
		{
		case AUDIO_MIX_CLEAR:
			{
				SilenceBuffer( pResults->m_pOutput[ command.m_nOutput ].m_flData );
			}
			break;

		case AUDIO_MIX_EXTRACT_SOURCE:
			{
				ConvertSourceToFloat( *mixState.GetInput( command.m_nInput0 ), command.m_flParam1, pResults->m_pOutput[command.m_nOutput].m_flData, mixState.GetOutput( command.m_nInput0 ) );
			}
			break;

		case AUDIO_MIX_ADVANCE_SOURCE:
			{
				AdvanceSource( *mixState.GetInput( command.m_nInput0 ), command.m_flParam1, mixState.GetOutput( command.m_nInput0 ) );
			}
			break;

		case AUDIO_MIX_MULTIPLY:
			{
				ScaleBuffer( pResults->m_pOutput[command.m_nOutput].m_flData, pResults->m_pOutput[command.m_nInput0].m_flData, command.m_flParam0 );
			}
			break;

		case AUDIO_MIX_PROCESS:
			{
				// m_nInput1 selects the processor, m_nInput0 / m_nOutput select the buffers it works on
				CAudioProcessor *pProcessor = mixSetup.m_processors[command.m_nInput1];
				pProcessor->Process( &pResults->m_pOutput[command.m_nInput0], &pResults->m_pOutput[command.m_nOutput], int(command.m_flParam0), mixState.DSPGlobals() );
			}
			break;

		case AUDIO_MIX_ACCUMULATE:
			{
				MixBuffer( pResults->m_pOutput[command.m_nOutput].m_flData, pResults->m_pOutput[command.m_nInput0].m_flData, command.m_flParam0 );
			}
			break;

		case AUDIO_MIX_ACCUMULATE_RAMP:
			{
				MixBufferRamp( pResults->m_pOutput[command.m_nOutput].m_flData, pResults->m_pOutput[command.m_nInput0].m_flData, command.m_flParam0, command.m_flParam1 );
			}
			break;

		case AUDIO_MIX_SUM:
			{
				SumBuffer2x1( pResults->m_pOutput[command.m_nOutput].m_flData, pResults->m_pOutput[command.m_nInput0].m_flData, command.m_flParam0, pResults->m_pOutput[command.m_nInput1].m_flData, command.m_flParam1 );
			}
			break;

		case AUDIO_MIX_SWAP:
			{
				SwapBuffersInPlace( pResults->m_pOutput[command.m_nOutput].m_flData, pResults->m_pOutput[command.m_nInput0].m_flData );
			}
			break;

		case AUDIO_MIX_MEASURE_DEBUG_LEVEL:
			{
				// m_nInput1 is the number of consecutive buffers (channels) starting at m_nInput0
				int nChannels = command.m_nInput1;
				mix_debug_outputs_t &debugResult = pResults->m_debugOutputs[command.m_nOutput];
				debugResult.m_flLevel = 0.0f;
				const float flInt16ToUnit = 1.0f / 32768.0f;
				for ( int nChannel = 0; nChannel < nChannels; nChannel++ )
				{
					debugResult.m_flChannelLevels[nChannel] = flInt16ToUnit * BufferLevel( pResults->m_pOutput[command.m_nInput0 + nChannel].m_flData );
					// overall level is the loudest channel
					debugResult.m_flLevel = Max( debugResult.m_flLevel, debugResult.m_flChannelLevels[nChannel] );
				}
				debugResult.m_nChannelCount = nChannels;
			}
			break;

		case AUDIO_MIX_OUTPUT_LEVEL:
			{
				// m_nInput1 is the number of consecutive buffers (channels) starting at m_nInput0
				int nChannels = command.m_nInput1;
				float flLoudest = 0.0f;
				const float flInt16ToUnit = 1.0f / 32768.0f;
				for ( int nChannel = 0; nChannel < nChannels; nChannel++ )
				{
					float flChannelLevel = flInt16ToUnit * AvergeBufferAmplitude( pResults->m_pOutput[command.m_nInput0 + nChannel].m_flData );
					flLoudest = Max( flLoudest, flChannelLevel );
				}
				pResults->m_flOutputLevels[command.m_nOutput] = clamp( flLoudest, 0.0f, 1.0f );
			}
			break;

		default:
			Assert( 0 );
			//AssertMsg( 0, "Unknown mix command %d\n", int(cmd.m_nCommandId) );
			break;
		}
	}
}
// Appends one AUDIO_MIX_CLEAR command for each of the nCount consecutive buffers starting at nTarget.
void CAudioMixCommandList::ClearMultichannel( uint16 nTarget, int nCount )
{
	for ( int nChannel = 0; nChannel < nCount; ++nChannel )
	{
		audio_mix_command_t clearCommand;
		clearCommand.Init( AUDIO_MIX_CLEAR, nTarget + nChannel );
		m_commands.AddToTail( clearCommand );
	}
}
// Appends one AUDIO_MIX_MULTIPLY command per channel: buffer nInput + k is scaled by flVolume
// into buffer nOutput + k, for k in [0, nCount).
void CAudioMixCommandList::ScaleMultichannel( uint16 nOutput, uint16 nInput, int nCount, float flVolume )
{
	for ( int nChannel = 0; nChannel < nCount; ++nChannel )
	{
		audio_mix_command_t scaleCommand;
		scaleCommand.Init( AUDIO_MIX_MULTIPLY, nOutput + nChannel, nInput + nChannel, flVolume );
		m_commands.AddToTail( scaleCommand );
	}
}
// Accumulates a multichannel input (buffers nInput..) into a multichannel output (buffers nOutput..)
// at flInputVolume, converting between channel layouts where needed:
//   same channel count, or 2/6 channels into 8  -> channel-for-channel accumulate
//   6 channels into 2                           -> downmix
//   2 channels into 6                           -> expand
// Any other combination asserts.
void CAudioMixCommandList::AccumulateMultichannel( uint16 nOutput, int nOutputChannels, uint16 nInput, int nInputChannels, float flInputVolume )
{
	// matching layouts, plus the 8-channel solo/debug case, are a straight per-channel accumulate
	const bool bSameLayout = ( nOutputChannels == nInputChannels );
	const bool bCopyInto8 = ( nOutputChannels == 8 ) && ( nInputChannels == 2 || nInputChannels == 6 );
	if ( bSameLayout || bCopyInto8 )
	{
		for ( int nChannel = 0; nChannel < nInputChannels; nChannel++ )
		{
			AccumulateToBuffer( nOutput + nChannel, nInput + nChannel, flInputVolume );
		}
		return;
	}

	const float flHalf = flInputVolume * 0.5f;
	if ( nOutputChannels == 2 )
	{
		// downmix 6 ch to 2 ch; other cases should have been handled above or there's more code to write
		Assert( nInputChannels == 6 );
		const float flQuarter = flInputVolume * 0.25f;

		// out.left += 0.5 * (in.left + in.center*0.5) + 0.5 * in.rear_left
		AccumulateToBuffer( nOutput + 0, nInput + 0, flHalf );
		AccumulateToBuffer( nOutput + 0, nInput + 2, flQuarter );
		AccumulateToBuffer( nOutput + 0, nInput + 4, flHalf );

		// out.right += 0.5 * (in.right + in.center*0.5) + 0.5 * in.rear_right
		AccumulateToBuffer( nOutput + 1, nInput + 1, flHalf );
		AccumulateToBuffer( nOutput + 1, nInput + 2, flQuarter );
		AccumulateToBuffer( nOutput + 1, nInput + 5, flHalf );
	}
	else if ( nOutputChannels == 6 )
	{
		// expand 2 ch to 6 ch
		Assert( nInputChannels == 2 );

		// front pair takes the input unchanged
		AccumulateToBuffer( nOutput + 0, nInput + 0, flInputVolume );
		AccumulateToBuffer( nOutput + 1, nInput + 1, flInputVolume );

		// center gets half of each side
		AccumulateToBuffer( nOutput + 2, nInput + 0, flHalf );
		AccumulateToBuffer( nOutput + 2, nInput + 1, flHalf );

		// rear pair mirrors the front pair (output channel 3 is not written)
		AccumulateToBuffer( nOutput + 4, nInput + 0, flInputVolume );
		AccumulateToBuffer( nOutput + 5, nInput + 1, flInputVolume );
	}
	else
	{
		// some other case we haven't implemented
		Assert(0);
	}
}
// Shifts each 16-bit lane of inputValue right with _mm_srl_epi16, using shiftBitCount as the
// shift-count operand.
FORCEINLINE shortx8 ShiftRightShortSIMD( const shortx8 &inputValue, const shortx8 &shiftBitCount )
{
	const shortx8 shifted = _mm_srl_epi16( inputValue, shiftBitCount );
	return shifted;
}
// Widens the low four signed 16-bit lanes of a to 32 bits. Each lane is interleaved with a
// mask that is all ones where the lane is negative, which yields the sign-extended value.
FORCEINLINE shortx8 SignedExtractLowAsInt32( const shortx8 &a )
{
	const shortx8 zero = _mm_setzero_si128();
	const shortx8 negativeMask = _mm_cmplt_epi16( a, zero );
	return _mm_unpacklo_epi16( a, negativeMask );
}
// Widens the high four signed 16-bit lanes of a to 32 bits. Each lane is interleaved with a
// mask that is all ones where the lane is negative, which yields the sign-extended value.
FORCEINLINE shortx8 SignedExtractHighAsInt32( const shortx8 &a )
{
	const shortx8 zero = _mm_setzero_si128();
	const shortx8 negativeMask = _mm_cmplt_epi16( a, zero );
	return _mm_unpackhi_epi16( a, negativeMask );
}
// Converts four packed floats to four packed 32-bit integers with _mm_cvtps_epi32.
FORCEINLINE shortx8 RoundtFloatToInt32( const fltx4 &input )
{
	const shortx8 converted = _mm_cvtps_epi32( input );
	return converted;
}
// Packs two registers of four 32-bit integers into one register of eight 16-bit integers with
// _mm_packs_epi32 (signed saturation); input0 fills the low half, input1 the high half.
FORCEINLINE shortx8 PackInt32x2ToShortx8( const shortx8 &input0, const shortx8 &input1 )
{
	const shortx8 packed = _mm_packs_epi32( input0, input1 );
	return packed;
}
// Load 128 bits (four 32-bit words / eight 16-bit values) into a SIMD register using the
// aligned load intrinsic _mm_load_si128.
FORCEINLINE shortx8 LoadAlignedShortx8SIMD( const void * RESTRICT pSIMD )
{
	const __m128i *pSource = reinterpret_cast<const __m128i *>( pSIMD );
	return _mm_load_si128( pSource );
}
// Load 128 bits (four 32-bit words / eight 16-bit values) into a SIMD register using the
// unaligned load intrinsic _mm_loadu_si128.
FORCEINLINE shortx8 LoadUnalignedShortx8SIMD( const void * RESTRICT pSIMD )
{
	const __m128i *pSource = reinterpret_cast<const __m128i *>( pSIMD );
	return _mm_loadu_si128( pSource );
}
// Create a stereo interleaved signed-16 buffer from two float-32 buffers.
// This variant writes the output with unaligned stores, so pOut may have any alignment.
// The inputs are still read with aligned SIMD loads when the SIMD path is taken.
//   pOut           - receives nSampleCount L/R pairs (2 * nSampleCount shorts)
//   pflInputLeft   - left channel samples
//   pflInputRight  - right channel samples
//   nSampleCount   - number of samples per channel
// When nSampleCount >= 8 the bulk is converted four samples at a time with SIMD and any
// remainder (nSampleCount & 3) is converted by the scalar loop; shorter inputs are handled
// entirely by the scalar loop.
// NOTE: the SIMD path converts with _mm_cvtps_epi32 while the scalar path uses an (int) cast,
// which truncates toward zero, so the two paths may differ by one LSB for fractional inputs.
void ConvertFloat32Int16_Clamp_Interleave2_Unaligned( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
{
	if ( nSampleCount >= 8 )
	{
		int nSampleQuads = nSampleCount >> 2;
		// truncate sample count to remainder after 4-bundles
		nSampleCount &= 3;
		short *pWrite = pOut;
		for ( int i = 0; i < nSampleQuads; i++ )
		{
			// load 4 samples from left and four from right
			fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
			pflInputLeft += 4;
			fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
			pflInputRight += 4;
			shortx8 nLeft = RoundtFloatToInt32( leftSamples );
			shortx8 nRight = RoundtFloatToInt32( rightSamples );
			// interleave into L/R pairs
			shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
			shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
			// pack (saturates to the signed 16-bit range)
			shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
			StoreUnalignedSIMD( pWrite, nOut );
			pWrite += 8;
		}
		// BUGFIX: the scalar tail below writes through pOut, so it has to continue where the
		// SIMD loop stopped. Previously pOut still pointed at the start of the buffer and the
		// tail samples overwrote the first output frames.
		pOut = pWrite;
	}
	// now convert and clamp any remaining samples (not in SIMD 4-bundles)
	// the input pointers were advanced by the SIMD loop, so index from zero here
	for ( int i = 0; i < nSampleCount; i++ )
	{
		int l = (int)pflInputLeft[i];
		if ( l < -32768 ) l = -32768;
		if ( l > 32767 ) l = 32767;
		int r = (int)pflInputRight[i];
		if ( r < -32768 ) r = -32768;
		if ( r > 32767 ) r = 32767;
		pOut[0] = l;
		pOut[1] = r;
		pOut += 2;
	}
}
// Create a stereo interleaved signed-16 buffer from two float-32 buffers.
//   pOut           - receives nSampleCount L/R pairs (2 * nSampleCount shorts)
//   pflInputLeft   - left channel samples
//   pflInputRight  - right channel samples
//   nSampleCount   - number of samples per channel
// If pOut is not 16-byte aligned the work is handed to the _Unaligned variant. Otherwise,
// when nSampleCount >= 8, the bulk is converted four samples at a time with aligned SIMD
// loads and stores and any remainder (nSampleCount & 3) is converted by the scalar loop.
// NOTE: the SIMD path converts with _mm_cvtps_epi32 while the scalar path uses an (int) cast,
// which truncates toward zero, so the two paths may differ by one LSB for fractional inputs.
void ConvertFloat32Int16_Clamp_Interleave2( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
{
	if ( !IsAligned16Bytes(pOut) )
	{
		ConvertFloat32Int16_Clamp_Interleave2_Unaligned( pOut, pflInputLeft, pflInputRight, nSampleCount );
		return;
	}
	if ( nSampleCount >= 8 )
	{
		int nSampleQuads = nSampleCount >> 2;
		// truncate sample count to remainder after 4-bundles
		nSampleCount &= 3;
		short *pWrite = pOut;
		for ( int i = 0; i < nSampleQuads; i++ )
		{
			// load 4 samples from left and four from right
			fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
			pflInputLeft += 4;
			fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
			pflInputRight += 4;
			shortx8 nLeft = RoundtFloatToInt32( leftSamples );
			shortx8 nRight = RoundtFloatToInt32( rightSamples );
			// interleave into L/R pairs
			shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
			shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
			// pack (saturates to the signed 16-bit range)
			shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
			StoreAlignedSIMD( pWrite, nOut );
			pWrite += 8;
		}
		// BUGFIX: the scalar tail below writes through pOut, so it has to continue where the
		// SIMD loop stopped. Previously pOut still pointed at the start of the buffer and the
		// tail samples overwrote the first output frames.
		pOut = pWrite;
	}
	// now convert and clamp any remaining samples (not in SIMD 4-bundles)
	// the input pointers were advanced by the SIMD loop, so index from zero here
	for ( int i = 0; i < nSampleCount; i++ )
	{
		int l = (int)pflInputLeft[i];
		if ( l < -32768 ) l = -32768;
		if ( l > 32767 ) l = 32767;
		int r = (int)pflInputRight[i];
		if ( r < -32768 ) r = -32768;
		if ( r > 32767 ) r = 32767;
		pOut[0] = l;
		pOut[1] = r;
		pOut += 2;
	}
}
// Faster SIMD version for 6-in, 6-out
// Converts six planar float channels into one interleaved signed-16 stream.
//   pOut                  - receives 6 shorts per sample (written with unaligned stores)
//   nOutputChannelCount   - only checked by the Assert; must be 6
//   nChannelStrideFloats  - distance in floats from the start of one channel to the next
//   pflChannel0           - first channel; the Assert requires it to be 16-byte aligned
//   nInputChannelCount    - only checked by the Assert; must be 6
//   nSampleCount          - samples per channel; the loop steps by 4 and has no scalar tail,
//                           so this is expected to be a multiple of 4
// The channels are read with aligned loads at (channel base + i), so every channel base must
// satisfy the alignment requirement of LoadAlignedSIMD, not just channel 0.
void ConvertFloat32Int16_Clamp_Interleave6( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
{
Assert( nOutputChannelCount == 6 && nInputChannelCount == 6 && IsAligned16Bytes( pflChannel0 ) );
// one read pointer per channel
const float *pInput0 = pflChannel0;
const float *pInput1 = pflChannel0 + nChannelStrideFloats;
const float *pInput2 = pflChannel0 + 2*nChannelStrideFloats;
const float *pInput3 = pflChannel0 + 3*nChannelStrideFloats;
const float *pInput4 = pflChannel0 + 4*nChannelStrideFloats;
const float *pInput5 = pflChannel0 + 5*nChannelStrideFloats;
short *pWrite = pOut;
// each pass handles 4 samples from each of the 6 channels: grab 6 bundles of 4 floats,
// write out 24 shorts as 3 bundles of 8
for ( int i = 0; i < nSampleCount; i += 4 )
{
// grab 6 bundles of 4 samples
// the numbers in the trailing comments are the position (0..23) each lane must end up at
// in the 24-short interleaved output of this pass
fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 6 12 18
fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 7 13 19
fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 8 14 20
fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 9 15 21
fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 10 16 22
fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 11 17 23
// interleave into pairs
// each shuffle takes two lanes from the first operand followed by two lanes from the second
fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 6 1 7
fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 12 18 13 19
fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 8 3 9
fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 14 20 15 21
fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 10 5 11
fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 22 17 23
// now put in final order
fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair0, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 4 5 6 7
fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair2, fl4Pair4, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 12 13 14 15
fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair5, fl4Pair1, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 16 17 18 19
fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair3, fl4Pair5, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 20 21 22 23
// pack into 3 bundles of 8
// float -> int32 conversion followed by a saturating pack to int16 provides the clamp
shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
// NOTE: Optimize alignment?
StoreUnalignedSIMD( pWrite, nOut0 );
StoreUnalignedSIMD( pWrite + 8, nOut1 );
StoreUnalignedSIMD( pWrite + 16, nOut2 );
pWrite += 24;
}
}
// Faster SIMD version for 8-in, 8-out
// Converts eight planar float channels into one interleaved signed-16 stream.
//   pOut                  - receives 8 shorts per sample (written with unaligned stores)
//   nOutputChannelCount   - only checked by the Assert; must be 8
//   nChannelStrideFloats  - distance in floats from the start of one channel to the next
//   pflChannel0           - first channel; the Assert requires it to be 16-byte aligned
//   nInputChannelCount    - only checked by the Assert; must be 8
//   nSampleCount          - samples per channel; the loop steps by 4 and has no scalar tail,
//                           so this is expected to be a multiple of 4
// The channels are read with aligned loads at (channel base + i), so every channel base must
// satisfy the alignment requirement of LoadAlignedSIMD, not just channel 0.
void ConvertFloat32Int16_Clamp_Interleave8( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
{
Assert( nOutputChannelCount == 8 && nInputChannelCount == 8 && IsAligned16Bytes( pflChannel0 ) );
// one read pointer per channel
const float *pInput0 = pflChannel0;
const float *pInput1 = pflChannel0 + nChannelStrideFloats;
const float *pInput2 = pflChannel0 + 2 * nChannelStrideFloats;
const float *pInput3 = pflChannel0 + 3 * nChannelStrideFloats;
const float *pInput4 = pflChannel0 + 4 * nChannelStrideFloats;
const float *pInput5 = pflChannel0 + 5 * nChannelStrideFloats;
const float *pInput6 = pflChannel0 + 6 * nChannelStrideFloats;
const float *pInput7 = pflChannel0 + 7 * nChannelStrideFloats;
short *pWrite = pOut;
// each pass handles 4 samples from each of the 8 channels: grab 8 bundles of 4 floats,
// write out 32 shorts as 4 bundles of 8
for ( int i = 0; i < nSampleCount; i += 4 )
{
// grab 8 bundles of 4 samples
// the numbers in the trailing comments are the position (0..31) each lane must end up at
// in the 32-short interleaved output of this pass
fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 8 16 24
fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 9 17 25
fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 10 18 26
fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 11 19 27
fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 12 20 28
fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 13 21 29
fltx4 fl4Samples6 = LoadAlignedSIMD( pInput6 + i ); // 6 14 22 30
fltx4 fl4Samples7 = LoadAlignedSIMD( pInput7 + i ); // 7 15 23 31
// interleave into pairs
// each shuffle takes two lanes from the first operand followed by two lanes from the second
fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 8 1 9
fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 24 17 25
fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 10 3 11
fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 18 26 19 27
fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 12 5 13
fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 20 28 21 29
fltx4 fl4Pair6 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 6 14 7 15
fltx4 fl4Pair7 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 22 30 23 31
// now put in final order
fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 4 5 6 7
fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 12 13 14 15
fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 16 17 18 19
fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 20 21 22 23
fltx4 fl4Out6 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 24 25 26 27
fltx4 fl4Out7 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 28 29 30 31
// pack into 4 bundles of 8
// float -> int32 conversion followed by a saturating pack to int16 provides the clamp
shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
shortx8 nOut3 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out6 ), RoundtFloatToInt32( fl4Out7 ) );
// NOTE: Optimize alignment?
StoreUnalignedSIMD( pWrite, nOut0 );
StoreUnalignedSIMD( pWrite + 8, nOut1 );
StoreUnalignedSIMD( pWrite + 16, nOut2 );
StoreUnalignedSIMD( pWrite + 24, nOut3 );
pWrite += 32;
}
}
// slow version to support 4/6/8 channel devices
void ConvertFloat32Int16_Clamp_InterleaveStride( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
{
// detect optimizable cases and call fast code
if ( nInputChannelCount == 6 && nOutputChannelCount == 6 && IsAlign4( nSampleCount ) )
{
ConvertFloat32Int16_Clamp_Interleave6( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
return;
}
if ( nInputChannelCount == 8 && nOutputChannelCount == 8 && IsAlign4( nSampleCount ) )
{
ConvertFloat32Int16_Clamp_Interleave8( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
return;
}
// run the slower code in this case
if ( nOutputChannelCount > nInputChannelCount )
{
for ( int i = 0; i < nSampleCount; i++ )
{
float *pIn = pflChannel0 + i;
for ( int j = 0; j < nInputChannelCount; j++ )
{
int nOut = int( pIn[0] );
nOut = clamp( nOut, -32768, 32767 );
*pOut++ = nOut;
pIn += nChannelStrideFloats;
}
for ( int j = nInputChannelCount; j < nOutputChannelCount; j++ )
{
*pOut++ = 0;
}
}
}
else
{
int nCopyChannels = MIN(nOutputChannelCount, nInputChannelCount);
for ( int i = 0; i < nSampleCount; i++ )
{
float *pIn = pflChannel0 + i;
for ( int j = 0; j < nCopyChannels; j++ )
{
int nOut = int( pIn[0] );
nOut = clamp( nOut, -32768, 32767 );
*pOut++ = nOut;
pIn += nChannelStrideFloats;
}
}
}
Assert( nOutputChannelCount >= nInputChannelCount );
}
// Converts MIX_BUFFER_SIZE signed 16-bit samples from pIn into floats in flOutput, eight
// samples per pass. The input is read with unaligned loads; the output is written with
// aligned stores.
static void ConvertShortToFloatx8( float flOutput[MIX_BUFFER_SIZE], const short *pIn )
{
	const shortx8 *pRead = reinterpret_cast<const shortx8 *>(pIn);
	fltx4 *pWrite = reinterpret_cast<fltx4 *>(&flOutput[0]);
	for ( int nBlock = 0; nBlock < (MIX_BUFFER_SIZE/8); nBlock++, pRead++, pWrite += 2 )
	{
		shortx8 n8Samples = LoadUnalignedShortSIMD( pRead );
		// widen each half to int32, then convert to float
		fltx4 fl4Low = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( n8Samples ) );
		fltx4 fl4High = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( n8Samples ) );
		StoreAlignedSIMD( (float *)pWrite, fl4Low );
		StoreAlignedSIMD( (float *)( pWrite + 1 ), fl4High );
	}
}
// Resampling positions are tracked in fixed point with FIX_BITS (15) fractional bits;
// FIX_MASK selects the fractional part.
#define FIX_BITS 15
#define FIX_MASK ((1ul<<FIX_BITS)-1)
// Converts a float to fixed point by scaling it by 2^FIX_BITS and truncating to int.
FORCEINLINE int FLOAT_TO_FIXED( float flVal )
{
	const float flFixedOne = float( 1ul << FIX_BITS );
	return int( flVal * flFixedOne );
}
// UNDONE: This can be trivially optimized to not loop
static int CalcAdvanceSamples( int nOutCount, float sampleRatio, uint *pInputOffsetFrac )
{
uint nRateScaleFix = FLOAT_TO_FIXED( sampleRatio );
uint nSampleFrac = *pInputOffsetFrac;
uint nSampleIndex = 0;
for ( int i = 0; i < nOutCount; i++ )
{
nSampleFrac += nRateScaleFix;
nSampleIndex += nSampleFrac >> FIX_BITS;
nSampleFrac = nSampleFrac & FIX_MASK;
}
*pInputOffsetFrac = nSampleFrac;
return nSampleIndex;
}
// Resamples 16-bit audio at the given ratio using linear interpolation, producing
// MIX_BUFFER_SIZE float samples in pOut.
//   pWaveData         - source samples; index nReadIndex + 1 is read on every step, so one
//                       sample past the last consumed position must be readable
//   sampleRatio       - input samples per output sample
//   pInputOffsetFrac  - in/out fractional input position (FIX_BITS fixed point)
// Returns the number of whole input samples advanced.
static uint Resample16to32( float *pOut, const short *pWaveData, float sampleRatio, uint *pInputOffsetFrac )
{
	const uint nStepFix = FLOAT_TO_FIXED( sampleRatio );
	uint nFrac = *pInputOffsetFrac;
	Assert( nFrac < ( 1ul << FIX_BITS ) );
	uint nReadIndex = 0;
	for ( int nOut = 0; nOut < MIX_BUFFER_SIZE; nOut++ )
	{
		// the two neighbours being blended
		const int nFirst = (int)( pWaveData[nReadIndex] );
		const int nSecond = (int)( pWaveData[nReadIndex + 1] );
#if 0
		// this expression doesn't truncate the value to 16-bits and preserves fractional samples in the float
		// output. It is a bit slower and the improved precision won't be audible unless the sample is amplified
		// or processed in some way because the output stage will simply round these back to 16-bit values
		// so disable this until we find a reason that we need it
		const int nInterp = ( nFirst << FIX_BITS ) + ( ( ( nSecond - nFirst ) * int( nFrac ) ) );
		pOut[nOut] = float( nInterp ) * ( 1.0f / float( 1ul << FIX_BITS ) );
#else
		const int nInterp = nFirst + ( ( ( nSecond - nFirst ) * int( nFrac ) ) >> FIX_BITS );
		pOut[nOut] = float( nInterp );
#endif
		// step the fixed-point read position; carry whole samples into the index
		nFrac += nStepFix;
		nReadIndex += nFrac >> FIX_BITS;
		nFrac &= FIX_MASK;
	}
	*pInputOffsetFrac = nFrac;
	return nReadIndex;
}
// Per-lane blend weights for 2x upsampling. For an output quad built from samples
// (a, a, b, b) and their successors (b, b, c, c), lo * first + hi * second yields
// a, (a+b)/2, b, (b+c)/2.
const fltx4 g_fl4LinerInterp2x_lo={1.0,0.5,1.0,0.5};
const fltx4 g_fl4LinerInterp2x_hi={0.0,0.5,0.0,0.5};
// Upsamples 16-bit audio by exactly 2x with linear interpolation, producing MIX_BUFFER_SIZE
// floats in flOutput: every input sample is followed by the midpoint between it and the next.
//   flOutput          - written with aligned SIMD stores
//   pWaveData         - read 8 samples at a time with unaligned loads; each pass also loads
//                       the following 8 samples to get the neighbour of the last one, so
//                       MIX_BUFFER_SIZE/2 + 8 shorts must be readable
//   pInputOffsetFrac  - not used by this function
// Returns MIX_BUFFER_SIZE / 2 (presumably the number of input samples consumed - confirm
// against the caller).
static uint Resample16to32_2x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
{
fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
// lane-select masks used to merge two splatted samples into one register
// NOTE(review): assumes g_SIMD_SkipTailMask[2] selects the first two lanes, as the variable
// names suggest - the table is defined outside this file
fltx4 flAllOne = LoadAlignedSIMD( (float *)g_SIMD_AllOnesMask );
fltx4 fl4FirstTwo = LoadAlignedSIMD( (float *)&g_SIMD_SkipTailMask[2] );
fltx4 fl4LastTwo = AndNotSIMD( fl4FirstTwo, flAllOne );
// 8 input samples in, 16 output samples out per pass
for ( int i = 0; i < (MIX_BUFFER_SIZE/16); i++ )
{
shortx8 samples = LoadUnalignedShortSIMD( pInput );
pInput++;
// input samples 0-3 and 4-7 as floats
fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
// peek at the next block: its first sample is the right-hand neighbour of sample 7
shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
// LAME: Only need one value for this but I can't be bothered to unroll this yet
fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
// sampNNMM holds sample N in the first two lanes and sample M in the last two
fltx4 samp0 = SplatXSIMD( lo );
fltx4 samp1 = SplatYSIMD( lo );
fltx4 samp0011 = OrSIMD( AndSIMD( fl4FirstTwo, samp0 ), AndSIMD( fl4LastTwo, samp1 ) );
fltx4 samp2 = SplatZSIMD( lo );
fltx4 samp1122 = OrSIMD( AndSIMD( fl4FirstTwo, samp1 ), AndSIMD( fl4LastTwo, samp2 ) );
// trailing numbers count the output samples written so far in this pass
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp0011, MulSIMD( g_fl4LinerInterp2x_hi, samp1122 ) ) ); // 4
pOutput++;
fltx4 samp3 = SplatWSIMD( lo );
fltx4 samp2233 = OrSIMD( AndSIMD( fl4FirstTwo, samp2 ), AndSIMD( fl4LastTwo, samp3 ) );
fltx4 samp4 = SplatXSIMD( hi );
fltx4 samp3344 = OrSIMD( AndSIMD( fl4FirstTwo, samp3 ), AndSIMD( fl4LastTwo, samp4 ) );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp2233, MulSIMD( g_fl4LinerInterp2x_hi, samp3344 ) ) ); // 8
pOutput++;
fltx4 samp5 = SplatYSIMD( hi );
fltx4 samp4455 = OrSIMD( AndSIMD( fl4FirstTwo, samp4 ), AndSIMD( fl4LastTwo, samp5 ) );
fltx4 samp6 = SplatZSIMD( hi );
fltx4 samp5566 = OrSIMD( AndSIMD( fl4FirstTwo, samp5 ), AndSIMD( fl4LastTwo, samp6 ) );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp4455, MulSIMD( g_fl4LinerInterp2x_hi, samp5566 ) ) ); // 12
pOutput++;
fltx4 samp7 = SplatWSIMD( hi );
fltx4 samp6677 = OrSIMD( AndSIMD( fl4FirstTwo, samp6 ), AndSIMD( fl4LastTwo, samp7 ) );
fltx4 samp8 = SplatXSIMD( hi4 );
fltx4 samp7788 = OrSIMD( AndSIMD( fl4FirstTwo, samp7 ), AndSIMD( fl4LastTwo, samp8 ) );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp6677, MulSIMD( g_fl4LinerInterp2x_hi, samp7788 ) ) ); // 16
pOutput++;
}
return MIX_BUFFER_SIZE / 2;
}
// Per-lane blend weights for 4x upsampling: for neighbouring samples a and b,
// lo * a + hi * b yields a, 0.75a+0.25b, 0.5a+0.5b, 0.25a+0.75b.
const fltx4 g_fl4LinerInterp4x_lo={1.0,0.75,0.5,0.25};
const fltx4 g_fl4LinerInterp4x_hi={0.0,0.25,0.5,0.75};
// Upsamples 16-bit audio by exactly 4x with linear interpolation, producing MIX_BUFFER_SIZE
// floats in flOutput: every input sample expands to four outputs blended toward the next.
//   flOutput          - written with aligned SIMD stores
//   pWaveData         - read 8 samples at a time with unaligned loads; each pass also loads
//                       the following 8 samples to get the neighbour of the last one, so
//                       MIX_BUFFER_SIZE/4 + 8 shorts must be readable
//   pInputOffsetFrac  - not used by this function
// Returns MIX_BUFFER_SIZE / 4 (presumably the number of input samples consumed - confirm
// against the caller).
static uint Resample16to32_4x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
{
fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
// 8 input samples in, 32 output samples out per pass
for ( int i = 0; i < (MIX_BUFFER_SIZE/32); i++ )
{
shortx8 samples = LoadUnalignedShortSIMD( pInput );
pInput++;
// input samples 0-3 and 4-7 as floats
fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
// peek at the next block: its first sample is the right-hand neighbour of sample 7
shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
// LAME: Only need one value for this but I can't be bothered to unroll this yet
fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
// sampN is input sample N broadcast to all four lanes; each store blends sampN with sampN+1
// trailing numbers count the output samples written so far in this pass
fltx4 samp0 = SplatXSIMD( lo );
fltx4 samp1 = SplatYSIMD( lo );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp0, MulSIMD( g_fl4LinerInterp4x_hi, samp1 ) ) ); // 4
pOutput++;
fltx4 samp2 = SplatZSIMD( lo );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp1, MulSIMD( g_fl4LinerInterp4x_hi, samp2 ) ) ); // 8
pOutput++;
fltx4 samp3 = SplatWSIMD( lo );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp2, MulSIMD( g_fl4LinerInterp4x_hi, samp3 ) ) ); // 12
pOutput++;
fltx4 samp4 = SplatXSIMD( hi );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp3, MulSIMD( g_fl4LinerInterp4x_hi, samp4 ) ) ); // 16
pOutput++;
fltx4 samp5 = SplatYSIMD( hi );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp4, MulSIMD( g_fl4LinerInterp4x_hi, samp5 ) ) ); // 20
pOutput++;
fltx4 samp6 = SplatZSIMD( hi );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp5, MulSIMD( g_fl4LinerInterp4x_hi, samp6 ) ) ); // 24
pOutput++;
fltx4 samp7 = SplatWSIMD( hi );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp6, MulSIMD( g_fl4LinerInterp4x_hi, samp7 ) ) ); // 28
pOutput++;
fltx4 samp8 = SplatXSIMD( hi4 );
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp7, MulSIMD( g_fl4LinerInterp4x_hi, samp8 ) ) ); // 32
pOutput++;
}
return MIX_BUFFER_SIZE / 4;
}
// Converts MIX_BUFFER_SIZE 32-bit integer samples from pIn into floats in flOutput, four at
// a time. Both buffers are accessed with aligned SIMD loads/stores.
static void Convert32ToFloatx4( float flOutput[MIX_BUFFER_SIZE], int *pIn )
{
	const shortx8 *pRead = reinterpret_cast<const shortx8 *>(pIn);
	fltx4 *pWrite = reinterpret_cast<fltx4 *>(&flOutput[0]);
	for ( int nQuad = 0; nQuad < (MIX_BUFFER_SIZE/4); nQuad++, pRead++, pWrite++ )
	{
		shortx8 n4Ints = LoadAlignedShortx8SIMD( pRead );
		fltx4 fl4Converted = SignedIntConvertToFltSIMD( n4Ints );
		StoreAlignedSIMD( (float *)pWrite, fl4Converted );
	}
}
// Writes zero to the first nCount shorts of pBuffer. A zero or negative nCount writes nothing.
inline void ZeroFill( short *pBuffer, int nCount )
{
	for ( int nIndex = 0; nIndex < nCount; ++nIndex )
	{
		pBuffer[nIndex] = 0;
	}
}
// Join buffer list into a contiguous sample list
const short *GetContiguousSamples_8Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
{
Assert( nSamplesNeeded < nTempSampleCount );
int nSampleIndex = pState->m_nBufferSampleOffset;
uint nPacketIndex = pState->m_nPacketIndex;
int nOutIndex = 0;
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
{
const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + nSampleIndex;
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
Assert( nSamplesAvailable > 0 );
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
for ( int i = 0; i < nCopy; i++ )
{
// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
uint32 nSample = (uint8)((int32) pSourceData[i]);
pTemp[nOutIndex+i] = (nSample<<8) | nSample;
}
nSamplesNeeded -= nCopy;
nOutIndex += nCopy;
Assert(nSamplesNeeded >= 0);
if ( nSamplesNeeded <= 0 )
break;
nSampleIndex = 0;
}
if ( nSamplesNeeded )
{
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
}
return pTemp;
}
const short *GetContiguousSamples_8Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
{
Assert( nSamplesNeeded < nTempSampleCount );
uint nSampleIndex = pState->m_nBufferSampleOffset;
uint nPacketIndex = pState->m_nPacketIndex;
int nOutIndex = 0;
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
{
const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + (nSampleIndex<<1) + nChannel;
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
Assert( nSamplesAvailable > 0 );
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
for ( int i = 0; i < nCopy; i++ )
{
// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
uint32 nSample = (uint8)( (int32)pSourceData[i << 1] );
pTemp[nOutIndex+i] = (nSample<<8) | nSample;
}
nSamplesNeeded -= nCopy;
nOutIndex += nCopy;
Assert(nSamplesNeeded >= 0);
if ( nSamplesNeeded <= 0 )
break;
nSampleIndex = 0;
}
if ( nSamplesNeeded )
{
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
}
return pTemp;
}
const short *GetContiguousSamples_16Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
{
Assert( nSamplesNeeded <= nTempSampleCount );
uint nSampleIndex = pState->m_nBufferSampleOffset;
uint nPacketIndex = pState->m_nPacketIndex;
if ( nPacketIndex < source.m_nPacketCount )
{
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
// optimization: if the entire request can be satisfied by the current packet, just point to that (don't copy)
if ( nSamplesAvailable >= nSamplesNeeded )
{
Assert( source.m_pPackets[nPacketIndex].m_pSamples != NULL );
return source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
}
int nOutIndex = 0;
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
{
const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
Assert( nSamplesAvailable > 0 );
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
V_memcpy( &pTemp[nOutIndex], pSourceData, nCopy * sizeof(short) );
nSamplesNeeded -= nCopy;
nOutIndex += nCopy;
Assert(nSamplesNeeded >= 0);
if ( nSamplesNeeded <= 0 )
break;
nSampleIndex = 0;
}
if ( nSamplesNeeded )
{
// pad with zeros
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
}
return pTemp;
}
return NULL;
}
const short *GetContiguousSamples_16Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
{
Assert( nSamplesNeeded < nTempSampleCount );
uint nSampleIndex = pState->m_nBufferSampleOffset;
uint nPacketIndex = pState->m_nPacketIndex;
int nOutIndex = 0;
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
{
const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + (nSampleIndex<<1) + nChannel;
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
Assert( nSamplesAvailable > 0 );
int nCopy = MIN(nSamplesAvailable, nSamplesNeeded);
for ( int i = 0; i < nCopy; i++ )
{
// copy every other sample to drop one channel. Note that pSourceData is already offset to the appropriate channel
pTemp[nOutIndex + i] = pSourceData[ i<<1 ];
}
nSamplesNeeded -= nCopy;
nOutIndex += nCopy;
Assert(nSamplesNeeded >= 0);
if ( nSamplesNeeded <= 0 )
break;
nSampleIndex = 0;
}
if ( nSamplesNeeded )
{
// pad with zeros
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
}
return pTemp;
}
// Has this source finished playing its sample data?
// True once the state's packet index has moved past the last packet of the source.
bool IsFinished( const audio_source_input_t &source, const audio_source_indexstate_t *pCurrentState )
{
	const bool bPastLastPacket = ( pCurrentState->m_nPacketIndex >= source.m_nPacketCount );
	return bPastLastPacket;
}
// Move the source offset by some number of samples
// If necessary also advance the packet index
// Returns 0 when the new position lands inside a packet; otherwise returns the number of
// samples that could not be consumed because the source ran out of packets.
uint AdvanceSourceIndex( audio_source_indexstate_t *pOut, const audio_source_input_t &source, uint nAdvance )
{
	while ( pOut->m_nPacketIndex < source.m_nPacketCount )
	{
		// measure the target position from the start of the current packet
		nAdvance += pOut->m_nBufferSampleOffset;
		if ( nAdvance < source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount )
		{
			// We can skip entirely within this packet by adjusting the offset, so return
			pOut->m_nBufferSampleOffset = nAdvance;
			return 0;
		}
		// consume the rest of this packet and continue from the start of the next one
		nAdvance -= source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount;
		pOut->m_nBufferSampleOffset = 0;
		pOut->m_nPacketIndex++;
	}
	return nAdvance;
}
// Converts the next stretch of a source into float samples in flOutput and advances the
// source's index state (pOut) past the samples that were used.
//
// Steps:
//   1. work out how many source samples are needed, given the source rate scaled by flPitch
//   2. join that many samples into one contiguous 16-bit mono run (extracting a single
//      channel for the stereo formats)
//   3. resample to float, or convert directly when the rate already matches
//      MIX_DEFAULT_SAMPLING_RATE
//
// Returns 1 on success.  Returns 0, leaving flOutput and pOut untouched, when no joined data
// was produced (the sample format matched none of the cases below, or the helper returned NULL).
int ConvertSourceToFloat( const audio_source_input_t &source, float flPitch, float flOutput[MIX_BUFFER_SIZE], audio_source_indexstate_t *pOut )
{
	VPROF("ConvertSourceToFloat");

	// Scratch space for the joined samples.  The ratio is clamped to 2.0 below and the request
	// is rounded up to a multiple of 8, hence twice the mix buffer plus 8.
	short nJoinedData[MIX_BUFFER_SIZE*2 + 8];

	const float flSampleRate = float(source.m_nSamplingRate) * flPitch;
	const bool bResample = ( flSampleRate != MIX_DEFAULT_SAMPLING_RATE );

	float flSampleRatio = 1.0f;
	int nSamplesNeeded = MIX_BUFFER_SIZE;
	if ( bResample )
	{
		flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
		flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);
		// add 2 for rounding, interpolate to next neighbor
		nSamplesNeeded = int( (MIX_BUFFER_SIZE * flSampleRatio) + 0.5f ) + 2;
		// some of the resampling code processes in blocks of 8 samples with SSE2 instructions, so align to nearest 8
		nSamplesNeeded = AlignValue( nSamplesNeeded, 8 );
#if _DEBUG
		uint64 nSampleRefCount = ( ( ( MIX_BUFFER_SIZE * FLOAT_TO_FIXED( flSampleRatio ) ) + pOut->m_nSampleFracOffset ) >> FIX_BITS ) + 1;
		Assert( nSampleRefCount <= nSamplesNeeded );
#endif
	}

	// Join the packets into a single run of 16-bit mono samples of the required length
	const short *pSourceData = NULL;
	if ( source.m_nSampleFormat == SAMPLE_INT8_MONO )
	{
		pSourceData = GetContiguousSamples_8Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_MONO )
	{
		pSourceData = GetContiguousSamples_16Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_STEREO_L )
	{
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT16_STEREO_R )
	{
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT8_STEREO_L )
	{
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
	}
	else if ( source.m_nSampleFormat == SAMPLE_INT8_STEREO_R )
	{
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
	}

	// nothing to convert
	if ( !pSourceData )
		return 0;

	// Number of source samples to step over afterwards: a full mix buffer for the direct
	// conversion, otherwise whatever the resampler returns.
	int nSamplesConsumed = MIX_BUFFER_SIZE;
	if ( !bResample )
	{
		ConvertShortToFloatx8( flOutput, pSourceData );
	}
	else if ( flSampleRate == 11025.0f )
	{
		nSamplesConsumed = Resample16to32_4x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
	}
	else if ( flSampleRate == 22050.0f )
	{
		nSamplesConsumed = Resample16to32_2x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
	}
	else
	{
		// slow path, resample arbitrary ratio
		VPROF("Resample_Ratio");
		nSamplesConsumed = Resample16to32( flOutput, pSourceData, flSampleRatio, &pOut->m_nSampleFracOffset );
	}

	// update the index state
	AdvanceSourceIndex( pOut, source, nSamplesConsumed );
	return 1;
}
// Moves a source's index state forward by one mix buffer's worth of playback without
// producing any output samples.
// When the source rate (scaled by flPitch) differs from MIX_DEFAULT_SAMPLING_RATE, the sample
// count comes from CalcAdvanceSamples, using the same ratio clamp as ConvertSourceToFloat.
// Returns the number of source samples the state was advanced by.
int AdvanceSource( const audio_source_input_t &source, float flPitch, audio_source_indexstate_t *pOut )
{
	const float flSampleRate = float(source.m_nSamplingRate) * flPitch;
	int nSamplesToSkip = MIX_BUFFER_SIZE;
	if ( flSampleRate != MIX_DEFAULT_SAMPLING_RATE )
	{
		float flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
		flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);
		nSamplesToSkip = CalcAdvanceSamples( nSamplesToSkip, flSampleRatio, &pOut->m_nSampleFracOffset );
	}

	// update the index state
	AdvanceSourceIndex( pOut, source, nSamplesToSkip );
	return nSamplesToSkip;
}
// Constants for linear volume ramping (used by ScaleBufferRamp and MixBufferRamp).
// Reciprocal of the mix buffer length; multiplying a total volume change by this gives the per-sample step.
const float flMixBufferSizeInv = 1.0f / MIX_BUFFER_SIZE;
// The same reciprocal replicated into all four SIMD lanes.
const fltx4 g_fl4_MixBufferSizeInv = { flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv };
// Per-lane step multipliers (1,2,3,4) used to offset the starting volume of the first block of four samples.
const fltx4 g_fl4_Sequence1234 = { 1.0, 2.0, 3.0, 4.0 };
// Writes input * scale into flOutput, four samples at a time.
// Both buffers are accessed with aligned SIMD loads / stores.
void ScaleBuffer( float flOutput[MIX_BUFFER_SIZE], const float input[MIX_BUFFER_SIZE], float scale )
{
	const fltx4 fl4Scale = ReplicateX4(scale);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&input[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		const fltx4 fl4Sample = LoadAlignedSIMD( &pSrc[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MulSIMD( fl4Scale, fl4Sample ) );
	}
}
// Writes flInput into flOutput scaled by a volume that ramps linearly across the buffer.
// The per-sample step is (flScaleEnd - flScaleStart) / MIX_BUFFER_SIZE, and the ramp is offset
// by one step up front, so the first sample is scaled by flScaleStart plus one step.
void ScaleBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Ramp = ReplicateX4( flScaleStart );
	const fltx4 fl4Step = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Ramp ) );
	// lanes of the first block sit 1, 2, 3 and 4 steps past the start volume
	fl4Ramp = AddSIMD( fl4Ramp, MulSIMD( fl4Step, g_fl4_Sequence1234 ) );
	// amount the ramp moves between consecutive blocks of four samples
	const fltx4 fl4BlockStep = MulSIMD( fl4Step, Four_Fours );

	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE / 4; nBlock++ )
	{
		const fltx4 fl4In = LoadAlignedSIMD( &pSrc[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MulSIMD( fl4Ramp, fl4In ) );
		fl4Ramp = AddSIMD( fl4BlockStep, fl4Ramp );
	}
}
void SilenceBuffer( float flBuffer[MIX_BUFFER_SIZE] )
{
fltx4 * RESTRICT pOut = (fltx4 *)&flBuffer[0];
fltx4 fl4Zero = LoadZeroSIMD();
for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
{
StoreAlignedSIMD( (float *)pOut, fl4Zero );
pOut++;
}
}
// Zeroes the sample data of nBufferCount consecutive mix buffers starting at pBuffers.
void SilenceBuffers( CAudioMixBuffer *pBuffers, int nBufferCount )
{
	int nBuffer = 0;
	while ( nBuffer < nBufferCount )
	{
		SilenceBuffer( pBuffers[nBuffer].m_flData );
		nBuffer++;
	}
}
// Accumulates flInput * scale into flOutput (flOutput += flInput * scale), four samples at a
// time, using aligned SIMD loads / stores.
void MixBuffer( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float scale )
{
	const fltx4 fl4Scale = ReplicateX4(scale);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		const fltx4 fl4In = LoadAlignedSIMD( &pSrc[nBlock] );
		const fltx4 fl4Existing = LoadAlignedSIMD( &pDst[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4Scale, fl4In, fl4Existing ) );
	}
}
// Accumulates flInput into flOutput with a volume that ramps linearly across the buffer.
// The per-sample step is (flScaleEnd - flScaleStart) / MIX_BUFFER_SIZE, and the ramp is offset
// by one step up front, so the first sample is mixed at flScaleStart plus one step.
void MixBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Ramp = ReplicateX4( flScaleStart );
	const fltx4 fl4Step = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Ramp ) );
	// lanes of the first block sit 1, 2, 3 and 4 steps past the start volume
	fl4Ramp = AddSIMD( fl4Ramp, MulSIMD( fl4Step, g_fl4_Sequence1234 ) );
	// amount the ramp moves between consecutive blocks of four samples
	const fltx4 fl4BlockStep = MulSIMD( fl4Step, Four_Fours );

	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE / 4; nBlock++ )
	{
		const fltx4 fl4In = LoadAlignedSIMD( &pSrc[nBlock] );
		const fltx4 fl4Existing = LoadAlignedSIMD( &pDst[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4Ramp, fl4In, fl4Existing ) );
		fl4Ramp = AddSIMD( fl4BlockStep, fl4Ramp );
	}
}
// Writes the weighted sum of two buffers into flOutput:
//   flOutput = flInput0 * flScale0 + flInput1 * flScale1
// Any previous contents of flOutput are overwritten, not accumulated.
void SumBuffer2x1( float flOutput[MIX_BUFFER_SIZE], float flInput0[MIX_BUFFER_SIZE], float flScale0, float flInput1[MIX_BUFFER_SIZE], float flScale1 )
{
	const fltx4 fl4Weight0 = ReplicateX4(flScale0);
	const fltx4 fl4Weight1 = ReplicateX4(flScale1);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc0 = (fltx4 *)&flInput0[0];
	fltx4 * RESTRICT pSrc1 = (fltx4 *)&flInput1[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		const fltx4 fl4A = LoadAlignedSIMD( &pSrc0[nBlock] );
		const fltx4 fl4B = LoadAlignedSIMD( &pSrc1[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4Weight0, fl4A, MulSIMD( fl4Weight1, fl4B ) ) );
	}
}
// Exchanges the contents of two mix buffers, four samples at a time.
void SwapBuffersInPlace( float flInput0[MIX_BUFFER_SIZE], float flInput1[MIX_BUFFER_SIZE] )
{
	fltx4 * RESTRICT pBufA = (fltx4 *)&flInput0[0];
	fltx4 * RESTRICT pBufB = (fltx4 *)&flInput1[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		// read both blocks before writing either one back
		const fltx4 fl4FromA = LoadAlignedSIMD( &pBufA[nBlock] );
		const fltx4 fl4FromB = LoadAlignedSIMD( &pBufB[nBlock] );
		StoreAlignedSIMD( (float *)&pBufA[nBlock], fl4FromB );
		StoreAlignedSIMD( (float *)&pBufB[nBlock], fl4FromA );
	}
}
// Returns the largest absolute sample value in the buffer (0 if every sample is zero).
// UNDONE: OPTIMIZE: SIMD implementation
float BufferLevel( float flInput0[MIX_BUFFER_SIZE] )
{
	float flPeak = 0.0f;
	const float *pSample = flInput0;
	const float * const pEnd = flInput0 + MIX_BUFFER_SIZE;
	for ( ; pSample < pEnd; ++pSample )
	{
		flPeak = Max( flPeak, (float)fabs( *pSample ) );
	}
	return flPeak;
}
// Returns the mean absolute sample value of the buffer:
// the sum of |sample| over all MIX_BUFFER_SIZE samples, divided by MIX_BUFFER_SIZE.
float AvergeBufferAmplitude( float flInput0[MIX_BUFFER_SIZE] )
{
	float flSum = 0;
	int nSample = 0;
	// accumulate front to back
	while ( nSample < MIX_BUFFER_SIZE )
	{
		flSum += fabs( flInput0[nSample] );
		nSample++;
	}
	return flSum * ( 1.0f / MIX_BUFFER_SIZE );
}