windows-xp/Source/XPSP1/NT/multimedia/directx/dmusic/dmsynth/sverb.c

/***********************************************************
Copyrights : ksWaves Ltd. 1998.

Provided to Microsoft under contract between ksWaves and Microsoft.

************************************************************/

/***********************************************************
General description :

The functions in this file provides for any apoplication to process audio data
with the SVerb algorithm.

In order to do so the application should :

1. Allocate two chunks of memory for 'Coefs' and 'States' with sizes as returned 
   by the functions 'GetCoefsSize' and 'GetStatesSize' accordingly.
   
2. Initialize these memory chunks using the functions : 'InitSVerb' and 'InitSVerbStates' 
   accordingly.

3. Change the settings of the SVerb sound using the function 'SetSVerb'.

4. Call one of the process functions according to the input/output data format:

   SVerbMonoToMonoShort 
   SVerbMonoToStereoShort
   SVerbStereoToStereoShort 
   SVerbMonoToMonoFloat
   SVerbMonoToStereoFloat
   SVerbStereoToStereoFloat

   The input/output are always the same data type (i.e. both input and output are short integer
   or both are 32bits floats).

   Stereo data format is always'interlaced' left,right samples.

   The 'coefs' and 'states' memory should be passed to the process functions.
   
5. Many coefs structures can be initialized each for different SVerb settings. Passing a different
   coefs structure will cause a real time change of sound quality.

   As long as sound continuity should be maintained the states structure should not be changes or
   re-initialized. Only when a completly new audio sequence is desired should the states be re-initialized.

6. Note that the coefs are valid per sampling rate.

7. Althaugh provisions for coefs compatibility for future versions are provided, it should be avoided to save coefs
   structures to files as-is and re-use them later. Rather the application should save the 'real-world'
   settings of the reverb - namely the parameters passed to 'SetSVerb'. These 'real-world' settings 
   will always be valid for future versions, as well as if other sampling rates are used. The coefs 
   structur(es) should be re-initialized in run time using the real-world settings and call to 
   'SetSverb'.


************************************************************/

#include <windows.h>
#include <String.h>
#include <math.h>
#include "SVerb.h"

#pragma optimize( "ty", on )

/****************************************************************************

Function Name	: GetCoefsSize

Input Arguments : None

Return Value    : The size of memory in bytes, to be allocated in order to hold coefficients.

Description		:  

This function must be called before any calls to other functions that uses the coefs structure.
The calling app must than allocate the returned size of memory and initialize it using 'InitSVerb()'
and 'SetSVerb()'.

The caller should not be botherred by the internals of the coefs structure, rather only konw it's size 
and than allocate enough memory to hold it.

The structure allocated can be used in a very flexible way in order to allow for real-time, pre-computed
changes in Reverb Sound. 

*****************************************************************************/

long GetCoefsSize(void) 
{
	return sizeof(sCoefsStruct); 
};

/****************************************************************************

Function Name	: GetStatesSize

Input Arguments : None

Return Value    : The size of memory in bytes, to be allocated in order to hold states.

Description		:  

This function must be called before any calls to other functions that uses the states structure.
The calling app must than allocate the returned size of memory and initialize it using 'InitSVerbStates()'.

The states allocated are valid in run-time only, and sould be re-initialized only when a complete
new input is to be processed by the SVerb. 

When changing the settings of revevreb in real time while audio is playing, the states should not 
be re-initialized, rather the same passed states must be passed to the process functions in order 
to maintain sound continuity.

*****************************************************************************/

long GetStatesSize(void) 
{
	return sizeof(long)*(BASE_REV_DELAY+2*BASE_DSPS_DELAY+2); 
};

/****************************************************************************

Function Name	: GetSVerbVersion

Input Arguments : None

Return Value    : Version of SVerb implementation - for future compatibility.

Description		:  

Since the caller do not know about the internals of the coefs structure, this function,
together with 'VerifyVersion' function provides a way to verify if a coefs structure
match the version of the reverb used.

This should be needed only if one is using a coefs structure that was saved to file, and 
being used later.

NOTE : In normal operation, this way of usage should be avoided... and only real-world reverb
settings should be saved to files, and re-initialize the coefs in run time.

*****************************************************************************/

long GetSVerbVersion(void) 
{
	return 0x1; 
};

/****************************************************************************

Function Name	: VerifySampleRate

Input Arguments : 

 void *pC		: The pointer to the coefs memory.

Return Value    : The sample rate for which this coefs are valid.

Description		:  

When an application uses different sampling rates, and re-uses same coefs structures, 
it should verify that the coefs match the audio sampling rate.

*****************************************************************************/

float VerifySampleRate(void *pC) {
	return ((sCoefsStruct *)pC)->SampleRate; 
};

/****************************************************************************

Function Name	: VerifyVersion

Input Arguments : 

 void *pC		: The pointer to the coefs memory.

Return Value    : The version of this coefs structure.

Description		:  

When initialized, each coefs structure is 'stamped' with it's version.
The location of this variable in the structure is fixed, and thus all future versions of
SVerb will know to read it.

Note : as explained above, in normal uses coefs should not be saved to files, rather the 
'real-world' settings should be saved and coefs re-initialized in run-time.
*****************************************************************************/

long VerifyVersion(void *pC) {
	return ((sCoefsStruct *)pC)->myVersion; 
};

/****************************************************************************

Function Name	: VerifySize

Input Arguments : 

 void *pC		: The pointer to the coefs memory.

Return Value    : The size of this coefs structure.

Description		:  

When initialized, each coefs structure is 'stamped' with it's size.
The location of this variable in the structure is fixed, and thus all future versions of
SVerb will know to read it.

Note : as explained above, in normal uses coefs should not be saved to files, rather the 
'real-world' settings should be saved and coefs re-initialized in run-time.
*****************************************************************************/

long VerifySize(void *pC) {
	return ((sCoefsStruct *)pC)->mySize; 
};


/****************************************************************************

Function Name	: InitSVerbStates

Input Arguments : 

 float *pStates	: The pointer to the states memory.

Return Value    : none.

Description		:  

After allocating memory for the states, according to thge size returned by 'GetStatesSize'
The application MUST initialize the states using this function. 
Note : in future versions this may be more complex than simply memset to 0...
*****************************************************************************/

void InitSVerbStates( long *pStates )
{
    memset( pStates, 0, GetStatesSize() ) ;
}

/****************************************************************************

Function Name	: DToF16

Input Arguments : 

 float SampleRate	: The sampling rate.
 void *pC			: The pointer to the coefs memory.

Return Value    : none.

Description		:  

Converts a float number between -1.0 .. 1.0 to a 16bits integer 
fixed point representation.
This allows for fix point arithmetics, where two 16bits integers are multiplied to 
a 32bits integer, and we than take the upper 16 bits of the result.

*****************************************************************************/

long DToF16( float dbl  )
{
	dbl *= MAX_16;
	dbl = max(-MAX_16,min(MAX_16-(float)1.0,dbl+(float)0.5));
	return (long)(dbl);
}

/****************************************************************************

Function Name	: ConvertCoefsToFix

Input Arguments : 

 void *pC			: The pointer to the coefs memory.

Return Value    : none.

Description		:  converts coefficients to longs, as fixed point numbers

*****************************************************************************/


void ConvertCoefsToFix( void *pC )
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);

//		float directGain; 

	pCoefs->l_directGain =  DToF16(pCoefs->directGain);

//		float revGain; 
	pCoefs->l_revGain =  DToF16(pCoefs->revGain);
//		float dDsps;
	pCoefs->l_dDsps =  DToF16(pCoefs->dDsps);
//		float dDG1;
	pCoefs->l_dDG1 =  DToF16(pCoefs->dDG1);
//		float dDG2; 
	pCoefs->l_dDG2 =  DToF16(pCoefs->dDG2);
//	float dFB11;
	pCoefs->l_dFB11 =  DToF16(pCoefs->dFB11);
//		float dFB12;
	pCoefs->l_dFB12 =  DToF16(pCoefs->dFB12);
//		float dFB21;
	pCoefs->l_dFB21 =  DToF16(pCoefs->dFB21);
//		float dFB22;
	pCoefs->l_dFB22 =  DToF16(pCoefs->dFB22);
//		float dFB31;
	pCoefs->l_dFB31 =  DToF16(pCoefs->dFB31);
//		float dFB32;
	pCoefs->l_dFB32 =  DToF16(pCoefs->dFB32);
//		float dFB41;
	pCoefs->l_dFB41 =  DToF16(pCoefs->dFB41);
//		float dFB42;
	pCoefs->l_dFB42 =  DToF16(pCoefs->dFB42);
//		float dDamp;
	pCoefs->l_dDamp =  DToF16(pCoefs->dDamp);


}

/****************************************************************************

Function Name	: InitSVerb

Input Arguments : 

 float SampleRate	: The sampling rate.
 void *pC			: The pointer to the coefs memory.

Return Value    : none.

Description		:  

After allocating memory for the coefs, according to thge size returned by 'GetCoefsSize'
The application MUST initialize the coefs using this function. 
The initialization takes the sampling rate as an argument, ans thus is valid per this
sampling rate only.

It is possible to find out what is the sampling rate a coefs structure is valid for by calling 
the function 'VerifySampleRate'.

This function initialises the SVerb to so reasonable default setting by calling 'SetSVerb' with
the following real-world settings :

InGain				= -3.0dB   (to avoid output overflows)
dRevMix				= -6.0dB   (a reasonable reverb mix)
dRevTime			= 1000.0ms (one second global reverb time)
dHighFreqRTRatio	= 0.001    (the ratio of the high frequencies to the global reverb time) 

*****************************************************************************/

void InitSVerb( float SampleRate, void *pC)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
 	//Magic numbers ...
    long lRefD;
	
	float dRatio =  (float)1.189207115003;
	
	float dD2MRatio = (float)0.2309333333;

	pCoefs->mySize = sizeof(sCoefsStruct);
	pCoefs->myVersion = 0x1;

	pCoefs->dDsps =  (float)0.6180339887499;

	pCoefs->SampleRate = SampleRate;

    lRefD = (long)( 0.5 + 0.045 * pCoefs->SampleRate ) ;

	pCoefs->lDelay1 = lRefD;
	pCoefs->lDelay3 = (long)(0.5 + dRatio * (float)pCoefs->lDelay1);
	pCoefs->lDelay2 = (long)(0.5 + dRatio * (float)pCoefs->lDelay3);
	pCoefs->lDelay4 = (long)(0.5 + dRatio * (float)pCoefs->lDelay2);
  
    pCoefs->lDDly1 = (long)(0.5 + 0.5 * dD2MRatio * (float)(pCoefs->lDelay1+pCoefs->lDelay2));
	pCoefs->lDDly2 = (long)(0.5 + 0.5 * dD2MRatio * (float)(pCoefs->lDelay3+pCoefs->lDelay4));

    pCoefs->lDelay1 -= pCoefs->lDDly1 ;    
    pCoefs->lDelay2 -= pCoefs->lDDly1 ;    
    pCoefs->lDelay3 -= pCoefs->lDDly2 ;    
    pCoefs->lDelay4 -= pCoefs->lDDly2 ;        

    pCoefs->lDelay1 <<= 2;    
    pCoefs->lDelay2 <<= 2;    
    pCoefs->lDelay3 <<= 2;    
    pCoefs->lDelay4 <<= 2;        

	pCoefs->lDDly1 <<= 1;
	pCoefs->lDDly2 <<= 1;

	SetSVerb( (float)0.0, (float)-10.0, (float)1000.0, (float)0.001, pC );

}

/****************************************************************************

Function Name	: SetSVerb

Input Arguments : 

InGain				: input gain in dB (to avoid output overflows)

dRevMix				: Reverb mix in dB. 0dB means 100% wet reverb (no direct signal)
                      Negative values gives less wet signal.
					  The coeficients are calculated so that the overall output level stays 
					  (approximately) constant regardless of the ammount of reverb mix.
dRevTime			: The global reverb time (decay time) in milliseconds.

dHighFreqRTRatio	: The ratio of the high frequencies to the global reverb time. 
					  Unless very 'splashy-bright' reverbs are wanted, this should be set to 
					  a value < 1.0.
					  For example if dRevTime==1000ms and dHighFreqRTRatio=0.1 than the 
					  decay time for high frequencies will be 100ms.

void *pC			: The pointer to the coefs memory.

Return Value    : none.

Description		:  

This function accepts the 'real world' settings or SVerb and computes the corresponding 
coefs structure.

The coefs pointer passed to it MUST have been initialized first by InitSVerb.

In normal uses one coefs structure is allocated, initialized, and than as the user changes 
SVerb settings this function should be called repeatedly with the same coefs pointer and the 
new 'real world' settings. 

And the coefs structure passed to the process function in the next buffer to process.

Also few coefs structures can be pre allocated, and initialized, and than different 'presets' 
can be pre-computed into each of them, and switched in real time. 

The coefs structures should not be saved to files by the application for future uses, rather 
the 'real world' settings them selvs. This way future compatibility is guaranteed.

*****************************************************************************/

void SetSVerb( float InGain, float dRevMix, 
			   float dRevTime, float dHighFreqRTRatio, void *pC )
{


	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);

    float dD,dTmp,dInGain,dRevGain;

	float dHfR;
    float dAPS;

    if (dHighFreqRTRatio > (float) 0.999)
    {
        dHighFreqRTRatio = (float) 0.999;
    }
    if (dHighFreqRTRatio <= (float) 0.0)
    {
        dHighFreqRTRatio = (float) 0.001;
    }
    dHfR = ( (float)1.0/dHighFreqRTRatio - (float)1.0);

    if (dRevTime < (float) 0.001) 
    {
        dRevTime = (float) 0.001;
    }

    if (InGain > (float) 0.0)
    {
        InGain = (float) 0.0;
    }

    if (dRevMix > (float) 0.0)
    {
        dRevMix = (float) 0.0;
    }

    if (pCoefs->SampleRate < (float) 1.0) 
    {
        pCoefs->SampleRate = (float) 22050.0;
    }

    dAPS = (float)(-3000.0) / (pCoefs->SampleRate * dRevTime);


    pCoefs->dDamp = 0.0;

 	pCoefs->dDG1 = (float)pow((float)10.0,(float)(pCoefs->lDDly1>>1)*dAPS);
 	pCoefs->dDG2 = (float)pow((float)10.0,(float)(pCoefs->lDDly2>>1)*dAPS);

	//////////////////////////////

		pCoefs->dFB11 = (float)pow((float)10.0,(float)(pCoefs->lDelay1>>2)*dAPS);
        
		dD = pCoefs->dFB11 * pCoefs->dDG1;
        dD = (float)1.0+dD*((float)1.0+dD*((float)1.0+dD*((float)1.0 + dD)));
        pCoefs->dDamp += dD *dD;

		dTmp = (float)pow((float)10.0,(float)((pCoefs->lDDly1>>1)+(pCoefs->lDelay1>>2))*dAPS*dHfR);
		dTmp = ((float)1.0 - dTmp)*(float)0.5;

		pCoefs->dFB12 = pCoefs->dFB11 * dTmp;
		pCoefs->dFB11 *= ((float)1.0-dTmp);

	///////////////////////////////

		pCoefs->dFB21 = (float)pow((float)10.0,(float)(pCoefs->lDelay2>>2)*dAPS);
        
		dD = pCoefs->dFB21 * pCoefs->dDG1;
        dD = (float)1.0+dD*((float)1.0+dD*((float)1.0+dD*((float)1.0 + dD)));
        pCoefs->dDamp += dD *dD;

		dTmp = (float)pow((float)10.0,(float)((pCoefs->lDDly1>>1)+(pCoefs->lDelay2>>2))*dAPS*dHfR);
		dTmp = ((float)1.0 - dTmp)*(float)0.5;

		pCoefs->dFB22 = pCoefs->dFB21 * dTmp;
		pCoefs->dFB21 *= ((float)1.0-dTmp);

	////////////////////////////////

		pCoefs->dFB31 = (float)pow((float)10.0,(float)(pCoefs->lDelay3>>2)*dAPS);
        
		dD = pCoefs->dFB31 * pCoefs->dDG2;
        dD = (float)1.0+dD*((float)1.0+dD*((float)1.0+dD*((float)1.0 + dD)));
		    pCoefs->dDamp += dD *dD;

		dTmp = (float)pow((float)10.0,(float)((pCoefs->lDDly2>>1)+(pCoefs->lDelay3>>2))*dAPS*dHfR);
		dTmp = ((float)1.0 - dTmp)*(float)0.5;

		pCoefs->dFB32 = pCoefs->dFB31 * dTmp;
		pCoefs->dFB31 *= ((float)1.0-dTmp);


	//////////////////////////////

		pCoefs->dFB41 = (float)pow((float)10.0,(float)(pCoefs->lDelay4>>2)*dAPS);

        dD = pCoefs->dFB41 * pCoefs->dDG2;
        dD = (float)1.0+dD*((float)1.0+dD*((float)1.0+dD*((float)1.0 + dD)));
        pCoefs->dDamp += dD *dD;

		dTmp = (float)pow((float)10.0,(float)((pCoefs->lDDly2>>1)+(pCoefs->lDelay4>>2))*dAPS*dHfR);
		dTmp = ((float)1.0 - dTmp)*(float)0.5;

		pCoefs->dFB42 = pCoefs->dFB41 * dTmp;
		pCoefs->dFB41 *= ((float)1.0-dTmp);


    pCoefs->dDamp = (float)sqrt(pCoefs->dDamp);

 	dInGain = (float)pow((float)10.0, (float)0.05*InGain ) ;
	dRevMix = (float)pow((float)10.0,(float)0.1*dRevMix);

	dRevGain = (float)4.0 / pCoefs->dDamp * dInGain;

	//in the DSP we used -  	 
	

	pCoefs->directGain = dInGain * (float)sqrt((float)1.0-dRevMix);
	pCoefs->revGain = dRevGain * (float)sqrt(dRevMix);

	ConvertCoefsToFix( pC );

}

///////////////////////////////////////////////////////////////////////////////////////
/**************************************************************************************/
/**************************************************************************************/
/**************************************************************************************/
/**************************************************************************************/
/* Process functions */
/**************************************************************************************/
/**************************************************************************************/
/**************************************************************************************/

/**********************************************************************************

Bellow are 6 different process functions.
The difference between the functions is only in the input/output data formats.

3 functions support short samples input/output.
3 other functions support float samples input/output.

Per each of the data types there are 3 functions :

  Mono-Mono
  Mono-Stereo
  Stereo-Stereo

The names of the functions are clear to which format they apply.

Stereo data is always interlaced left,right samples.

All process functions have basically the same format namely :

  SVerbXXXXXX(long NumInFrames, short *pInShort, short *pOutShort, 
			  void *pC, float *pStates)

Input arguments :

long NumInFrames	: Number of input frames
short *pInXXX		: Pointer to input buffer.
					  Each function expects the data format suggested by it's name in terms of
					  data type (short or float) and mono/stereo.
short *pOutXXX		: Pointer to output buffer.
					  Each function expects the data format suggested by it's name in terms of
					  data type (short or float) and mono/stereo.

void *pC			: The coefs structure allocated and initialized as explained above.
float *pStates		: The states structure allocated and initialized as explained above.

*******************************************************************************************/

void SVerbMonoToMonoShort(long NumInFrames, short *pInShort, short *pOutShort, 
						  void *pC, long *pStates)
{

	sCoefsStruct *pCoefs =  ((sCoefsStruct *)pC);
	long n_sample;
	long In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	long *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	long *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	long	*pDelay = pStates+2;
	long	*pDD1	 = pDelay+0x4000;
	long	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = In2 = (long)(*pInShort++)>>1;

			Out1 = (In1 * pCoefs->l_directGain)>>15;

			Out2 = (In2 * pCoefs->l_directGain)>>15;

			In1 = (In1 * pCoefs->l_revGain)>>15;

			In2 = (In2 * pCoefs->l_revGain)>>15;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dspsL( pDD1, DIndx, pCoefs->lDDly1, pCoefs->l_dDG1, pCoefs->l_dDsps, pNewDll1, pNewDll2 );
			dspsL( pDD2, DIndx, pCoefs->lDDly2, pCoefs->l_dDG2, pCoefs->l_dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + ((*pNewDll1*pCoefs->l_dFB11 + *pPrevDll1*pCoefs->l_dFB12)>>15);
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + ((*pNewDll2*pCoefs->l_dFB21 + *pPrevDll2*pCoefs->l_dFB22)>>15);			
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + ((*pNewDll3*pCoefs->l_dFB31 + *pPrevDll3*pCoefs->l_dFB32)>>15);
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + ((*pNewDll4*pCoefs->l_dFB41 + *pPrevDll4*pCoefs->l_dFB42)>>15);
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			Out1 += Out2;
			CLIP_SHORT_TO_SHORT(Out1)

			*pOutShort++ = (short)(Out1);
			
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}

#ifdef USE_ALL_VERBS
void SVerbMonoToStereoShort(long NumInFrames, short *pInShort, short *pOutShort, 
						    void *pC, long *pStates)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
	long n_sample;
	long In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	long *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	long *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	long	*pDelay = pStates+2;
	long	*pDD1	 = pDelay+0x4000;
	long	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = (long)(*pInShort++);
			In1 += (In1>>1) - (In1>>2);
			In2 = In1;

			Out1 = (In1 * pCoefs->l_directGain)>>15;

			Out2 = (In2 * pCoefs->l_directGain)>>15;

			In1 = (In1 * pCoefs->l_revGain)>>15;

			In2 = (In2 * pCoefs->l_revGain)>>15;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dspsL( pDD1, DIndx, pCoefs->lDDly1, pCoefs->l_dDG1, pCoefs->l_dDsps, pNewDll1, pNewDll2 );
			dspsL( pDD2, DIndx, pCoefs->lDDly2, pCoefs->l_dDG2, pCoefs->l_dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + ((*pNewDll1*pCoefs->l_dFB11 + *pPrevDll1*pCoefs->l_dFB12)>>15);
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + ((*pNewDll2*pCoefs->l_dFB21 + *pPrevDll2*pCoefs->l_dFB22)>>15);			
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + ((*pNewDll3*pCoefs->l_dFB31 + *pPrevDll3*pCoefs->l_dFB32)>>15);
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + ((*pNewDll4*pCoefs->l_dFB41 + *pPrevDll4*pCoefs->l_dFB42)>>15);
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			CLIP_SHORT_TO_SHORT(Out1)
			CLIP_SHORT_TO_SHORT(Out2)

			*pOutShort++ = (short)(Out1);
			*pOutShort++ = (short)(Out2);
			
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}
#endif

void SVerbStereoToStereoShort(long NumInFrames, short *pInShort, short *pOutShort, 
						      void *pC, long *pStates)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
	long n_sample;
	long In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	long *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	long *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	long	*pDelay = pStates+2;
	long	*pDD1	 = pDelay+0x4000;
	long	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = (long)(*pInShort++);
			In2 = (long)(*pInShort++);

			Out1 = (In1 * pCoefs->l_directGain)>>15;

			Out2 = (In2 * pCoefs->l_directGain)>>15;

			In1 = (In1 * pCoefs->l_revGain)>>15;

			In2 = (In2 * pCoefs->l_revGain)>>15;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dspsL( pDD1, DIndx, pCoefs->lDDly1, pCoefs->l_dDG1, pCoefs->l_dDsps, pNewDll1, pNewDll2 );
			dspsL( pDD2, DIndx, pCoefs->lDDly2, pCoefs->l_dDG2, pCoefs->l_dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + ((*pNewDll1*pCoefs->l_dFB11 + *pPrevDll1*pCoefs->l_dFB12)>>15);
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + ((*pNewDll2*pCoefs->l_dFB21 + *pPrevDll2*pCoefs->l_dFB22)>>15);			
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + ((*pNewDll3*pCoefs->l_dFB31 + *pPrevDll3*pCoefs->l_dFB32)>>15);
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + ((*pNewDll4*pCoefs->l_dFB41 + *pPrevDll4*pCoefs->l_dFB42)>>15);
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			CLIP_SHORT_TO_SHORT(Out1)
			CLIP_SHORT_TO_SHORT(Out2)

			*pOutShort++ = (short)(Out1);
			*pOutShort++ = (short)(Out2);
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}

#ifdef USE_ALL_VERBS

void SVerbMonoToMonoFloat(long NumInFrames, float *pInFloat, float *pOutFloat, 
						  void *pC, float *pStates)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
	long n_sample;
	float In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	float *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	float *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	float	*pDelay = pStates+2;
	float	*pDD1	 = pDelay+0x4000;
	float	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = In2 = (float)0.5 * (*pInFloat++) + FPU_DENORM_OFFS;

			Out1 = In1 * pCoefs->directGain;
			Out2 = In2 * pCoefs->directGain;

			In1 *= pCoefs->revGain;
			In2 *= pCoefs->revGain;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dsps( pDD1, DIndx, pCoefs->lDDly1, pCoefs->dDG1, pCoefs->dDsps, pNewDll1, pNewDll2 );
			dsps( pDD2, DIndx, pCoefs->lDDly2, pCoefs->dDG2, pCoefs->dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + *pNewDll1*pCoefs->dFB11 + *pPrevDll1*pCoefs->dFB12;
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + *pNewDll2*pCoefs->dFB21 + *pPrevDll2*pCoefs->dFB22;
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + *pNewDll3*pCoefs->dFB31 + *pPrevDll3*pCoefs->dFB32;
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + *pNewDll4*pCoefs->dFB41 + *pPrevDll4*pCoefs->dFB42;
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			*pOutFloat++ = Out1+Out2;
			
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}

void SVerbMonoToStereoFloat(long NumInFrames, float *pInFloat, float *pOutFloat, 
						    void *pC, float *pStates)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
	long n_sample;
	float In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	float *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	float *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	float	*pDelay = pStates+2;
	float	*pDD1	 = pDelay+0x4000;
	float	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = In2 = (float)0.7071 * (*pInFloat++) + FPU_DENORM_OFFS;

			Out1 = In1 * pCoefs->directGain;
			Out2 = In2 * pCoefs->directGain;

			In1 *= pCoefs->revGain;
			In2 *= pCoefs->revGain;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dsps( pDD1, DIndx, pCoefs->lDDly1, pCoefs->dDG1, pCoefs->dDsps, pNewDll1, pNewDll2 );
			dsps( pDD2, DIndx, pCoefs->lDDly2, pCoefs->dDG2, pCoefs->dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + *pNewDll1*pCoefs->dFB11 + *pPrevDll1*pCoefs->dFB12;
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + *pNewDll2*pCoefs->dFB21 + *pPrevDll2*pCoefs->dFB22;
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + *pNewDll3*pCoefs->dFB31 + *pPrevDll3*pCoefs->dFB32;
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + *pNewDll4*pCoefs->dFB41 + *pPrevDll4*pCoefs->dFB42;
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			*pOutFloat++ = Out1;
			*pOutFloat++ = Out2;
			
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}

void SVerbStereoToStereoFloat(long NumInFrames, float *pInFloat, float *pOutFloat, 
						      void *pC, float *pStates)
{

	sCoefsStruct *pCoefs = ((sCoefsStruct *)pC);
	long n_sample;
	float In1, In2, Out1, Out2;
	long Indx1,Indx2,Indx3,Indx4;
	float *pNewDll1, *pNewDll2, *pNewDll3, *pNewDll4;
	float *pPrevDll1, *pPrevDll2, *pPrevDll3, *pPrevDll4, *pDelayIn;
	float	*pDelay = pStates+2;
	float	*pDD1	 = pDelay+0x4000;
	float	*pDD2	 = pDD1+0x800;
	long Indx = ((long *)pStates)[0];
	long DIndx = ((long *)pStates)[1] ;

	Indx1 = (Indx+4+pCoefs->lDelay1) & REV_MASK;
	Indx2 = (Indx+4+pCoefs->lDelay2) & REV_MASK;
	Indx3 = (Indx+4+pCoefs->lDelay3) & REV_MASK;
	Indx4 = (Indx+4+pCoefs->lDelay4) & REV_MASK;

	pPrevDll1 = pDelay+Indx1;
	pPrevDll2 = pDelay+Indx2+1;
	pPrevDll3 = pDelay+Indx3+2;
	pPrevDll4 = pDelay+Indx4+3;

	Indx1 = (Indx1-4)&REV_MASK;
	Indx2 = (Indx2-4)&REV_MASK;
	Indx3 = (Indx3-4)&REV_MASK;
	Indx4 = (Indx4-4)&REV_MASK;

		for (n_sample = 0;n_sample < NumInFrames;n_sample++)
		{

			In1 = (*pInFloat++) + FPU_DENORM_OFFS;
			In2 = (*pInFloat++) + FPU_DENORM_OFFS;

			Out1 = In1 * pCoefs->directGain;
			Out2 = In2 * pCoefs->directGain;

			In1 *= pCoefs->revGain;
			In2 *= pCoefs->revGain;

			pNewDll1 = pDelay+Indx1;
			pNewDll2 = pDelay+Indx2+1;
			pNewDll3 = pDelay+Indx3+2;
			pNewDll4 = pDelay+Indx4+3;

			dsps( pDD1, DIndx, pCoefs->lDDly1, pCoefs->dDG1, pCoefs->dDsps, pNewDll1, pNewDll2 );
			dsps( pDD2, DIndx, pCoefs->lDDly2, pCoefs->dDG2, pCoefs->dDsps, pNewDll3, pNewDll4 );

			Out1 += *pNewDll1 + *pNewDll3;
			Out2 += *pNewDll2 + *pNewDll4;

			pDelayIn = pDelay + Indx;

			*pDelayIn++ = In1 + *pNewDll1*pCoefs->dFB11 + *pPrevDll1*pCoefs->dFB12;
			pPrevDll1 = pNewDll1;
			Indx1 = (Indx1 - 4) & REV_MASK;

			*pDelayIn++ = In2 + *pNewDll2*pCoefs->dFB21 + *pPrevDll2*pCoefs->dFB22;
			pPrevDll2 = pNewDll2;
			Indx2 = (Indx2 - 4) & REV_MASK;

			*pDelayIn++ = -In2 + *pNewDll3*pCoefs->dFB31 + *pPrevDll3*pCoefs->dFB32;
			pPrevDll3 = pNewDll3;
			Indx3 = (Indx3 - 4) & REV_MASK;

			*pDelayIn++ = In1 + *pNewDll4*pCoefs->dFB41 + *pPrevDll4*pCoefs->dFB42;
			pPrevDll4 = pNewDll4;
			Indx4 = (Indx4 - 4) & REV_MASK;

			Indx = (Indx - 4) & REV_MASK;
			DIndx = (DIndx - 2) & DSPS_MASK;

			*pOutFloat++ = Out1;
			*pOutFloat++ = Out2;
			
		}

	((long *)pStates)[0] = Indx ;
	((long *)pStates)[1] = DIndx ;

}

__inline void dsps( float *pDly, long ref, long delay, float dDG1, float dDsps, float *inL, float *inR )
{
	float outL, outR;
	float *pDlyOut; 

    pDlyOut = pDly + ((ref+delay) & DSPS_MASK);
    pDly += (ref & DSPS_MASK);

    outL = dDG1 * (*pDlyOut++) + *inR * dDsps;
	outR = dDG1 * (*pDlyOut) - *inL * dDsps ;

    // here we feed back the output.
	*pDly++ = *inL + dDsps * outR ;
	*pDly = *inR - dDsps * outL ;

	*inL = outL;
	*inR = outR;

}
#endif

__inline void dspsL( long *pDly, long ref, long delay, long dDG1, long dDsps, long *inL, long *inR )
{
	long outL, outR;
	long *pDlyOut; 

    pDlyOut = pDly + ((ref+delay) & DSPS_MASK);
    pDly += (ref & DSPS_MASK);

    outL = (dDG1 * (*pDlyOut++) + *inR * dDsps)>>15;

	outR = (dDG1 * (*pDlyOut) - *inL * dDsps)>>15;

    // here we feed back the output.
	*pDly++ = *inL + ((dDsps * outR)>>15) ;

	*pDly = *inR - ((dDsps * outL)>>15) ;

	*inL = outL;
	*inR = outR;

}
#pragma optimize( "ty", off )