csgo/cstrike15_src/public/mathlib/ssemath_360.h


								//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//

								//

								// Purpose: Implementation of our SIMD functions for the 360.

								//==============================================================//


								#ifndef DBG_H

								#include "tier0/dbg.h"

								#endif


								//---------------------------------------------------------------------

								// X360 implementation

								//---------------------------------------------------------------------


								// Returns a writable reference to float lane idx of a,
								// reached by viewing the register through fltx4_union.
								FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
								{
									return ( (fltx4_union &)a ).m128_f32[idx];
								}


								// Returns a writable reference to lane idx of a, seen as an unsigned
								// integer through fltx4_union.
								FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
								{
									return ( (fltx4_union &)a ).m128_u32[idx];
								}


								// a + b, via the __vaddfp intrinsic.
								FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 sum = __vaddfp( a, b );
									return sum;
								}


								// a - b, via the __vsubfp intrinsic.
								FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 difference = __vsubfp( a, b );
									return difference;
								}


								// a * b, via the __vmulfp intrinsic.
								FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 product = __vmulfp( a, b );
									return product;
								}


								// Multiply-add: a*b + c, via the __vmaddfp intrinsic.
								FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
								{
									const fltx4 mulAdd = __vmaddfp( a, b, c );
									return mulAdd;
								}


								// Negated multiply-subtract: c - a*b, via the __vnmsubfp intrinsic.
								// (The stray ';' that used to follow the closing brace has been removed;
								// it formed an empty declaration that stricter compilers reject.)
								FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
								{
									return __vnmsubfp( a, b, c );
								}


								// Three-component dot product of a and b, via the __vmsum3fp intrinsic.
								FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
								{
									const fltx4 dot = __vmsum3fp( a, b );
									return dot;
								}


								// Four-component dot product of a and b, via the __vmsum4fp intrinsic.
								FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
								{
									const fltx4 dot = __vmsum4fp( a, b );
									return dot;
								}


								// Sine of the given angles (radians); forwards to XMVectorSin.
								FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
								{
									const fltx4 sine = XMVectorSin( radians );
									return sine;
								}


								// Writes sin(radians) to sine and cos(radians) to cosine.
								// Forwards to XMVectorSinCos, exactly as SinCosSIMD does.
								FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
								{
									fltx4 * const pSineOut = &sine;
									fltx4 * const pCosineOut = &cosine;
									XMVectorSinCos( pSineOut, pCosineOut, radians );
								}


								// Writes sin(radians) to sine and cos(radians) to cosine in a single
								// XMVectorSinCos call.
								FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
								{
									fltx4 * const pSineOut = &sine;
									fltx4 * const pCosineOut = &cosine;
									XMVectorSinCos( pSineOut, pCosineOut, radians );
								}


								// Writes cos(radians) to the cosine out-parameter; forwards to XMVectorCos.
								FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
								{
									const fltx4 result = XMVectorCos( radians );
									cosine = result;
								}


								// Arcsine of the given sine values; forwards to XMVectorASin.
								FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
								{
									const fltx4 angle = XMVectorASin( sine );
									return angle;
								}


								// Arccosine of the given cosine values; forwards to XMVectorACos.
								FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
								{
									const fltx4 angle = XMVectorACos( cs );
									return angle;
								}


								// tan^-1(a/b) .. i.e., pass sin in as a and cos in as b

								FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
								{
									// a is the numerator, b the denominator; forwarded to XMVectorATan2 in that order
									const fltx4 angle = XMVectorATan2( a, b );
									return angle;
								}


								// DivSIMD defined further down, since it uses ReciprocalSIMD


								// max(a,b), via the __vmaxfp intrinsic.
								FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 larger = __vmaxfp( a, b );
									return larger;
								}


								// min(a,b), via the __vminfp intrinsic.
								FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 smaller = __vminfp( a, b );
									return smaller;
								}


								// Bitwise a & b, via the __vand intrinsic.
								FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 bits = __vand( a, b );
									return bits;
								}


								// Bitwise ~a & b.
								FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
								{
									// The operands are deliberately passed in reverse order: SSE's and-not
									// complements its first argument, whereas VMX's vandc complements its second.
									const fltx4 bits = __vandc( b, a );
									return bits;
								}


								// Bitwise a ^ b, via the __vxor intrinsic.
								FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 bits = __vxor( a, b );
									return bits;
								}


								// Bitwise a | b, via the __vor intrinsic.
								FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 bits = __vor( a, b );
									return bits;
								}


								// Negation: -a; forwards to XMVectorNegate.
								FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
								{
									const fltx4 negated = XMVectorNegate( a );
									return negated;
								}


								// True when all floats of a are zero.
								FORCEINLINE bool IsAllZeros( const fltx4 & a )
								{
									// Float-compare a against Four_Zeros; the "R" form of the intrinsic
									// also writes the comparison flags, which are then tested for "all true".
									unsigned int nCompareFlags = 0;
									__vcmpeqfpR( a, Four_Zeros, &nCompareFlags );
									return XMComparisonAllTrue( nCompareFlags );
								}


								// True when any float of a is zero.
								FORCEINLINE bool IsAnyZeros( const fltx4 & a )
								{
									unsigned int nCompareFlags;
									// compare against a zero vector, recording the flags for the test below
									XMVectorEqualR( &nCompareFlags, a, XMVectorZero() );
									return XMComparisonAnyTrue( nCompareFlags );
								}


								// True when any of a.x, a.y, a.z is zero (a.w is ignored).
								FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )
								{
									// Overwrite w with a copy of x first, so that a zero in w cannot
									// trigger the test on its own.
									const fltx4 xyzx = __vrlimi( a, a, 1, 1 );

									unsigned int nCompareFlags;
									XMVectorEqualR( &nCompareFlags, xyzx, XMVectorZero() );
									return XMComparisonAnyTrue( nCompareFlags );
								}


								/// for branching when a.xyzw > b.xyzw

								FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
								{
									// recording compare (a > b), then require every component to have passed
									unsigned int nCompareFlags;
									XMVectorGreaterR( &nCompareFlags, a, b );
									return XMComparisonAllTrue( nCompareFlags );
								}


								/// for branching when a.xyzw >= b.xyzw

								FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
								{
									// recording compare (a >= b), then require every component to have passed
									unsigned int nCompareFlags;
									XMVectorGreaterOrEqualR( &nCompareFlags, a, b );
									return XMComparisonAllTrue( nCompareFlags );
								}


								/// for branching when any component of a.xyzw > the matching component of b.xyzw

								FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
								{
									// recording compare (a > b), then accept if at least one component passed
									unsigned int nCompareFlags;
									XMVectorGreaterR( &nCompareFlags, a, b );
									return XMComparisonAnyTrue( nCompareFlags );
								}


								/// for branching when any component of a.xyzw >= the matching component of b.xyzw

								FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
								{
									// recording compare (a >= b), then accept if at least one component passed
									unsigned int nCompareFlags;
									XMVectorGreaterOrEqualR( &nCompareFlags, a, b );
									return XMComparisonAnyTrue( nCompareFlags );
								}


								// For branching if all a.xyzw == b.xyzw

								FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
								{
									// recording compare (a == b), then require every component to have passed
									unsigned int nCompareFlags;
									XMVectorEqualR( &nCompareFlags, a, b );
									return XMComparisonAllTrue( nCompareFlags );
								}


								// Builds a 4-bit mask of which floats have their high (sign) bit set:
								// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
								FORCEINLINE int TestSignSIMD( const fltx4 & a )
								{
									// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
									const fltx4_union & lanes = (const fltx4_union &)a;

									int nSignMask = 0;
									for ( int i = 0; i < 4; ++i )
									{
										// isolate the top bit of lane i and move it to bit position i
										nSignMask |= ( ( lanes.m128_u32[i] & 0x80000000 ) >> 31 ) << i;
									}
									return nSignMask;
								}


								// Squelch the w component of a vector to +0.0.

								// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)

								FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
								{
									// insert a word of the zero vector into a under mask 1 (the w slot), no rotation
									const fltx4 zero = __vzero();
									return __vrlimi( a, zero, 1, 0 );
								}


								// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
								FORCEINLINE bool IsAnyNegative( const fltx4 & a )
								{
									// NOTE: this tests the top bits of each vector element using integer math
									//       (so it ignores NaNs - it will return true for "-NaN")

									// Build 0x80000000 in every word: splat -1 (all bits set, so the low
									// five bits of each word read as 31), then shift each word left by
									// the amount held in itself.
									const fltx4 allBitsSet  = __vspltisw( -1 );
									const fltx4 signBitOnly = __vslw( allBitsSet, allBitsSet );

									// Integer-compare the isolated sign bits against zero; if not every
									// word compares equal, at least one sign bit was set.
									unsigned int nCompareFlags = 0;
									__vcmpequwR( Four_Zeros, __vand( signBitOnly, a ), &nCompareFlags );
									return !XMComparisonAllTrue( nCompareFlags );
								}


								// (a==b) ? ~0 : 0
								FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 mask = __vcmpeqfp( a, b );
									return mask;
								}


								// (a>b) ? ~0 : 0
								FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 mask = __vcmpgtfp( a, b );
									return mask;
								}


								// (a>=b) ? ~0 : 0
								FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 mask = __vcmpgefp( a, b );
									return mask;
								}


								// (a<b) ? ~0 : 0
								FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
								{
									// a < b is evaluated as b > a
									const fltx4 mask = __vcmpgtfp( b, a );
									return mask;
								}


								// (a<=b) ? ~0 : 0
								FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
								{
									// a <= b is evaluated as b >= a
									const fltx4 mask = __vcmpgefp( b, a );
									return mask;
								}


								// (a <= b && a >= -b) ? ~0 : 0
								FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
								{
									const fltx4 mask = XMVectorInBounds( a, b );
									return mask;
								}


								// (a==b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )
								{
									// AND the compare mask with Four_Ones to turn it into 1.0 / 0
									const fltx4 mask = __vcmpeqfp( a, b );
									return AndSIMD( Four_Ones, mask );
								}


								// (a>b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )
								{
									// AND the compare mask with Four_Ones to turn it into 1.0 / 0
									const fltx4 mask = __vcmpgtfp( a, b );
									return AndSIMD( Four_Ones, mask );
								}


								// (a>=b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )
								{
									// AND the compare mask with Four_Ones to turn it into 1.0 / 0
									const fltx4 mask = __vcmpgefp( a, b );
									return AndSIMD( Four_Ones, mask );
								}


								// (a<b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )
								{
									// a < b is evaluated as b > a; the mask is then ANDed with Four_Ones
									const fltx4 mask = __vcmpgtfp( b, a );
									return AndSIMD( Four_Ones, mask );
								}


								// (a<=b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )
								{
									// a <= b is evaluated as b >= a; the mask is then ANDed with Four_Ones
									const fltx4 mask = __vcmpgefp( b, a );
									return AndSIMD( Four_Ones, mask );
								}


								// (a <= b && a >= -b) ? 1.0 : 0
								FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )
								{
									// AND the in-bounds mask with Four_Ones to turn it into 1.0 / 0
									const fltx4 mask = XMVectorInBounds( a, b );
									return AndSIMD( Four_Ones, mask );
								}


								// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue

								FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
								{
									// __vsel takes the "mask clear" source first, the "mask set" source
									// second, and the mask last.
									const fltx4 blended = __vsel( OldValue, NewValue, ReplacementMask );
									return blended;
								}


								// AKA "Broadcast", "Splat"

								// Returns a,a,a,a for a scalar passed by value.
								FORCEINLINE fltx4 ReplicateX4( float flValue )
								{
									// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
									float * pValue = &flValue;
									Assert( pValue );
									Assert( ((unsigned int)pValue & 3) == 0);

									// load the float with lvlx, then splat word 0 across the vector
									const fltx4 loaded = __lvlx( pValue, 0 );
									return __vspltw( loaded, 0 );
								}


								// Returns a,a,a,a for a scalar read through pValue (must be non-null).
								FORCEINLINE fltx4 ReplicateX4( const float *pValue )
								{
									Assert( pValue );

									// load the float with lvlx, then splat word 0 across the vector
									const fltx4 loaded = __lvlx( pValue, 0 );
									return __vspltw( loaded, 0 );
								}


								/// replicate a single 32 bit integer value to all 4 components of an m128

								FORCEINLINE fltx4 ReplicateIX4( int nValue )
								{
									// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
									int * pValue = &nValue;
									Assert( pValue );
									Assert( ((unsigned int)pValue & 3) == 0);

									// load the int with lvlx, then splat word 0 across the vector
									const fltx4 loaded = __lvlx( pValue, 0 );
									return __vspltw( loaded, 0 );
								}


								// Round towards positive infinity

								FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
								{
									// __vrfip: round to integer, toward positive infinity
									const fltx4 rounded = __vrfip( a );
									return rounded;
								}


								// Round towards nearest integer

								FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
								{
									// __vrfin: round to the nearest integer
									const fltx4 rounded = __vrfin( a );
									return rounded;
								}


								// Round towards negative infinity

								FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
								{
									// __vrfim: round to integer, toward negative infinity
									const fltx4 rounded = __vrfim( a );
									return rounded;
								}


								// sqrt(a), more or less.
								FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
								{
									// This is emulated from rsqrt
									const fltx4 estimate = XMVectorSqrtEst( a );
									return estimate;
								}


								// sqrt(a).
								FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
								{
									// This is emulated from rsqrt
									const fltx4 root = XMVectorSqrt( a );
									return root;
								}


								// 1/sqrt(a), more or less (the __vrsqrtefp estimate).
								FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
								{
									const fltx4 estimate = __vrsqrtefp( a );
									return estimate;
								}


								// Estimated 1/sqrt(a) in which zero inputs are replaced by an epsilon first.
								FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
								{
									// Mask of the components equal to zero ...
									const fltx4 isZero = CmpEqSIMD( a, Four_Zeros );
									// ... used to OR Four_Epsilons into exactly those components.
									const fltx4 epsilonWhereZero = AndSIMD( Four_Epsilons, isZero );
									const fltx4 nonZeroInput = OrSIMD( a, epsilonWhereZero );

									return ReciprocalSqrtEstSIMD( nonZeroInput );
								}


								// 1/sqrt(a).
								FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
								{
									// This uses Newton-Raphson to improve the HW result
									const fltx4 rsqrt = XMVectorReciprocalSqrt( a );
									return rsqrt;
								}


								// 1/a, more or less (the __vrefp estimate).
								FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
								{
									const fltx4 estimate = __vrefp( a );
									return estimate;
								}


								/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.

								/// No error checking!

								FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
								{
									// 1/a: this uses Newton-Raphson to improve the HW result
									const fltx4 reciprocal = XMVectorReciprocal( a );
									return reciprocal;
								}


								// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)

								FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
								{
									// a/b is computed as (1/b) * a
									const fltx4 oneOverB = ReciprocalSIMD( b );
									return MulSIMD( oneOverB, a );
								}


								// CHRISG: is it worth doing integer bitfiddling for this?

								// 2^x for all values (the antilog)

								FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
								{
									// forwards to XMVectorExp for the antilog
									const fltx4 result = XMVectorExp( toPower );
									return result;
								}


								// Clamps the components of a vector to a specified minimum and maximum range.

								FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
								{
									// forwards to XMVectorClamp with the same argument order
									const fltx4 clamped = XMVectorClamp( in, min, max );
									return clamped;
								}


								// Loads a 4-vector from pSIMD via XMLoadVector4 (the unaligned-load path).
								FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
								{
									const fltx4 loaded = XMLoadVector4( pSIMD );
									return loaded;
								}


								// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).

								FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
								{
									// three-component load via XMLoadVector3
									const fltx4 loaded = XMLoadVector3( pSIMD );
									return loaded;
								}


								// load a single unaligned float into the x component of a SIMD word

								FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
								{
									// lvlx at offset 0 brings the float at pFlt into the x component
									const fltx4 loaded = __lvlx( pFlt, 0 );
									return loaded;
								}


								// Reads a fltx4 straight out of the memory at pSIMD by dereferencing it
								// as a fltx4 pointer.
								FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
								{
									const fltx4 *pVector = reinterpret_cast< const fltx4 * >( pSIMD );
									return *pVector;
								}


								// Loads a shortx8 from pSIMD via XMLoadVector4A (the aligned-load path).
								FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
								{
									const shortx8 loaded = XMLoadVector4A( pSIMD );
									return loaded;
								}


								// Loads a shortx8 from pSIMD via XMLoadVector4 (the unaligned-load path).
								FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
								{
									const shortx8 loaded = XMLoadVector4( pSIMD );
									return loaded;
								}


								// Writes a to the memory at pSIMD by assigning through a fltx4 pointer.
								FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
								{
									fltx4 *pDestination = reinterpret_cast< fltx4 * >( pSIMD );
									*pDestination = a;
								}


								// Stores all four components of a to pSIMD via XMStoreVector4
								// (the unaligned-store path).
								FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
								{
									float * const pDestination = pSIMD;
									XMStoreVector4( pDestination, a );
								}


								// Stores three components of a to pSIMD via XMStoreVector3.
								FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
								{
									float * const pDestination = pSIMD;
									XMStoreVector3( pDestination, a );
								}


								// Fixed-point conversion and save as SIGNED INTS.

								// pDest->x = Int (vSrc.x)

								// note: some architectures have means of doing

								// fixed point conversion when the fix depth is

								// specified as an immediate.. but there is no way

								// to guarantee an immediate as a parameter to function

								// like this.

								FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
								{
									// convert to signed ints with a fixed-point depth of 0, then store
									// the result to pDest's base address via XMStoreVector4A
									XMStoreVector4A( pDest->Base(), __vctsxs( vSrc, 0 ) );
								}


								// Transposes, in place, the 4x4 matrix whose rows are x, y, z and w.
								FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
								{
									// pack the four rows into a matrix and transpose it
									XMMATRIX rows = _XMMATRIX( x, y, z, w );
									const XMMATRIX transposed = XMMatrixTranspose( rows );

									// hand the transposed rows back through the reference parameters
									x = transposed.r[0];
									y = transposed.r[1];
									z = transposed.r[2];
									w = transposed.r[3];
								}


								// Return zero in the fastest way -- faster even than loading.

								FORCEINLINE fltx4 LoadZeroSIMD( void )
								{
									// a zero vector, as produced by XMVectorZero
									const fltx4 zero = XMVectorZero();
									return zero;
								}


								// Return one in the fastest way -- faster even than loading.

								FORCEINLINE fltx4 LoadOneSIMD( void )
								{
									// a vector of ones, as produced by XMVectorSplatOne
									const fltx4 one = XMVectorSplatOne();
									return one;
								}


								// Replicates a.x into every component; forwards to XMVectorSplatX.
								FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
								{
									const fltx4 splat = XMVectorSplatX( a );
									return splat;
								}


								// Replicates a.y into every component; forwards to XMVectorSplatY.
								FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
								{
									const fltx4 splat = XMVectorSplatY( a );
									return splat;
								}


								// Replicates a.z into every component; forwards to XMVectorSplatZ.
								FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
								{
									const fltx4 splat = XMVectorSplatZ( a );
									return splat;
								}


								// Replicates a.w into every component; forwards to XMVectorSplatW.
								FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
								{
									const fltx4 splat = XMVectorSplatW( a );
									return splat;
								}


								// Returns a with its x component replaced by the x component of x.
								FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
								{
									// vrlimi insert mask 8 addresses the x slot; rotation 0 leaves the source in place
									return __vrlimi( a, x, 8, 0 );
								}


								// Returns a with its y component replaced by the y component of y.
								FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
								{
									// vrlimi insert mask 4 addresses the y slot; rotation 0 leaves the source in place
									return __vrlimi( a, y, 4, 0 );
								}


								// Returns a with its z component replaced by the z component of z.
								FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
								{
									// vrlimi insert mask 2 addresses the z slot; rotation 0 leaves the source in place
									return __vrlimi( a, z, 2, 0 );
								}


								// Returns a with its w component replaced by the w component of w.
								FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
								{
									// vrlimi insert mask 1 addresses the w slot; rotation 0 leaves the source in place
									return __vrlimi( a, w, 1, 0 );
								}


								// Rotates the components of a left by one position.
								FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
								{
									// all four insert bits set (8|4|2|1), so every slot takes the rotated source
									fltx4 destination = a;
									return __vrlimi( destination, a, 0xF, 1 );
								}


								// Rotates the components of a left by two positions.
								FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
								{
									// all four insert bits set (8|4|2|1), so every slot takes the rotated source
									fltx4 destination = a;
									return __vrlimi( destination, a, 0xF, 2 );
								}


								// Rotates the components of a right by one position.
								FORCEINLINE fltx4 RotateRight( const fltx4 & a )
								{
									// in a four-element vector, right by one is expressed as vrlimi's left rotate by three
									fltx4 destination = a;
									return __vrlimi( destination, a, 0xF, 3 );
								}


								// Rotates the components of a right by two positions.
								FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
								{
									// in a four-element vector, right by two equals left by two
									fltx4 destination = a;
									return __vrlimi( destination, a, 0xF, 2 );
								}


								// find the lowest component of a.x, a.y, a.z,

								// and replicate it to the whole return value.

								// ignores a.w.

								// Though this is only five instructions long,

								// they are all dependent, making this stall city.

								// Forcing this inline should hopefully help with scheduling.

								FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
								{
									// Input layout: [x, y, z, G], G being a don't-care value.

									// Step 1: bring y and z into the first two slots -> [y, z, G, G].
									fltx4 shifted = a;
									shifted = __vrlimi( shifted, a, 8 | 4, 1 );

									// Step 2: pairwise minimum -> [min(x,y), min(y,z), G, G].
									fltx4 lowest = MinSIMD( a, shifted );

									// Step 3: bring z into the first slot -> [z, G, G, G].
									shifted = __vrlimi( shifted, a, 8, 2 );

									// Step 4: fold z in -> [min(min(x,y),z), G, G, G].
									lowest = MinSIMD( lowest, shifted );

									// Step 5: broadcast the x component to all four slots.
									return SplatXSIMD( lowest );
								}


								// find the highest component of a.x, a.y, a.z,

								// and replicate it to the whole return value.

								// ignores a.w.

								// Though this is only five instructions long,

								// they are all dependent, making this stall city.

								// Forcing this inline should hopefully help with scheduling.

								FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
								{
									// Input layout: [x, y, z, G], G being a don't-care value.

									// Step 1: bring y and z into the first two slots -> [y, z, G, G].
									fltx4 shifted = a;
									shifted = __vrlimi( shifted, a, 8 | 4, 1 );

									// Step 2: pairwise maximum -> [max(x,y), max(y,z), G, G].
									fltx4 highest = MaxSIMD( a, shifted );

									// Step 3: bring z into the first slot -> [z, G, G, G].
									shifted = __vrlimi( shifted, a, 8, 2 );

									// Step 4: fold z in -> [max(max(x,y),z), G, G, G].
									highest = MaxSIMD( highest, shifted );

									// Step 5: broadcast the x component to all four slots.
									return SplatXSIMD( highest );
								}


								// ------------------------------------

								// INTEGER SIMD OPERATIONS.

								// ------------------------------------


								// Load 4 aligned words into a SIMD register

								FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
								{
									// aligned four-word load via XMLoadVector4A
									const i32x4 loaded = XMLoadVector4A( pSIMD );
									return loaded;
								}


								// Load 4 unaligned words into a SIMD register

								FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
								{
									// unaligned four-word load via XMLoadVector4
									const i32x4 loaded = XMLoadVector4( pSIMD );
									return loaded;
								}


								// save into four words, 16-byte aligned

								FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
								{
									// write the whole register by assigning through an i32x4 pointer
									i32x4 *pDestination = reinterpret_cast< i32x4 * >( pSIMD );
									*pDestination = a;
								}


								// Writes a into the storage returned by pSIMD.Base(), assigning through
								// an i32x4 pointer.
								FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
								{
									i32x4 *pDestination = reinterpret_cast< i32x4 * >( pSIMD.Base() );
									*pDestination = a;
								}


								// Stores the four words of a to pSIMD via XMStoreVector4
								// (the unaligned-store path).
								FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
								{
									int32 * const pDestination = pSIMD;
									XMStoreVector4( pDestination, a );
								}


								// Load four consecutive uint16's, and turn them into floating point numbers.

								// This function isn't especially fast and could be made faster if anyone is

								// using it heavily.

								FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
								{
									// view the four uint16s as an XMUSHORT4 and let XMLoadUShort4 do the conversion
									const XMUSHORT4 *pPacked = reinterpret_cast< const XMUSHORT4 * >( pInts );
									return XMLoadUShort4( pPacked );
								}


								// a={ a.x, a.z, b.x, b.z }

								// combine two fltx4s by throwing away every other field.

								FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
								{
									// permute indices 0 and 2 pick a.x and a.z; 4 and 6 pick b.x and b.z
									return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) );
								}


								// a={ a.x, b.x, c.x, d.x }

								// combine 4 fltx4s by throwing away 3/4s of the fields

								// TODO: make more efficient by doing this in a parallel way at the caller

								//    Compress4SIMD(FourVectors.. )

								FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
								{
									// Start from a and overwrite one slot per step with the x component
									// of the next input; each rotation lines that x up with its target slot.
									fltx4 packed = __vrlimi( a, b, 4, 3 );		// [ a.x, b.x, a.z, a.w ]
									packed = __vrlimi( packed, c, 2, 2 );		// [ a.x, b.x, c.x, a.w ]
									packed = __vrlimi( packed, d, 1, 1 );		// [ a.x, b.x, c.x, d.x ]
									return packed;
								}


								// Take a fltx4 containing fixed-point uints and

								// return them as single precision floats. No

								// fixed point conversion is done.

								FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
								{
									// unsigned int -> float conversion with an immediate of 0 (no fixed-point scaling)
									const fltx4 asFloats = __vcfux( vSrcA, 0 );
									return asFloats;
								}


								// Take a fltx4 containing fixed-point sints and

								// return them as single precision floats. No

								// fixed point conversion is done.

								FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
								{
									// signed int -> float conversion with an immediate of 0 (no fixed-point scaling)
									const fltx4 asFloats = __vcfsx( vSrcA, 0 );
									return asFloats;
								}


								// Take a fltx4 containing fixed-point uints and

								// return them as single precision floats. Each uint

								// will be divided by 2^immed after conversion

								// (eg, this is fixed point math).

								/* as if:

								   FORCEINLINE fltx4 UnsignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )

								   {

								   return __vcfux( vSrcA, uImmed );

								   }

								*/

								// Expands to __vcfux( vSrcA, uImmed ) with both arguments parenthesized.
								// NOTE(review): presumably a macro rather than a function so that uImmed
								// reaches the intrinsic as a compile-time immediate -- confirm.
								#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))


								// Take a fltx4 containing fixed-point sints and

								// return them as single precision floats. Each int

								// will be divided by 2^immed (eg, this is fixed point

								// math).

								/* as if:

								   FORCEINLINE fltx4 SignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )

								   {

								   return __vcfsx( vSrcA, uImmed );

								   }

								*/

								// Expands to __vcfsx( vSrcA, uImmed ) with both arguments parenthesized.
								// NOTE(review): presumably a macro rather than a function so that uImmed
								// reaches the intrinsic as a compile-time immediate -- confirm.
								#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))


								// set all components of a vector to a signed immediate int number.

								/* as if:

								   FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)

								   {

								   return __vspltisw( toImmediate );

								   }

								*/

								// Expands to __vspltisw(x). The argument is pasted unparenthesized, so pass
								// a plain literal.
								// NOTE(review): presumably a macro so that x stays a compile-time immediate -- confirm.
								#define IntSetImmediateSIMD(x) (__vspltisw(x))


								/*

								  works on fltx4's as if they are four uints.

								  the first parameter contains the words to be shifted,

								  the second contains the amount to shift by AS INTS


								  for i = 0 to 3

								  shift = vSrcB_i*32:(i*32)+4

								  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift

								*/

								FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
								{
									// each word of vSrcA is shifted left by the amount held in the matching word of vSrcB
									const fltx4 shifted = __vslw( vSrcA, vSrcB );
									return shifted;
								}


								// Returns float lane idx of a by value.
								FORCEINLINE float SubFloat( const fltx4 & a, int idx )
								{
									// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
									return ( (const fltx4_union &)a ).m128_f32[ idx ];
								}


								// Returns a writable reference to float lane idx of a.
								FORCEINLINE float & SubFloat( fltx4 & a, int idx )
								{
									return ( (fltx4_union &)a ).m128_f32[idx];
								}


								/// Set one component of a SIMD word with the given float value.

								/// This function is a template because the native implementation of

								/// this on PPC platforms requires that the component be given as a

								/// compiler immediate -- not a function parameter, not a const function

								/// parameter, not even a load from a const static array. It has to be

								/// a real immediate.

								/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.

								/// \note This function is not particularly performant on any platform (because of

								///       the load from float), so prefer a masked assign from a fltx4 wherever

								///       possible.

								template < unsigned int NCOMPONENT >
								FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
								{
									// vrlimi can only take an immediate opcode -- that is a constant
									// passed in from the compiler, not a function parameter, nor an
									// element loaded from an array, not even a const static array.
									//
									// The insert mask is selected from the template parameter:
									//   0 (x) -> 8, 1 (y) -> 4, 2 (z) -> 2, 3 (w) -> 1.
									// Any other NCOMPONENT yields 17, a meaningless immediate intended to
									// make the compiler angry.
									//
									// The macro is kept on a single line and wrapped in parentheses: a
									// backslash continuation followed by a blank line ends the definition
									// early, and an unparenthesized conditional chain can regroup with
									// neighbouring operators at the expansion site.
								#define SETCOMPONENTSIMD_MASK_IMMEDIATE ( ( NCOMPONENT == 0 ) ? 8 : ( NCOMPONENT == 1 ) ? 4 : ( NCOMPONENT == 2 ) ? 2 : ( NCOMPONENT == 3 ) ? 1 : 17 )

									// Broadcast the scalar, then insert only the selected component into a.
									fltx4 val = ReplicateX4( flValue );
									fltx4 result = __vrlimi(a, val, SETCOMPONENTSIMD_MASK_IMMEDIATE, 0);
									return result;

								#undef SETCOMPONENTSIMD_MASK_IMMEDIATE
								}


								// Converts a to unsigned integers with __vctuxs (immediate 0) and
								// returns lane idx of the converted vector.
								FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
								{
									fltx4 converted = __vctuxs( a, 0 );
									return ( (const fltx4_union &)converted ).m128_u32[idx];
								}


								// Returns lane idx of a by value, read as an unsigned 32-bit integer.
								FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
								{
									return ( (const fltx4_union &)a ).m128_u32[idx];
								}


								// Returns a writable reference to lane idx of a, viewed as an unsigned
								// 32-bit integer.
								FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
								{
									return ( (fltx4_union &)a ).m128_u32[idx];
								}