//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: Implementation of our SIMD functions for the 360.
//
//==========================================================================//

#ifndef DBG_H
#include "tier0/dbg.h"
#endif

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[idx];
}

FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[idx];
}

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	return __vaddfp( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
{
	return __vsubfp( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b
{
	return __vmulfp( a, b );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )	// a*b + c
{
	return __vmaddfp( a, b, c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )	// c - a*b
{
	return __vnmsubfp( a, b, c );
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum3fp( a, b );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum4fp( a, b );
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	return XMVectorSin( radians );
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
	cosine = XMVectorCos( radians );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	return XMVectorASin( sine );
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	return XMVectorACos( cs );
}

// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
	return XMVectorATan2( a, b );
}

// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
{
	return __vmaxfp( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
{
	return __vminfp( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
{
	return __vand( a, b );
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
{
	// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
	return __vandc( b, a );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b
{
	return __vxor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b
{
	return __vor( a, b );
}

FORCEINLINE fltx4 NegSIMD( const fltx4 &a )									// negate: -a
{
	return XMVectorNegate( a );
}

FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
{
	unsigned int equalFlags = 0;
	__vcmpeqfpR( a, Four_Zeros, &equalFlags );
	return XMComparisonAllTrue( equalFlags );
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a )								// any floats are zero?
{
	unsigned int conditionregister;
	XMVectorEqualR( &conditionregister, a, XMVectorZero() );
	return XMComparisonAnyTrue( conditionregister );
}
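// --- Example (editor's illustration, not part of the original header) ---
// MaddSIMD fuses a multiply and an add, so Horner's rule evaluates a cubic
// in one dependent chain of fused ops. A hypothetical sketch; the name and
// the coefficient layout are this example's own.
FORCEINLINE fltx4 ExamplePolyEval3SIMD( const fltx4 &x, const fltx4 &a, const fltx4 &b, const fltx4 &c, const fltx4 &d )
{
	fltx4 result = MaddSIMD( a, x, b );		// a*x + b
	result = MaddSIMD( result, x, c );		// (a*x + b)*x + c
	return MaddSIMD( result, x, d );		// ((a*x + b)*x + c)*x + d
}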
FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )								// are any of x,y,z zero?
{
	// copy a's x component into w, in case w was zero.
	fltx4 temp = __vrlimi( a, a, 1, 1 );
	unsigned int conditionregister;
	XMVectorEqualR( &conditionregister, temp, XMVectorZero() );
	return XMComparisonAnyTrue( conditionregister );
}

/// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

/// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterOrEqualR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

/// for branching when any of a.xyzw > b.xyzw
FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterR( &cr, a, b );
	return XMComparisonAnyTrue( cr );
}

/// for branching when any of a.xyzw >= b.xyzw
FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterOrEqualR( &cr, a, b );
	return XMComparisonAnyTrue( cr );
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
	unsigned int cr;
	XMVectorEqualR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set
{
	// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
	int nRet = 0;

	const fltx4_union & a_union = (const fltx4_union &)a;
	nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31;	// sign(x) -> bit 0
	nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30;	// sign(y) -> bit 1
	nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29;	// sign(z) -> bit 2
	nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28;	// sign(w) -> bit 3

	return nRet;
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
	return __vrlimi( a, __vzero(), 1, 0 );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
	// NOTE: this tests the top bits of each vector element using integer math
	// (so it ignores NaNs - it will return true for "-NaN")
	unsigned int equalFlags = 0;
	fltx4 signMask = __vspltisw( -1 );			// 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
	signMask = __vslw( signMask, signMask );	// 0x80000000 0x80000000 0x80000000 0x80000000
	__vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
	return !XMComparisonAllTrue( equalFlags );
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0
{
	return __vcmpeqfp( a, b );
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0
{
	return __vcmpgtfp( a, b );
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0
{
	return __vcmpgefp( a, b );
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
{
	return __vcmpgtfp( b, a );
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
{
	return __vcmpgefp( b, a );
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0
{
	return XMVectorInBounds( a, b );
}

FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )			// (a==b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpeqfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )			// (a>b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgtfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )			// (a>=b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgefp( a, b ) );
}

FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )			// (a<b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgtfp( b, a ) );
}

FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )			// (a<=b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgefp( b, a ) );
}

FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? 1.0 : 0
{
	return AndSIMD( Four_Ones, XMVectorInBounds( a, b ) );
}
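// --- Example (editor's illustration, not part of the original header) ---
// The Cmp01* variants return 1.0/0.0 rather than all-ones/all-zeros masks,
// so they compose directly with arithmetic. A hypothetical sketch that
// zeroes every lane of a that is not strictly positive:
FORCEINLINE fltx4 ExampleZeroNonPositiveLanesSIMD( const fltx4 &a )
{
	return MulSIMD( a, Cmp01GtSIMD( a, Four_Zeros ) );	// a * (a>0 ? 1.0 : 0.0) per lane
}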
// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	return __vsel( OldValue, NewValue, ReplacementMask );
}

// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
{
	// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	float * pValue = &flValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0 );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

FORCEINLINE fltx4 ReplicateX4( const float *pValue )			//  a,a,a,a
{
	Assert( pValue );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
	// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
	int * pValue = &nValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0 );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
	return __vrfip( a );
}

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
{
	return __vrfin( a );
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
	return __vrfim( a );
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less
{
	// This is emulated from rsqrt
	return XMVectorSqrtEst( a );
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a)
{
	// This is emulated from rsqrt
	return XMVectorSqrt( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less
{
	return __vrsqrtefp( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	// Convert zeros to epsilons
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	return ReciprocalSqrtEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a)
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocalSqrt( a );
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less
{
	return __vrefp( a );
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocal( a );
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )	// a/b
{
	return MulSIMD( ReciprocalSIMD( b ), a );
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
{
	return XMVectorExp( toPower );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max )
{
	return XMVectorClamp( in, min, max );
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	return XMLoadVector4( pSIMD );
}
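// --- Example (editor's illustration, not part of the original header) ---
// MaskedAssign plus a comparison mask gives branch-free selection. A
// hypothetical guarded divide: lanes where b is zero come back as zero
// instead of the infinity/NaN the reciprocal path would produce.
FORCEINLINE fltx4 ExampleSafeDivSIMD( const fltx4 &a, const fltx4 &b )
{
	fltx4 zeroMask = CmpEqSIMD( b, Four_Zeros );							// ~0 in lanes where b == 0
	fltx4 result = DivSIMD( a, MaskedAssign( zeroMask, Four_Ones, b ) );	// divide by 1.0 in those lanes
	return MaskedAssign( zeroMask, Four_Zeros, result );					// then force them to zero
}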
// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	return XMLoadVector3( pSIMD );
}

// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
	return __lvlx( pFlt, 0 );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
{
	return XMLoadVector4A( pSIMD );
}

FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
{
	return XMLoadVector4( pSIMD );
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector4( pSIMD, a );
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector3( pSIMD, a );
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &vSrc )
{
	fltx4 asInt = __vctsxs( vSrc, 0 );
	XMStoreVector4A( pDest->Base(), asInt );
}

FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
	xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
	x = xyzwMatrix.r[0];
	y = xyzwMatrix.r[1];
	z = xyzwMatrix.r[2];
	w = xyzwMatrix.r[3];
}

// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
	return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
	return XMVectorSplatOne();
}

FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
	return XMVectorSplatX( a );
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	return XMVectorSplatY( a );
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	return XMVectorSplatZ( a );
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	return XMVectorSplatW( a );
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	fltx4 result = __vrlimi( a, x, 8, 0 );
	return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	fltx4 result = __vrlimi( a, y, 4, 0 );
	return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	fltx4 result = __vrlimi( a, z, 2, 0 );
	return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	fltx4 result = __vrlimi( a, w, 1, 0 );
	return result;
}

FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}

FORCEINLINE fltx4 RotateRight( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 );
}

FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}
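// --- Example (editor's illustration, not part of the original header) ---
// Dot3SIMD replicates the 3-component dot product to all four lanes, so a
// 3-vector normalize is one dot, one reciprocal square root, one multiply.
// A hypothetical sketch with no guard against zero-length input.
FORCEINLINE fltx4 ExampleNormalize3SIMD( const fltx4 &v )
{
	fltx4 lenSq = Dot3SIMD( v, v );						// |v|^2 in every lane
	return MulSIMD( v, ReciprocalSqrtSIMD( lenSq ) );	// v / |v|
}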
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a;
	compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MinSIMD( a, compareOne );
	// retVal is [min(x,y), min(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8, 2 );
	// compareOne is [z, G, G, G]
	retval = MinSIMD( retval, compareOne );
	// retVal = [ min(min(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a;
	compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), max(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8, 2 );
	// compareOne is [z, G, G, G]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD )
{
	return XMLoadVector4A( pSIMD );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD )
{
	return XMLoadVector4( pSIMD );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	XMStoreVector4( pSIMD, a );
}

// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
	return XMLoadUShort4( reinterpret_cast< const XMUSHORT4 * >( pInts ) );
}

// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
{
	return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) );
}

// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
// TODO: make more efficient by doing this in a parallel way at the caller
//    Compress4SIMD(FourVectors.. )
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
{
	fltx4 abcd = __vrlimi( a, b, 4, 3 );	// a.x, b.x, a.z, a.w
	abcd = __vrlimi( abcd, c, 2, 2 );		// ax, bx, cx, aw
	abcd = __vrlimi( abcd, d, 1, 1 );		// ax, bx, cx, dx
	return abcd;
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfux( vSrcA, 0 );
}
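// --- Example (editor's illustration, not part of the original header) ---
// __vctsxs inside ConvertStoreAsIntsSIMD truncates toward zero, so pairing
// it with RoundSIMD gives a round-to-nearest integer store. Hypothetical
// helper name; assumes the intx4 destination is 16-byte aligned.
FORCEINLINE void ExampleRoundAndStoreIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &v )
{
	ConvertStoreAsIntsSIMD( pDest, RoundSIMD( v ) );	// round first, then truncating convert
}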
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfsx( vSrcA, 0 );
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
   FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
   {
       return __vcfux( vSrcA, uImmed );
   }
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
   FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
   {
       return __vcfsx( vSrcA, uImmed );
   }
*/
#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))

// set all components of a vector to a signed immediate int number.
/* as if:
   FORCEINLINE fltx4 IntSetImmediateSIMD( int toImmediate )
   {
       return __vspltisw( toImmediate );
   }
*/
#define IntSetImmediateSIMD(x) (__vspltisw(x))

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD( fltx4 vSrcA, fltx4 vSrcB )
{
	return __vslw( vSrcA, vSrcB );
}

FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_f32[ idx ];
}

FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[ idx ];
}
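// --- Example (editor's illustration, not part of the original header) ---
// The fixed-point macros need a literal shift count, since __vcfsx takes a
// compiler immediate. A hypothetical conversion from 8.24 fixed point:
FORCEINLINE fltx4 ExampleFixed824ToFloatSIMD( const i32x4 &vFixed )
{
	return SignedFixedIntConvertToFltSIMD( vFixed, 24 );	// each int / 2^24
}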
/// Set one component of a SIMD word with the given float value.
/// This function is a template because the native implementation of
/// this on PPC platforms requires that the component be given as a
/// compiler immediate -- not a function parameter, not a const function
/// parameter, not even a load from a const static array. It has to be
/// a real immediate.
/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
/// \note This function is not particularly performant on any platform (because of
///       the load from float), so prefer a masked assign from a fltx4 wherever
///       possible.
template < unsigned int NCOMPONENT >
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
{
	// vrlimi can only take an immediate opcode -- that is a constant
	// passed in from the compiler, not a function parameter, nor an
	// element loaded from an array, not even a const static array.
#define SETCOMPONENTSIMD_MASK_IMMEDIATE	( NCOMPONENT == 0 ) ? 8 :\
										( NCOMPONENT == 1 ) ? 4 :\
										( NCOMPONENT == 2 ) ? 2 :\
										( NCOMPONENT == 3 ) ? 1 :\
										17 //< a meaningless immediate intended to make the compiler angry

	fltx4 val = ReplicateX4( flValue );
	fltx4 result = __vrlimi( a, val, SETCOMPONENTSIMD_MASK_IMMEDIATE, 0 );
	return result;

#undef SETCOMPONENTSIMD_MASK_IMMEDIATE
}

FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	fltx4 t = __vctuxs( a, 0 );
	const fltx4_union & a_union = (const fltx4_union &)t;
	return a_union.m128_u32[ idx ];
}

FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_u32[ idx ];
}

FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[ idx ];
}
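// --- Example (editor's illustration, not part of the original header) ---
// The component index of SetComponentSIMD must be a template argument,
// because __vrlimi's mask is a compile-time immediate. A hypothetical use,
// subject to the load-from-float cost noted above:
FORCEINLINE fltx4 ExampleSetYSIMD( const fltx4 &a )
{
	return SetComponentSIMD< 1 >( a, 3.14159265f );		// replace y; x, z, w unchanged
}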