//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: Implementation of our SIMD functions for the 360.
//
//==========================================================================//

#ifndef DBG_H
#include "tier0/dbg.h"
#endif

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[idx];
}

FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[idx];
}

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	return __vaddfp( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )				// a-b
{
	return __vsubfp( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )				// a*b
{
	return __vmulfp( a, b );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )	// a*b + c
{
	return __vmaddfp( a, b, c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )	// c - a*b
{
	return __vnmsubfp( a, b, c );
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum3fp( a, b );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	return __vmsum4fp( a, b );
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	return XMVectorSin( radians );
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
	cosine = XMVectorCos( radians );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	return XMVectorASin( sine );
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	return XMVectorACos( cs );
}

// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
	return XMVectorATan2( a, b );
}

// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
{
	return __vmaxfp( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
{
	return __vminfp( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
{
	return __vand( a, b );
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
{
	// NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
	return __vandc( b, a );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )				// a ^ b
{
	return __vxor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )				// a | b
{
	return __vor( a, b );
}

FORCEINLINE fltx4 NegSIMD( const fltx4 &a )									// negate: -a
{
	return XMVectorNegate( a );
}

FORCEINLINE bool IsAllZeros( const fltx4 & a )								// all floats of a zero?
{
	unsigned int equalFlags = 0;
	__vcmpeqfpR( a, Four_Zeros, &equalFlags );
	return XMComparisonAllTrue( equalFlags );
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a )								// any floats are zero?
{
	unsigned int conditionregister;
	XMVectorEqualR( &conditionregister, a, XMVectorZero() );
	return XMComparisonAnyTrue( conditionregister );
}
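// --- Example (editor's illustration, not part of the original header) ---
// MaddSIMD fuses a multiply and an add, so Horner's rule evaluates a cubic
// in one dependent chain of fused ops. A hypothetical sketch; the name and
// the coefficient layout are this example's own.
FORCEINLINE fltx4 ExamplePolyEval3SIMD( const fltx4 &x, const fltx4 &a, const fltx4 &b, const fltx4 &c, const fltx4 &d )
{
	fltx4 result = MaddSIMD( a, x, b );		// a*x + b
	result = MaddSIMD( result, x, c );		// (a*x + b)*x + c
	return MaddSIMD( result, x, d );		// ((a*x + b)*x + c)*x + d
}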
FORCEINLINE bool IsAnyXYZZero( const fltx4 &a )								// are any of x,y,z zero?
{
	// copy a's x component into w, in case w was zero.
	fltx4 temp = __vrlimi( a, a, 1, 1 );
	unsigned int conditionregister;
	XMVectorEqualR( &conditionregister, temp, XMVectorZero() );
	return XMComparisonAnyTrue( conditionregister );
}

/// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

/// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterOrEqualR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

/// for branching when any of a.xyzw > b.xyzw
FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterR( &cr, a, b );
	return XMComparisonAnyTrue( cr );
}

/// for branching when any of a.xyzw >= b.xyzw
FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	unsigned int cr;
	XMVectorGreaterOrEqualR( &cr, a, b );
	return XMComparisonAnyTrue( cr );
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
	unsigned int cr;
	XMVectorEqualR( &cr, a, b );
	return XMComparisonAllTrue( cr );
}

FORCEINLINE int TestSignSIMD( const fltx4 & a )								// mask of which floats have the high bit set
{
	// NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
	int nRet = 0;

	const fltx4_union & a_union = (const fltx4_union &)a;
	nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31;	// sign(x) -> bit 0
	nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30;	// sign(y) -> bit 1
	nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29;	// sign(z) -> bit 2
	nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28;	// sign(w) -> bit 3

	return nRet;
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
	return __vrlimi( a, __vzero(), 1, 0 );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a )							// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
	// NOTE: this tests the top bits of each vector element using integer math
	// (so it ignores NaNs - it will return true for "-NaN")
	unsigned int equalFlags = 0;
	fltx4 signMask = __vspltisw( -1 );			// 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
	signMask = __vslw( signMask, signMask );	// 0x80000000 0x80000000 0x80000000 0x80000000
	__vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
	return !XMComparisonAllTrue( equalFlags );
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )				// (a==b) ? ~0:0
{
	return __vcmpeqfp( a, b );
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )				// (a>b) ? ~0:0
{
	return __vcmpgtfp( a, b );
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )				// (a>=b) ? ~0:0
{
	return __vcmpgefp( a, b );
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )				// (a<b) ? ~0:0
{
	return __vcmpgtfp( b, a );
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )				// (a<=b) ? ~0:0
{
	return __vcmpgefp( b, a );
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? ~0 : 0
{
	return XMVectorInBounds( a, b );
}

FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )			// (a==b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpeqfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )			// (a>b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgtfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )			// (a>=b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgefp( a, b ) );
}

FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )			// (a<b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgtfp( b, a ) );
}

FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )			// (a<=b) ? 1.0:0
{
	return AndSIMD( Four_Ones, __vcmpgefp( b, a ) );
}

FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <= b && a >= -b) ? 1.0 : 0
{
	return AndSIMD( Four_Ones, XMVectorInBounds( a, b ) );
}
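// --- Example (editor's illustration, not part of the original header) ---
// The Cmp01* variants return 1.0/0.0 rather than all-ones/all-zeros masks,
// so they compose directly with arithmetic. A hypothetical sketch that
// zeroes every lane of a that is not strictly positive:
FORCEINLINE fltx4 ExampleZeroNonPositiveLanesSIMD( const fltx4 &a )
{
	return MulSIMD( a, Cmp01GtSIMD( a, Four_Zeros ) );	// a * (a>0 ? 1.0 : 0.0) per lane
}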
// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	return __vsel( OldValue, NewValue, ReplacementMask );
}

// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue )					//  a,a,a,a
{
	// NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	float * pValue = &flValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0 );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

FORCEINLINE fltx4 ReplicateX4( const float *pValue )			//  a,a,a,a
{
	Assert( pValue );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
	// NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
	int * pValue = &nValue;
	Assert( pValue );
	Assert( ((unsigned int)pValue & 3) == 0 );
	return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
	return __vrfip( a );
}

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
{
	return __vrfin( a );
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
	return __vrfim( a );
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )				// sqrt(a), more or less
{
	// This is emulated from rsqrt
	return XMVectorSqrtEst( a );
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )					// sqrt(a)
{
	// This is emulated from rsqrt
	return XMVectorSqrt( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )		// 1/sqrt(a), more or less
{
	return __vrsqrtefp( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	// Convert zeros to epsilons
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	return ReciprocalSqrtEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )			// 1/sqrt(a)
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocalSqrt( a );
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )			// 1/a, more or less
{
	return __vrefp( a );
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )				// 1/a
{
	// This uses Newton-Raphson to improve the HW result
	return XMVectorReciprocal( a );
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )	// a/b
{
	return MulSIMD( ReciprocalSIMD( b ), a );
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
{
	return XMVectorExp( toPower );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max )
{
	return XMVectorClamp( in, min, max );
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	return XMLoadVector4( pSIMD );
}
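// --- Example (editor's illustration, not part of the original header) ---
// MaskedAssign plus a comparison mask gives branch-free selection. A
// hypothetical guarded divide: lanes where b is zero come back as zero
// instead of the infinity/NaN the reciprocal path would produce.
FORCEINLINE fltx4 ExampleSafeDivSIMD( const fltx4 &a, const fltx4 &b )
{
	fltx4 zeroMask = CmpEqSIMD( b, Four_Zeros );							// ~0 in lanes where b == 0
	fltx4 result = DivSIMD( a, MaskedAssign( zeroMask, Four_Ones, b ) );	// divide by 1.0 in those lanes
	return MaskedAssign( zeroMask, Four_Zeros, result );					// then force them to zero
}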
// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	return XMLoadVector3( pSIMD );
}

// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
	return __lvlx( pFlt, 0 );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
{
	return XMLoadVector4A( pSIMD );
}

FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
{
	return XMLoadVector4( pSIMD );
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector4( pSIMD, a );
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	XMStoreVector3( pSIMD, a );
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &vSrc )
{
	fltx4 asInt = __vctsxs( vSrc, 0 );
	XMStoreVector4A( pDest->Base(), asInt );
}

FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
	xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
	x = xyzwMatrix.r[0];
	y = xyzwMatrix.r[1];
	z = xyzwMatrix.r[2];
	w = xyzwMatrix.r[3];
}

// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
	return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
	return XMVectorSplatOne();
}

FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
	return XMVectorSplatX( a );
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	return XMVectorSplatY( a );
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	return XMVectorSplatZ( a );
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	return XMVectorSplatW( a );
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	fltx4 result = __vrlimi( a, x, 8, 0 );
	return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	fltx4 result = __vrlimi( a, y, 4, 0 );
	return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	fltx4 result = __vrlimi( a, z, 2, 0 );
	return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	fltx4 result = __vrlimi( a, w, 1, 0 );
	return result;
}

FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}

FORCEINLINE fltx4 RotateRight( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 );
}

FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
{
	fltx4 compareOne = a;
	return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}
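// --- Example (editor's illustration, not part of the original header) ---
// Dot3SIMD replicates the 3-component dot product to all four lanes, so a
// 3-vector normalize is one dot, one reciprocal square root, one multiply.
// A hypothetical sketch with no guard against zero-length input.
FORCEINLINE fltx4 ExampleNormalize3SIMD( const fltx4 &v )
{
	fltx4 lenSq = Dot3SIMD( v, v );						// |v|^2 in every lane
	return MulSIMD( v, ReciprocalSqrtSIMD( lenSq ) );	// v / |v|
}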
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a;
	compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MinSIMD( a, compareOne );
	// retVal is [min(x,y), min(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8, 2 );
	// compareOne is [z, G, G, G]
	retval = MinSIMD( retval, compareOne );
	// retVal = [ min(min(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = a;
	compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
	// compareOne is [y,z,G,G]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), max(y,z), G, G]
	compareOne = __vrlimi( compareOne, a, 8, 2 );
	// compareOne is [z, G, G, G]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z), G, G, G ]

	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD )
{
	return XMLoadVector4A( pSIMD );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD )
{
	return XMLoadVector4( pSIMD );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	*( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	XMStoreVector4( pSIMD, a );
}

// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
	return XMLoadUShort4( reinterpret_cast< const XMUSHORT4 * >( pInts ) );
}

// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
{
	return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) );
}

// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
// TODO: make more efficient by doing this in a parallel way at the caller
//    Compress4SIMD(FourVectors.. )
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
{
	fltx4 abcd = __vrlimi( a, b, 4, 3 );	// a.x, b.x, a.z, a.w
	abcd = __vrlimi( abcd, c, 2, 2 );		// ax, bx, cx, aw
	abcd = __vrlimi( abcd, d, 1, 1 );		// ax, bx, cx, dx
	return abcd;
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfux( vSrcA, 0 );
}
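// --- Example (editor's illustration, not part of the original header) ---
// __vctsxs inside ConvertStoreAsIntsSIMD truncates toward zero, so pairing
// it with RoundSIMD gives a round-to-nearest integer store. Hypothetical
// helper name; assumes the intx4 destination is 16-byte aligned.
FORCEINLINE void ExampleRoundAndStoreIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &v )
{
	ConvertStoreAsIntsSIMD( pDest, RoundSIMD( v ) );	// round first, then truncating convert
}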
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return __vcfsx( vSrcA, 0 );
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
   FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
   {
       return __vcfux( vSrcA, uImmed );
   }
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
   FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
   {
       return __vcfsx( vSrcA, uImmed );
   }
*/
#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))

// set all components of a vector to a signed immediate int number.
/* as if:
   FORCEINLINE fltx4 IntSetImmediateSIMD( int toImmediate )
   {
       return __vspltisw( toImmediate );
   }
*/
#define IntSetImmediateSIMD(x) (__vspltisw(x))

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD( fltx4 vSrcA, fltx4 vSrcB )
{
	return __vslw( vSrcA, vSrcB );
}

FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_f32[ idx ];
}

FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[ idx ];
}
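// --- Example (editor's illustration, not part of the original header) ---
// The fixed-point macros need a literal shift count, since __vcfsx takes a
// compiler immediate. A hypothetical conversion from 8.24 fixed point:
FORCEINLINE fltx4 ExampleFixed824ToFloatSIMD( const i32x4 &vFixed )
{
	return SignedFixedIntConvertToFltSIMD( vFixed, 24 );	// each int / 2^24
}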
/// Set one component of a SIMD word with the given float value.
/// This function is a template because the native implementation of
/// this on PPC platforms requires that the component be given as a
/// compiler immediate -- not a function parameter, not a const function
/// parameter, not even a load from a const static array. It has to be
/// a real immediate.
/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
/// \note This function is not particularly performant on any platform (because of
///       the load from float), so prefer a masked assign from a fltx4 wherever
///       possible.
template < unsigned int NCOMPONENT >
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
{
	// vrlimi can only take an immediate opcode -- that is a constant
	// passed in from the compiler, not a function parameter, nor an
	// element loaded from an array, not even a const static array.
#define SETCOMPONENTSIMD_MASK_IMMEDIATE	( NCOMPONENT == 0 ) ? 8 :\
										( NCOMPONENT == 1 ) ? 4 :\
										( NCOMPONENT == 2 ) ? 2 :\
										( NCOMPONENT == 3 ) ? 1 :\
										17 //< a meaningless immediate intended to make the compiler angry

	fltx4 val = ReplicateX4( flValue );
	fltx4 result = __vrlimi( a, val, SETCOMPONENTSIMD_MASK_IMMEDIATE, 0 );
	return result;

#undef SETCOMPONENTSIMD_MASK_IMMEDIATE
}

FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	fltx4 t = __vctuxs( a, 0 );
	const fltx4_union & a_union = (const fltx4_union &)t;
	return a_union.m128_u32[ idx ];
}

FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_u32[ idx ];
}

FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[ idx ];
}
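// --- Example (editor's illustration, not part of the original header) ---
// The component index of SetComponentSIMD must be a template argument,
// because __vrlimi's mask is a compile-time immediate. A hypothetical use,
// subject to the load-from-float cost noted above:
FORCEINLINE fltx4 ExampleSetYSIMD( const fltx4 &a )
{
	return SetComponentSIMD< 1 >( a, 3.14159265f );		// replace y; x, z, w unchanged
}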