You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
921 lines
30 KiB
921 lines
30 KiB
//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
|
|
//
|
|
// Purpose: Implementation of our SIMD function using generic c++ code and a struct. This
|
|
// implementation will not be especially fast, but gets us up fast on new platforms and also acts
|
|
// as an easy-to-understand reference implementation.
|
|
//
|
|
//==============================================================//
|
|
|
|
|
|
//---------------------------------------------------------------------
|
|
// Standard C (fallback/new platform) implementation (only there for compat - slow)
|
|
//---------------------------------------------------------------------
|
|
|
|
/// Read one float component of a SIMD word, selected by idx, by value.
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	const float flLane = a.m128_f32[ idx ];
	return flLane;
}
|
|
|
|
/// Writable access to one float component of a SIMD word, selected by idx.
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	float &flLane = a.m128_f32[ idx ];
	return flLane;
}
|
|
|
|
/// Read one component of a SIMD word, selected by idx, as a 32-bit unsigned integer.
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	const uint32 nLane = a.m128_u32[ idx ];
	return nLane;
}
|
|
|
|
/// Writable access to one component of a SIMD word, selected by idx, as a 32-bit unsigned integer.
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	uint32 &nLane = a.m128_u32[ idx ];
	return nLane;
}
|
|
|
|
// Return one in the fastest way -- on the x360, faster even than loading.
|
|
FORCEINLINE fltx4 LoadZeroSIMD( void )
|
|
{
|
|
return Four_Zeros;
|
|
}
|
|
|
|
// Return one in the fastest way -- on the x360, faster even than loading.
|
|
FORCEINLINE fltx4 LoadOneSIMD( void )
|
|
{
|
|
return Four_Ones;
|
|
}
|
|
|
|
/// Broadcast component 0 (x) of a into all four components of the result.
FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
{
	const float flX = SubFloat( a, 0 );
	fltx4 vSplat;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSplat, i ) = flX;
	}
	return vSplat;
}
|
|
|
|
/// Broadcast component 1 (y) of a into all four components of the result.
FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
	const float flY = SubFloat( a, 1 );
	fltx4 vSplat;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSplat, i ) = flY;
	}
	return vSplat;
}
|
|
|
|
/// Broadcast component 2 (z) of a into all four components of the result.
FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
	const float flZ = SubFloat( a, 2 );
	fltx4 vSplat;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSplat, i ) = flZ;
	}
	return vSplat;
}
|
|
|
|
/// Broadcast component 3 (w) of a into all four components of the result.
FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
	const float flW = SubFloat( a, 3 );
	fltx4 vSplat;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSplat, i ) = flW;
	}
	return vSplat;
}
|
|
|
|
/// Return a copy of a whose component 0 is replaced by component 0 of x.
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	const float flNewX = SubFloat( x, 0 );
	fltx4 vOut = a;
	SubFloat( vOut, 0 ) = flNewX;
	return vOut;
}
|
|
|
|
/// Return a copy of a whose component 1 is replaced by component 1 of y.
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	const float flNewY = SubFloat( y, 1 );
	fltx4 vOut = a;
	SubFloat( vOut, 1 ) = flNewY;
	return vOut;
}
|
|
|
|
/// Return a copy of a whose component 2 is replaced by component 2 of z.
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	const float flNewZ = SubFloat( z, 2 );
	fltx4 vOut = a;
	SubFloat( vOut, 2 ) = flNewZ;
	return vOut;
}
|
|
|
|
/// Return a copy of a whose component 3 is replaced by component 3 of w.
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	const float flNewW = SubFloat( w, 3 );
	fltx4 vOut = a;
	SubFloat( vOut, 3 ) = flNewW;
	return vOut;
}
|
|
|
|
/// Set one component of a SIMD word with the given float value.
|
|
/// This function is a template because the native implementation of
|
|
/// this on PPC platforms requires that the component be given as a
|
|
/// compiler immediate -- not a function parameter, not a const function
|
|
/// parameter, not even a load from a const static array. It has to be
|
|
/// a real immediate.
|
|
/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
|
|
/// \note This function is not particularly performant on any platform (because of
|
|
/// the load from float), so prefer a masked assign from a fltx4 wherever
|
|
/// possible.
|
|
template < unsigned int NCOMPONENT >
|
|
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
|
|
{
|
|
fltx4 result = a;
|
|
SubFloat( result, NCOMPONENT ) = flValue;
|
|
return result;
|
|
}
|
|
|
|
|
|
// a b c d -> b c d a
|
|
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = SubFloat( a, 1 );
|
|
SubFloat( retVal, 1 ) = SubFloat( a, 2 );
|
|
SubFloat( retVal, 2 ) = SubFloat( a, 3 );
|
|
SubFloat( retVal, 3 ) = SubFloat( a, 0 );
|
|
return retVal;
|
|
}
|
|
|
|
// a b c d -> c d a b
|
|
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = SubFloat( a, 2 );
|
|
SubFloat( retVal, 1 ) = SubFloat( a, 3 );
|
|
SubFloat( retVal, 2 ) = SubFloat( a, 0 );
|
|
SubFloat( retVal, 3 ) = SubFloat( a, 1 );
|
|
return retVal;
|
|
}
|
|
|
|
// Function body for a component-wise float binary operator. Expects the
// enclosing function to have parameters named a and b; computes (a op b) for
// each of the four components and returns the result.
#define BINOP(op)																		\
	fltx4 retVal;																		\
	for ( int nLane = 0; nLane < 4; ++nLane )											\
	{																					\
		SubFloat( retVal, nLane ) = ( SubFloat( a, nLane ) op SubFloat( b, nLane ) );	\
	}																					\
	return retVal;
|
|
|
|
// Function body for a component-wise binary operator on the 32-bit integer
// view of each component. Expects the enclosing function to have parameters
// named a and b; computes (a op b) for each of the four components and returns
// the result.
#define IBINOP(op)																	\
	fltx4 retVal;																	\
	for ( int nLane = 0; nLane < 4; ++nLane )										\
	{																				\
		SubInt( retVal, nLane ) = ( SubInt( a, nLane ) op SubInt ( b, nLane ) );	\
	}																				\
	return retVal;
|
|
|
|
/// Component-wise sum: a+b
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vSum;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSum, i ) = ( SubFloat( a, i ) + SubFloat( b, i ) );
	}
	return vSum;
}
|
|
|
|
/// Component-wise difference: a-b
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vDiff;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vDiff, i ) = ( SubFloat( a, i ) - SubFloat( b, i ) );
	}
	return vDiff;
}
|
|
|
|
/// Component-wise product: a*b
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vProduct;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vProduct, i ) = ( SubFloat( a, i ) * SubFloat( b, i ) );
	}
	return vProduct;
}
|
|
|
|
/// Component-wise quotient: a/b
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vQuotient;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vQuotient, i ) = ( SubFloat( a, i ) / SubFloat( b, i ) );
	}
	return vQuotient;
}
|
|
|
|
|
|
/// Multiply-add: a*b + c, built from MulSIMD followed by AddSIMD.
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 vProduct = MulSIMD( a, b );
	return AddSIMD( vProduct, c );
}
|
|
|
|
/// Multiply-subtract: c - a*b, built from MulSIMD followed by SubSIMD.
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )
{
	const fltx4 vProduct = MulSIMD( a, b );
	return SubSIMD( c, vProduct );
}
|
|
|
|
|
|
/// Component-wise sine of four angles given in radians.
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	fltx4 vSine;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSine, i ) = sin( SubFloat( radians, i ) );
	}
	return vSine;
}
|
|
|
|
/// Sine and cosine of the x, y and z components of radians via SinCos.
/// Only components 0..2 of sine and cosine are written; component 3 is left as it was.
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	for ( int i = 0; i < 3; ++i )
	{
		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
	}
}
|
|
|
|
/// Sine and cosine of all four components of radians via SinCos,
/// written into the matching components of sine and cosine.
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	for ( int i = 0; i < 4; ++i )
	{
		SinCos( SubFloat( radians, i ), &SubFloat( sine, i ), &SubFloat( cosine, i ) );
	}
}
|
|
|
|
/// Component-wise arcsine.
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	fltx4 vAngle;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vAngle, i ) = asin( SubFloat( sine, i ) );
	}
	return vAngle;
}
|
|
|
|
/// Component-wise arccosine.
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	fltx4 vAngle;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vAngle, i ) = acos( SubFloat( cs, i ) );
	}
	return vAngle;
}
|
|
|
|
// tan^1(a/b) .. ie, pass sin in as a and cos in as b
|
|
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
|
|
{
|
|
fltx4 result;
|
|
SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
|
|
SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
|
|
SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
|
|
SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
|
|
return result;
|
|
}
|
|
|
|
/// Component-wise maximum: max(a,b)
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vLarger;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vLarger, i ) = max( SubFloat( a, i ), SubFloat( b, i ) );
	}
	return vLarger;
}
|
|
|
|
/// Component-wise minimum: min(a,b)
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vSmaller;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSmaller, i ) = min( SubFloat( a, i ), SubFloat( b, i ) );
	}
	return vSmaller;
}
|
|
|
|
/// Bitwise AND of the 32-bit integer view of each component: a & b
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vBits;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vBits, i ) = ( SubInt( a, i ) & SubInt( b, i ) );
	}
	return vBits;
}
|
|
|
|
/// Bitwise AND of b with the complement of a, per component: ~a & b
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vBits;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vBits, i ) = ~SubInt( a, i ) & SubInt( b, i );
	}
	return vBits;
}
|
|
|
|
/// Bitwise XOR of the 32-bit integer view of each component: a ^ b
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vBits;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vBits, i ) = ( SubInt( a, i ) ^ SubInt( b, i ) );
	}
	return vBits;
}
|
|
|
|
/// Bitwise OR of the 32-bit integer view of each component: a | b
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vBits;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vBits, i ) = ( SubInt( a, i ) | SubInt( b, i ) );
	}
	return vBits;
}
|
|
|
|
/// Component-wise negation: -a
FORCEINLINE fltx4 NegSIMD(const fltx4 &a)
{
	fltx4 vNegated;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vNegated, i ) = -SubFloat( a, i );
	}
	return vNegated;
}
|
|
|
|
/// True only when every float component of a compares equal to zero.
FORCEINLINE bool IsAllZeros( const fltx4 & a )
{
	for ( int i = 0; i < 4; ++i )
	{
		// bail out on the first component that does not compare equal to zero
		if ( !( SubFloat( a, i ) == 0.0 ) )
		{
			return false;
		}
	}
	return true;
}
|
|
|
|
|
|
// for branching when a.xyzw > b.xyzw
|
|
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
|
|
{
|
|
return SubFloat(a,0) > SubFloat(b,0) &&
|
|
SubFloat(a,1) > SubFloat(b,1) &&
|
|
SubFloat(a,2) > SubFloat(b,2) &&
|
|
SubFloat(a,3) > SubFloat(b,3);
|
|
}
|
|
|
|
// for branching when a.xyzw >= b.xyzw
|
|
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
|
|
{
|
|
return SubFloat(a,0) >= SubFloat(b,0) &&
|
|
SubFloat(a,1) >= SubFloat(b,1) &&
|
|
SubFloat(a,2) >= SubFloat(b,2) &&
|
|
SubFloat(a,3) >= SubFloat(b,3);
|
|
}
|
|
|
|
// For branching if all a.xyzw == b.xyzw
|
|
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
|
|
{
|
|
return SubFloat(a,0) == SubFloat(b,0) &&
|
|
SubFloat(a,1) == SubFloat(b,1) &&
|
|
SubFloat(a,2) == SubFloat(b,2) &&
|
|
SubFloat(a,3) == SubFloat(b,3);
|
|
}
|
|
|
|
// For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w
|
|
FORCEINLINE bool IsAnyEqual( const fltx4 & a, const fltx4 & b )
|
|
{
|
|
return SubFloat(a,0) == SubFloat(b,0) ||
|
|
SubFloat(a,1) == SubFloat(b,1) ||
|
|
SubFloat(a,2) == SubFloat(b,2) ||
|
|
SubFloat(a,3) == SubFloat(b,3);
|
|
}
|
|
|
|
/// Build a 4-bit mask of which components have their high (sign) bit set:
/// sign(x) -> bit 0, sign(y) -> bit 1, sign(z) -> bit 2, sign(w) -> bit 3.
FORCEINLINE int TestSignSIMD( const fltx4 & a )
{
	int nSignMask = 0;
	for ( int i = 0; i < 4; ++i )
	{
		// isolate bit 31 of component i and move it down to bit i
		nSignMask |= ( SubInt( a, i ) & 0x80000000 ) >> ( 31 - i );
	}
	return nSignMask;
}
|
|
|
|
/// (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0), decided from the
/// sign-bit mask produced by TestSignSIMD.
FORCEINLINE bool IsAnyNegative( const fltx4 & a )
{
	const int nSignMask = TestSignSIMD( a );
	return ( nSignMask != 0 );
}
|
|
|
|
/// Per-component integer mask: (a==b) ? ~0:0
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) == SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
/// Per-component integer mask: (a>b) ? ~0:0
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) > SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
/// Per-component integer mask: (a>=b) ? ~0:0
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) >= SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
/// Per-component integer mask: (a<b) ? ~0:0
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) < SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
/// Per-component integer mask: (a<=b) ? ~0:0
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
/// Per-component integer mask: (a <= b && a >= -b) ? ~0 : 0
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vMask;
	for ( int i = 0; i < 4; ++i )
	{
		SubInt( vMask, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) && SubFloat( a, i ) >= -SubFloat( b, i ) ) ? ~0 : 0;
	}
	return vMask;
}
|
|
|
|
///\name Functions which perform comparisons, resulting in a float value of either 0.0 or 1.0 (as opposed to resulting in a 32-bit integer mask ).
|
|
///@{
|
|
/// Per-component float result: (a==b) ? 1.0:0
FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) == SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
/// Per-component float result: (a>b) ? 1.0:0
FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) > SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
/// Per-component float result: (a>=b) ? 1.0:0
FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) >= SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
/// Per-component float result: (a<b) ? 1.0:0
FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) < SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
/// Per-component float result: (a<=b) ? 1.0:0
FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
/// Per-component float result: (a <= b && a >= -b) ? 1.0 : 0
FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b )
{
	fltx4 vFlags;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vFlags, i ) = ( SubFloat( a, i ) <= SubFloat( b, i ) && SubFloat( a, i ) >= -SubFloat( b, i ) ) ? 1.0 : 0;
	}
	return vFlags;
}
|
|
|
|
|
|
//@}
|
|
|
|
|
|
|
|
// simd conditional. for example, a simd version of "( x > 0 ) ? a : b" would be expressed as
|
|
// "MaskedAssign( CmpGtSIMD( x, Four_Zeros ), a, b )". A typical use is to conditionally update
|
|
// subfiles of a fltx4 based upon some test.
|
|
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
|
|
{
|
|
return OrSIMD(
|
|
AndSIMD( ReplacementMask, NewValue ),
|
|
AndNotSIMD( ReplacementMask, OldValue ) );
|
|
}
|
|
|
|
/// Build a SIMD word with flValue in all four components: a,a,a,a
FORCEINLINE fltx4 ReplicateX4( float flValue )
{
	fltx4 vSplat;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vSplat, i ) = flValue;
	}
	return vSplat;
}
|
|
|
|
/// replicate a single 32 bit integer value to all 4 components of an m128
|
|
FORCEINLINE fltx4 ReplicateIX4( int nValue )
|
|
{
|
|
fltx4 retVal;
|
|
SubInt( retVal, 0 ) = nValue;
|
|
SubInt( retVal, 1 ) = nValue;
|
|
SubInt( retVal, 2 ) = nValue;
|
|
SubInt( retVal, 3 ) = nValue;
|
|
return retVal;
|
|
|
|
}
|
|
|
|
// Round towards positive infinity
|
|
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
|
|
SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
|
|
SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
|
|
SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
|
|
return retVal;
|
|
|
|
}
|
|
|
|
// Round towards negative infinity
|
|
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
|
|
SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
|
|
SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
|
|
SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
|
|
return retVal;
|
|
|
|
}
|
|
|
|
/// sqrt(a), more or less. This generic version calls sqrt on each component.
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )
{
	fltx4 vRoot;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vRoot, i ) = sqrt( SubFloat( a, i ) );
	}
	return vRoot;
}
|
|
|
|
/// Component-wise sqrt(a)
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )
{
	fltx4 vRoot;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vRoot, i ) = sqrt( SubFloat( a, i ) );
	}
	return vRoot;
}
|
|
|
|
/// 1/sqrt(a), more or less. This generic version computes 1.0 / sqrt per component.
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )
{
	fltx4 vInvRoot;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vInvRoot, i ) = 1.0 / sqrt( SubFloat( a, i ) );
	}
	return vInvRoot;
}
|
|
|
|
/// 1/sqrt(a) per component, where a component equal to zero is replaced by
/// FLT_EPSILON before the square root is taken.
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	fltx4 vInvRoot;
	for ( int i = 0; i < 4; ++i )
	{
		const float flIn = SubFloat( a, i );
		SubFloat( vInvRoot, i ) = 1.0 / sqrt( flIn != 0.0f ? flIn : FLT_EPSILON );
	}
	return vInvRoot;
}
|
|
|
|
/// Component-wise 1/sqrt(a)
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a )
{
	fltx4 vInvRoot;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vInvRoot, i ) = 1.0 / sqrt( SubFloat( a, i ) );
	}
	return vInvRoot;
}
|
|
|
|
/// 1/a, more or less. This generic version performs a plain division per component.
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )
{
	fltx4 vInverse;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vInverse, i ) = 1.0 / SubFloat( a, i );
	}
	return vInverse;
}
|
|
|
|
/// Component-wise 1/a
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a )
{
	fltx4 vInverse;
	for ( int i = 0; i < 4; ++i )
	{
		SubFloat( vInverse, i ) = 1.0 / SubFloat( a, i );
	}
	return vInverse;
}
|
|
|
|
/// 1/x for all 4 values.
|
|
/// 1/0 will result in a big but NOT infinite result
|
|
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
|
|
SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
|
|
SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
|
|
SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
|
|
return retVal;
|
|
}
|
|
|
|
/// 1/a per component, where a component equal to zero is replaced by
/// FLT_EPSILON before dividing.
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
	fltx4 vInverse;
	for ( int i = 0; i < 4; ++i )
	{
		const float flIn = SubFloat( a, i );
		SubFloat( vInverse, i ) = 1.0 / ( flIn == 0.0f ? FLT_EPSILON : flIn );
	}
	return vInverse;
}
|
|
|
|
/// 2^x for all values (the antilog)
|
|
FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
|
|
{
|
|
fltx4 retVal;
|
|
SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
|
|
SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
|
|
SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
|
|
SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
|
|
|
|
return retVal;
|
|
}
|
|
|
|
/// horizontal 3d dotproduct
|
|
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
|
|
{
|
|
float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
|
|
SubFloat( a, 1 ) * SubFloat( b, 1 ) +
|
|
SubFloat( a, 2 ) * SubFloat( b, 2 );
|
|
return ReplicateX4( flDot );
|
|
}
|
|
|
|
/// horizontal 4d dotproduct
|
|
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
|
|
{
|
|
float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
|
|
SubFloat( a, 1 ) * SubFloat( b, 1 ) +
|
|
SubFloat( a, 2 ) * SubFloat( b, 2 ) +
|
|
SubFloat( a, 3 ) * SubFloat( b, 3 );
|
|
return ReplicateX4( flDot );
|
|
}
|
|
|
|
/// Clamps the components of a vector to a specified minimum and maximum range.
|
|
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
|
|
{
|
|
return MaxSIMD( min, MinSIMD( max, in ) );
|
|
}
|
|
|
|
/// Squelch the w component of a vector to +0.0. Most efficient when you say a = SetWToZeroSIMD(a)
|
|
/// (avoids a copy)
|
|
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
|
|
{
|
|
fltx4 retval;
|
|
retval = a;
|
|
SubFloat( retval, 0 ) = 0;
|
|
return retval;
|
|
}
|
|
|
|
/// Load a SIMD word from pSIMD by reading it as a fltx4.
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	const fltx4 *pSource = reinterpret_cast< const fltx4 * >( pSIMD );
	return *pSource;
}
|
|
|
|
/// Load a SIMD word from pSIMD by reading it as a fltx4.
/// NOTE(review): despite the "3" in the name, a whole fltx4 is read, so the
/// source must have a full fltx4 worth of readable memory -- confirm callers.
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	const fltx4 *pSource = reinterpret_cast< const fltx4 * >( pSIMD );
	return *pSource;
}
|
|
|
|
/// load a single unaligned float into the x component of a SIMD word
|
|
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
|
|
{
|
|
fltx4 retval;
|
|
SubFloat( retval, 0 ) = *pFlt;
|
|
return retval;
|
|
}
|
|
|
|
/// Load a SIMD word from pSIMD by reading it as a fltx4.
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	const fltx4 *pSource = reinterpret_cast< const fltx4 * >( pSIMD );
	return *pSource;
}
|
|
|
|
/// for the transitional class -- load a 3-by VectorAligned and squash its w component
|
|
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
|
|
{
|
|
fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
|
|
// squelch w
|
|
SubInt( retval, 3 ) = 0;
|
|
return retval;
|
|
}
|
|
|
|
/// Store the SIMD word a to pSIMD by writing it as a whole fltx4.
FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
	fltx4 *pTarget = reinterpret_cast< fltx4 * >( pSIMD );
	*pTarget = a;
}
|
|
|
|
/// Store the SIMD word a to pSIMD by writing it as a whole fltx4.
FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
	fltx4 *pTarget = reinterpret_cast< fltx4 * >( pSIMD );
	*pTarget = a;
}
|
|
|
|
/// Store only the x, y and z components of a into pSIMD[0..2]; w is not written.
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	for ( int i = 0; i < 3; ++i )
	{
		pSIMD[i] = SubFloat( a, i );
	}
}
|
|
|
|
/// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
|
|
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
|
|
{
|
|
StoreAlignedSIMD(pSIMD->Base(),a);
|
|
}
|
|
|
|
/// Store the x,y,z components of the four FLTX4 parameters
|
|
// into the four consecutive Vectors pDestination[0], pDestination[1], pDestination[2],
|
|
// pDestination[3] The Vectors are assumed
|
|
/// to be unaligned.
|
|
FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
|
|
Vector * const pDestination )
|
|
{
|
|
StoreUnaligned3SIMD( pDestination->Base(), a );
|
|
StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
|
|
StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
|
|
StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
|
|
}
|
|
|
|
// Store the x,y,z components of the four FLTX4 parameters
|
|
// into the four consecutive Vectors:
|
|
// pDestination , pDestination + 1, pDestination + 2, pDestination + 3
|
|
// The Vectors are assumed to start on an ALIGNED address, that is,
|
|
// pDestination is 16-byte aligned (thhough obviously pDestination+1 is not).
|
|
FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
|
|
Vector * const pDestination )
|
|
{
|
|
StoreUnaligned3SIMD( pDestination->Base(), a );
|
|
StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
|
|
StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
|
|
StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
|
|
}
|
|
|
|
|
|
/// Transpose, in place, the 4x4 matrix whose rows are x, y, z and w.
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
// Exchange component _ia_ of _a_ with component _ib_ of _b_.
#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ )					\
	{														\
		float flHeld = SubFloat( _a_, _ia_ );				\
		SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ );		\
		SubFloat( _b_, _ib_ ) = flHeld;						\
	}

	// Swap each element above the diagonal with its mirror image below it.
	SWAP_FLOATS( x, 1, y, 0 );
	SWAP_FLOATS( x, 2, z, 0 );
	SWAP_FLOATS( x, 3, w, 0 );
	SWAP_FLOATS( y, 2, z, 1 );
	SWAP_FLOATS( y, 3, w, 1 );
	SWAP_FLOATS( z, 3, w, 2 );
}
|
|
|
|
/// find the lowest component of a.x, a.y, a.z, and replicate it to the whole return value.
|
|
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
|
|
{
|
|
float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
|
|
return ReplicateX4(lowest);
|
|
}
|
|
|
|
/// find the highest component of a.x, a.y, a.z, and replicate it to the whole return value.
|
|
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
|
|
{
|
|
float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
|
|
return ReplicateX4(highest);
|
|
}
|
|
|
|
/// Fixed-point conversion and save as SIGNED INTS. pDest->x = Int (vSrc.x) note: some
|
|
/// architectures have means of doing fixed point conversion when the fix depth is specified as an
|
|
/// immediate.. but there is no way to guarantee an immediate as a parameter to function like this.
|
|
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
|
|
{
|
|
(*pDest)[0] = SubFloat(vSrc, 0);
|
|
(*pDest)[1] = SubFloat(vSrc, 1);
|
|
(*pDest)[2] = SubFloat(vSrc, 2);
|
|
(*pDest)[3] = SubFloat(vSrc, 3);
|
|
}
|
|
|
|
///@group INTEGER SIMD OPERATIONS {
|
|
|
|
/// splat all components of a vector to a signed immediate int number.
|
|
FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
|
|
{
|
|
fltx4 retval;
|
|
SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue;
|
|
return retval;
|
|
}
|
|
|
|
/// Load 4 aligned words into a SIMD register
|
|
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
|
|
{
|
|
return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
|
|
}
|
|
|
|
/// Load 4 unaligned words into a SIMD register
|
|
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
|
|
{
|
|
return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
|
|
}
|
|
|
|
/// save into four words, 16-byte aligned
|
|
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
|
|
{
|
|
*( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
|
|
}
|
|
|
|
/// Save a into an intx4 by writing through its Base() pointer as an i32x4.
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	i32x4 *pTarget = reinterpret_cast< i32x4 * >( pSIMD.Base() );
	*pTarget = a;
}
|
|
|
|
/// Save a into four words at pSIMD (writes through pSIMD as an i32x4).
FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
	i32x4 *pTarget = reinterpret_cast< i32x4 * >( pSIMD );
	*pTarget = a;
}
|
|
|
|
/// Load four consecutive uint16's, and turn them into floating point numbers. This function isn't
|
|
/// especially fast and could be made faster if anyone is using it heavily.
|
|
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
|
|
{
|
|
fltx4 retval;
|
|
SubFloat( retval, 0 ) = pInts[0];
|
|
SubFloat( retval, 1 ) = pInts[1];
|
|
SubFloat( retval, 2 ) = pInts[2];
|
|
SubFloat( retval, 3 ) = pInts[3];
|
|
}
|
|
|
|
|
|
/// Take a fltx4 containing fixed-point uints and return them as single precision floats. No fixed
|
|
/// point conversion is done.
|
|
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
|
|
{
|
|
Assert(0); /* pc has no such operation */
|
|
fltx4 retval;
|
|
SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
|
|
SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
|
|
SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
|
|
SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
|
|
return retval;
|
|
}
|
|
|
|
|
|
#if 0 /* pc has no such op */
|
|
// Take a fltx4 containing fixed-point sints and
|
|
// return them as single precision floats. No
|
|
// fixed point conversion is done.
|
|
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
|
|
{
|
|
fltx4 retval;
|
|
SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) );
|
|
SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) );
|
|
SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) );
|
|
SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) );
|
|
return retval;
|
|
}
|
|
|
|
|
|
/// works on fltx4's as if they are four uints. the first parameter contains the words to be
|
|
/// shifted, the second contains the amount to shift by AS INTS
|
|
///
|
|
/// for i = 0 to 3
|
|
/// shift = vSrcB_i*32:(i*32)+4
|
|
/// vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
|
|
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
|
|
{
|
|
i32x4 retval;
|
|
SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
|
|
SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
|
|
SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
|
|
SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
|
|
|
|
|
|
return retval;
|
|
}
|
|
//@}
|