//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEMATH_H
#define SSEMATH_H

#if defined( _X360 )
#include <xboxmath.h>
#elif defined ( _PS3 )
#include <vectormath/c/vectormath_aos.h>
#include <vectormath/c/vectormath_aos_v.h>
#else
#include <xmmintrin.h>
#ifndef _LINUX
#include <emmintrin.h>
#endif
#endif

#ifndef SPU
#include "mathlib/vector.h"
#include "mathlib/mathlib.h"
#else
#include "mathlib/math_pfns.h"
#endif

#include "mathlib/fltx4.h"

// The FLTX4 type is a fltx4 used as a parameter to a function.
// On the 360, the best way to do this is pass-by-copy on the registers.
// On the PC, the best way is to pass by const reference.
// The compiler will sometimes, but not always, replace a pass-by-const-ref
// with a pass-in-reg on the 360; to avoid this confusion, you can
// explicitly use a FLTX4 as the parameter type.
#ifdef _X360
typedef __vector4 FLTX4;
#elif defined( _PS3 )
typedef vec_float4 FLTX4;
#else
typedef const fltx4 & FLTX4;
#endif

// A 16-byte aligned int32 datastructure
// (for use when writing out fltx4's as SIGNED
// ints).
struct ALIGN16 intx4
{
    int32 m_i32[4];

    inline int & operator[](int which)
    {
        return m_i32[which];
    }

    inline const int & operator[](int which) const
    {
        return m_i32[which];
    }

    inline int32 *Base()
    {
        return m_i32;
    }

    inline const int32 *Base() const
    {
        return m_i32;
    }

    inline bool operator==(const intx4 &other) const
    {
        return m_i32[0] == other.m_i32[0] &&
            m_i32[1] == other.m_i32[1] &&
            m_i32[2] == other.m_i32[2] &&
            m_i32[3] == other.m_i32[3];
    }
} ALIGN16_POST;


#if defined( _DEBUG ) && defined( _X360 )
FORCEINLINE void TestVPUFlags()
{
    // Check that the VPU is in the appropriate (Java-compliant) mode
    // (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
    __vector4 a;
    __asm
    {
        mfvscr a;
    }
    unsigned int * flags       = (unsigned int *)&a;
    unsigned int   controlWord = flags[3];
    Assert(controlWord == 0);
}
#else  // _DEBUG
FORCEINLINE void TestVPUFlags() {}
#endif // _DEBUG


// useful constants in SIMD packed float format:
// (note: some of these aren't stored on the 360,
// but are manufactured directly in one or two
// instructions, saving a load and possible L2
// miss.)
#ifdef _X360
// Shouldn't the PS3 have something similar?
#define Four_Zeros XMVectorZero()           // 0 0 0 0
#define Four_Ones XMVectorSplatOne()        // 1 1 1 1
extern const fltx4 Four_Twos;               // 2 2 2 2
extern const fltx4 Four_Threes;             // 3 3 3 3
extern const fltx4 Four_Fours;              // guess.
extern const fltx4 Four_Point225s;          // .225 .225 .225 .225
extern const fltx4 Four_PointFives;         // .5 .5 .5 .5
extern const fltx4 Four_Thirds;             // 1/3
extern const fltx4 Four_TwoThirds;          // 2/3
extern const fltx4 Four_NegativeOnes;       // -1 -1 -1 -1
extern const fltx4 Four_DegToRad;           // (float)(M_PI_F / 180.f) times four
#elif defined(SPU)
#define Four_Zeros spu_splats( 0.0f )       // 0 0 0 0
#define Four_Ones spu_splats( 1.0f )        // 1 1 1 1
#define Four_Twos spu_splats( 2.0f )        // 2 2 2 2
#define Four_Threes spu_splats( 3.0f )      // 3 3 3 3
#define Four_Fours spu_splats( 4.0f )       // guess.
#define Four_Point225s spu_splats( 0.225f ) // .225 .225 .225 .225
#define Four_PointFives spu_splats( 0.5f )  // .5 .5 .5 .5
#define Four_Thirds spu_splats( 0.33333333f )    // 1/3
#define Four_TwoThirds spu_splats( 0.66666666f ) // 2/3
#define Four_NegativeOnes spu_splats( -1.0f )    // -1 -1 -1 -1
#define Four_DegToRad spu_splats((float)(M_PI_F / 180.f))
#else
extern const fltx4 Four_Zeros;              // 0 0 0 0
extern const fltx4 Four_Ones;               // 1 1 1 1
extern const fltx4 Four_Twos;               // 2 2 2 2
extern const fltx4 Four_Threes;             // 3 3 3 3
extern const fltx4 Four_Fours;              // guess.
extern const fltx4 Four_Point225s;          // .225 .225 .225 .225
extern const fltx4 Four_PointFives;         // .5 .5 .5 .5
extern const fltx4 Four_Thirds;             // 1/3
extern const fltx4 Four_TwoThirds;          // 2/3
extern const fltx4 Four_NegativeOnes;       // -1 -1 -1 -1
extern const fltx4 Four_DegToRad;           // (float)(M_PI_F / 180.f) times four
#endif

extern const fltx4 Four_Epsilons;           // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s;          // (1<<21)..
extern const fltx4 Four_2ToThe22s;          // (1<<22)..
extern const fltx4 Four_2ToThe23s;          // (1<<23)..
extern const fltx4 Four_2ToThe24s;          // (1<<24)..
extern const fltx4 Four_Origin;             // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_FLT_MAX;            // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
extern const fltx4 Four_Negative_FLT_MAX;   // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
extern const fltx4 g_SIMD_0123;             // 0 1 2 3 as float

// coefficients for polynomial approximation of srgb conversions

// 4th order polynomial for x^(1/2.2), x in 0..1
extern const fltx4 Four_LinearToGammaCoefficients_A;    // *x^4
extern const fltx4 Four_LinearToGammaCoefficients_B;    // *x^3
extern const fltx4 Four_LinearToGammaCoefficients_C;    // *x^2
extern const fltx4 Four_LinearToGammaCoefficients_D;    // *x^1
extern const fltx4 Four_LinearToGammaCoefficients_E;    // *x^0

// 3rd order polynomial for x^2.2, x in 0..1
extern const fltx4 Four_GammaToLinearCoefficients_A;    // *x^3
extern const fltx4 Four_GammaToLinearCoefficients_B;    // *x^2
extern const fltx4 Four_GammaToLinearCoefficients_C;    // *x^1
extern const fltx4 Four_GammaToLinearCoefficients_D;    // *x^0

// external aligned integer constants
#ifndef ALIGN16_POST
#define ALIGN16_POST
#endif
extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST;    // 0x7fffffff x 4
extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST;          // 0x80000000 x 4
extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST;           // 0xfffffffe x 4
extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST;       // -1 -1 -1 0
extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST;       // ~0,~0,~0,~0
extern const fltx4 g_SIMD_Identity[4];                              // [1 0 0 0], [0 1 0 0], [0 0 1 0], [0 0 0 1]
extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST;     // 0xffff x 4

// this mask is used for skipping the tail of things. If you have N elements in an array, and wish
// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;

extern const int32 ALIGN16 g_SIMD_EveryOtherMask[];                 // 0, ~0, 0, ~0
// Define prefetch macros.
// The characteristics of cache and prefetch are completely
// different between the different platforms, so you DO NOT
// want to just define one macro that maps to every platform
// intrinsic under the hood -- you need to prefetch at different
// intervals between x86 and PPC, for example, and that is
// a higher level code change.
// On the other hand, I'm tired of typing #ifdef _X360
// all over the place, so this is just a nop on Intel, PS3.
#ifdef PLATFORM_PPC
#if defined(_X360)
#define PREFETCH360(address, offset) __dcbt(offset,address)
#elif defined(_PS3)
#define PREFETCH360(address, offset) __dcbt( reinterpret_cast< const char * >(address) + offset )
#else
#error Prefetch not defined for this platform!
#endif
#else
#define PREFETCH360(x,y) // nothing
#endif

// Here's a handy function to align a pointer to the next
// sixteen byte boundary -- it'll round it up to the nearest
// multiple of 16. This is useful if you're subdividing
// big swaths of allocated memory, but in that case, remember
// to leave yourself the necessary slack in the allocation.
template <class T>
inline T *AlignPointer(void * ptr)
{
#if defined( __clang__ )
    uintp temp = (uintp)ptr;
#else
    unsigned temp = (unsigned)ptr;
#endif
    temp = ALIGN_VALUE(temp, sizeof(T));
    return (T *)temp;
}

#ifdef _PS3

// Note that similar defines exist in math_pfns.h
// Maybe we should consolidate in one place for all platforms.

#define _VEC_CLEAR_SIGNMASK (__vector unsigned int) {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}
#define _VEC_SIGNMASK       (__vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
#define _VEC_LSBMASK        (__vector unsigned int) { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }
#define _VEC_CLEAR_WMASK    (__vector unsigned int) {0xffffffff, 0xffffffff, 0xffffffff, 0}
#define _VEC_COMPONENT_MASK_0 (__vector unsigned int) {0xffffffff, 0, 0, 0}
#define _VEC_COMPONENT_MASK_1 (__vector unsigned int) {0, 0xffffffff, 0, 0}
#define _VEC_COMPONENT_MASK_2 (__vector unsigned int) {0, 0, 0xffffffff, 0}
#define _VEC_COMPONENT_MASK_3 (__vector unsigned int) {0, 0, 0, 0xffffffff}

#define _VEC_SWIZZLE_WZYX (__vector unsigned char) { 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 }
#define _VEC_SWIZZLE_ZWXY (__vector unsigned char) { 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 }
#define _VEC_SWIZZLE_YXWZ (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b }

#define _VEC_ZERO   (__vector unsigned int) {0,0,0,0}

#define _VEC_FLTMAX (__vector float) {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}
#define _VEC_FLTMIN (__vector float) {FLT_MIN,FLT_MIN,FLT_MIN,FLT_MIN}

#define _VEC_ORIGIN (__vector unsigned int) { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }

#endif

#if USE_STDC_FOR_SIMD

//---------------------------------------------------------------------
// Standard C (fallback/Linux) implementation (only there for compat - slow)
//---------------------------------------------------------------------

FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
    return a.m128_f32[ idx ];
}

FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
    return a.m128_f32[idx];
}

FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
    return a.m128_u32[idx];
}

FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
    return a.m128_u32[idx];
}
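
// --- Illustrative sketch (not part of the original API; the helper name is
// hypothetical): because the non-const SubFloat and SubInt return references,
// individual lanes can be written directly.
FORCEINLINE fltx4 BuildFltx4_Example( float x, float y, float z, float w )
{
    fltx4 v;
    SubFloat( v, 0 ) = x;   // write each lane through the reference accessor
    SubFloat( v, 1 ) = y;
    SubFloat( v, 2 ) = z;
    SubFloat( v, 3 ) = w;
    return v;
}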
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
    return Four_Zeros;
}

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
    return Four_Ones;
}

FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = SubFloat( a, 0 );
    SubFloat( retVal, 2 ) = SubFloat( a, 0 );
    SubFloat( retVal, 3 ) = SubFloat( a, 0 );
    return retVal;
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 1 );
    SubFloat( retVal, 1 ) = SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = SubFloat( a, 1 );
    SubFloat( retVal, 3 ) = SubFloat( a, 1 );
    return retVal;
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 2 );
    SubFloat( retVal, 1 ) = SubFloat( a, 2 );
    SubFloat( retVal, 2 ) = SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = SubFloat( a, 2 );
    return retVal;
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 3 );
    SubFloat( retVal, 1 ) = SubFloat( a, 3 );
    SubFloat( retVal, 2 ) = SubFloat( a, 3 );
    SubFloat( retVal, 3 ) = SubFloat( a, 3 );
    return retVal;
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
    fltx4 result = a;
    SubFloat( result, 0 ) = SubFloat( x, 0 );
    return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
    fltx4 result = a;
    SubFloat( result, 1 ) = SubFloat( y, 1 );
    return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
    fltx4 result = a;
    SubFloat( result, 2 ) = SubFloat( z, 2 );
    return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
    fltx4 result = a;
    SubFloat( result, 3 ) = SubFloat( w, 3 );
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
    fltx4 result = a;
    SubFloat( result, nComponent ) = flValue;
    return result;
}

// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 1 );
    SubFloat( retVal, 1 ) = SubFloat( a, 2 );
    SubFloat( retVal, 2 ) = SubFloat( a, 3 );
    SubFloat( retVal, 3 ) = SubFloat( a, 0 );
    return retVal;
}

// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 2 );
    SubFloat( retVal, 1 ) = SubFloat( a, 3 );
    SubFloat( retVal, 2 ) = SubFloat( a, 0 );
    SubFloat( retVal, 3 ) = SubFloat( a, 1 );
    return retVal;
}

#define BINOP(op)                                                       \
    fltx4 retVal;                                                       \
    SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) );   \
    SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) );   \
    SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) );   \
    SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) );   \
    return retVal;

#define IBINOP(op)                                                      \
    fltx4 retVal;                                                       \
    SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) );        \
    SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) );        \
    SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) );        \
    SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) );        \
    return retVal;

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )   // a+b
{
    BINOP(+);
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )   // a-b
{
    BINOP(-);
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )   // a*b
{
    BINOP(*);
}

FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )   // a/b
{
    BINOP(/);
}

FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b )    // a/b
{
    BINOP(/);
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )     // a*b + c
{
    return AddSIMD( MulSIMD(a,b), c );
}
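
// --- Illustrative sketch (hypothetical helper, not in the original header):
// MaddSIMD chains into Horner evaluation of a polynomial, which is the natural
// way to consume coefficient constants like the 4th-order linear-to-gamma fit
// declared above: (((A*x + B)*x + C)*x + D)*x + E.
FORCEINLINE fltx4 Poly4Eval_Example( FLTX4 x, FLTX4 A, FLTX4 B, FLTX4 C, FLTX4 D, FLTX4 E )
{
    fltx4 r = MaddSIMD( A, x, B );  // A*x + B
    r = MaddSIMD( r, x, C );        // (A*x + B)*x + C
    r = MaddSIMD( r, x, D );
    r = MaddSIMD( r, x, E );
    return r;
}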
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )     // c - a*b
{
    return SubSIMD( c, MulSIMD(a,b) );
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
    fltx4 result;
    SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
    SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
    SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
    SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
    return result;
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
    SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
    SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
    SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
    SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
    SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    fltx4 result;
    SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
    SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
    SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
    SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
    return result;
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    fltx4 result;
    SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
    SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
    SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
    SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
    return result;
}
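
// --- Illustrative sketch (hypothetical): rotate four 2D points by four angles
// at once with SinCosSIMD.
FORCEINLINE void Rotate2D_Example( fltx4 &x, fltx4 &y, const fltx4 &radians )
{
    fltx4 s, c;
    SinCosSIMD( s, c, radians );
    fltx4 xNew = SubSIMD( MulSIMD( x, c ), MulSIMD( y, s ) );   // x*cos - y*sin
    y = AddSIMD( MulSIMD( x, s ), MulSIMD( y, c ) );            // x*sin + y*cos
    x = xNew;
}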
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    fltx4 result;
    SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return result;
}

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )   // max(a,b)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )   // min(a,b)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )   // a & b
{
    IBINOP(&);
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )    // ~a & b
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 );
    SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 );
    SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 );
    SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 );
    return retVal;
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )   // a ^ b
{
    IBINOP(^);
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )    // a | b
{
    IBINOP(|);
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
    fltx4 retval;
    SubFloat( retval, 0 ) = -SubFloat( a, 0 );
    SubFloat( retval, 1 ) = -SubFloat( a, 1 );
    SubFloat( retval, 2 ) = -SubFloat( a, 2 );
    SubFloat( retval, 3 ) = -SubFloat( a, 3 );
    return retval;
}
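
// --- Illustrative sketch (hypothetical): absolute value via AndNotSIMD.
// Clearing the sign bit is the idiom behind g_SIMD_clear_signmask above; the
// mask is built with SubInt here so the example only uses functions already
// defined at this point.
FORCEINLINE fltx4 Abs_Example( const fltx4 & a )
{
    fltx4 signMask;
    SubInt( signMask, 0 ) = 0x80000000;
    SubInt( signMask, 1 ) = 0x80000000;
    SubInt( signMask, 2 ) = 0x80000000;
    SubInt( signMask, 3 ) = 0x80000000;
    return AndNotSIMD( signMask, a );   // ~signMask & a == a with sign bits cleared
}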
FORCEINLINE bool IsAllZeros( const fltx4 & a )  // all floats of a zero?
{
    return ( SubFloat( a, 0 ) == 0.0 ) &&
        ( SubFloat( a, 1 ) == 0.0 ) &&
        ( SubFloat( a, 2 ) == 0.0 ) &&
        ( SubFloat( a, 3 ) == 0.0 );
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    return SubFloat(a,0) > SubFloat(b,0) &&
        SubFloat(a,1) > SubFloat(b,1) &&
        SubFloat(a,2) > SubFloat(b,2) &&
        SubFloat(a,3) > SubFloat(b,3);
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    return SubFloat(a,0) >= SubFloat(b,0) &&
        SubFloat(a,1) >= SubFloat(b,1) &&
        SubFloat(a,2) >= SubFloat(b,2) &&
        SubFloat(a,3) >= SubFloat(b,3);
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
    return SubFloat(a,0) == SubFloat(b,0) &&
        SubFloat(a,1) == SubFloat(b,1) &&
        SubFloat(a,2) == SubFloat(b,2) &&
        SubFloat(a,3) == SubFloat(b,3);
}

// For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w
FORCEINLINE bool IsAnyEqual( const fltx4 & a, const fltx4 & b )
{
    return SubFloat(a,0) == SubFloat(b,0) ||
        SubFloat(a,1) == SubFloat(b,1) ||
        SubFloat(a,2) == SubFloat(b,2) ||
        SubFloat(a,3) == SubFloat(b,3);
}

FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    int nRet = 0;
    nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3
    return nRet;
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a )   // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD( a ));
}

FORCEINLINE bool IsAnyTrue( const fltx4 & a )   // is any lane of the mask set (high bit)?
{
    return (0 != TestSignSIMD( a ));
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )   // (a <= b && a >= -b) ? ~0 : 0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0;
    return retVal;
}
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return OrSIMD(
        AndSIMD( ReplacementMask, NewValue ),
        AndNotSIMD( ReplacementMask, OldValue ) );
}

FORCEINLINE fltx4 ReplicateX4( float flValue )  // a,a,a,a
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = flValue;
    SubFloat( retVal, 1 ) = flValue;
    SubFloat( retVal, 2 ) = flValue;
    SubFloat( retVal, 3 ) = flValue;
    return retVal;
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = nValue;
    SubInt( retVal, 1 ) = nValue;
    SubInt( retVal, 2 ) = nValue;
    SubInt( retVal, 3 ) = nValue;
    return retVal;
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
    return retVal;
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )    // sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )   // sqrt(a)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )  // 1/sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
    return retVal;
}
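
// --- Illustrative sketch (hypothetical): the Cmp*SIMD functions return
// all-ones/all-zero lane masks that pair with MaskedAssign for branchless
// selection. This clamps each lane of v to at most 'limit' (MinSIMD computes
// the same thing; the point here is the mask idiom).
FORCEINLINE fltx4 ClampAbove_Example( const fltx4 & v, const fltx4 & limit )
{
    fltx4 tooBig = CmpGtSIMD( v, limit );       // ~0 in lanes where v > limit
    return MaskedAssign( tooBig, limit, v );    // take limit there, v elsewhere
}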
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )  // 1/a, more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
    return retVal;
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
    SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
    SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
    SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
    SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
    SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
    SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
    return retVal;
}

// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
    SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
    SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
    SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
    return retVal;
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
    float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
        SubFloat( a, 1 ) * SubFloat( b, 1 ) +
        SubFloat( a, 2 ) * SubFloat( b, 2 );
    return ReplicateX4( flDot );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
    float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
        SubFloat( a, 1 ) * SubFloat( b, 1 ) +
        SubFloat( a, 2 ) * SubFloat( b, 2 ) +
        SubFloat( a, 3 ) * SubFloat( b, 3 );
    return ReplicateX4( flDot );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
    return MaxSIMD( min, MinSIMD( max, in ) );
}
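
// --- Illustrative sketch (hypothetical): normalize the xyz part of a vector
// with Dot3SIMD and ReciprocalSqrtSIMD. No zero-length guard; substitute
// ReciprocalSqrtEstSaturateSIMD if the input can be zero.
FORCEINLINE fltx4 Normalize3_Example( const fltx4 & v )
{
    return MulSIMD( v, ReciprocalSqrtSIMD( Dot3SIMD( v, v ) ) );
}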
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    fltx4 retval;
    retval = a;
    SubFloat( retval, 3 ) = 0;
    return retval;
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = *pFlt;
    return retval;
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
{
    fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
    // squelch w
    SubInt( retval, 3 ) = 0;
    return retval;
}

// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
{
    fltx4 retval = { x, y, z, w };
    return retval;
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedFloat( float *pSingleFloat, const fltx4 & a )
{
    *pSingleFloat = SubFloat( a, 0 );
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    *pSIMD     = SubFloat(a, 0);
    *(pSIMD+1) = SubFloat(a, 1);
    *(pSIMD+2) = SubFloat(a, 2);
}

// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
    StoreAlignedSIMD(pSIMD->Base(),a);
}

// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
//    pDestination[0],  pDestination[1],  pDestination[2],  pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
    Vector * const pDestination )
{
    StoreUnaligned3SIMD( pDestination->Base(), a );
    StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
    StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
    StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
}
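
// --- Illustrative sketch (hypothetical): masking the tail of an array with
// g_SIMD_SkipTailMask, per the comment at its declaration. Assumes it is safe
// to read a full 16 bytes at the last chunk (pad your allocations).
FORCEINLINE fltx4 LoadTail_Example( const float *pLastChunk, int nCount )
{
    fltx4 v = LoadUnalignedSIMD( pLastChunk );  // may read lanes past the logical end
    fltx4 mask = LoadAlignedSIMD( (const void *)g_SIMD_SkipTailMask[ nCount & 3 ] );
    return AndSIMD( mask, v );                  // squelch the tail lanes to zero
}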
// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
//    pDestination ,  pDestination + 1,  pDestination + 2,  pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
    Vector * const pDestination )
{
    StoreUnaligned3SIMD( pDestination->Base(), a );
    StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
    StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
    StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
}

FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
    SWAP_FLOATS( x, 1, y, 0 );
    SWAP_FLOATS( x, 2, z, 0 );
    SWAP_FLOATS( x, 3, w, 0 );
    SWAP_FLOATS( y, 2, z, 1 );
    SWAP_FLOATS( y, 3, w, 1 );
    SWAP_FLOATS( z, 3, w, 2 );
}

// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
    float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
    return ReplicateX4(lowest);
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
    float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
    return ReplicateX4(highest);
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
    (*pDest)[0] = SubFloat(vSrc, 0);
    (*pDest)[1] = SubFloat(vSrc, 1);
    (*pDest)[2] = SubFloat(vSrc, 2);
    (*pDest)[3] = SubFloat(vSrc, 3);
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
{
    fltx4 retval;
    SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue;
    return retval;
}

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
{
    return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
{
    return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = pInts[0];
    SubFloat( retval, 1 ) = pInts[1];
    SubFloat( retval, 2 ) = pInts[2];
    SubFloat( retval, 3 ) = pInts[3];
    return retval;
}
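
// --- Illustrative sketch (hypothetical): the "structure of arrays" idiom this
// header is named for. Load four xyzw points (array-of-structures layout) and
// transpose into xxxx/yyyy/zzzz/wwww registers for lane-parallel math.
FORCEINLINE void LoadSoA_Example( const float *pXYZW,   // 16 floats: x0 y0 z0 w0 x1 ...
    fltx4 &x, fltx4 &y, fltx4 &z, fltx4 &w )
{
    x = LoadUnalignedSIMD( pXYZW + 0 );
    y = LoadUnalignedSIMD( pXYZW + 4 );
    z = LoadUnalignedSIMD( pXYZW + 8 );
    w = LoadUnalignedSIMD( pXYZW + 12 );
    TransposeSIMD( x, y, z, w );    // rows become columns: x = (x0 x1 x2 x3), etc.
}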
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
{
    Assert(0);          /* pc has no such operation */
    fltx4 retval;
    SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
    SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
    SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
    SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
    return retval;
}

#if 0               /* pc has no such op */
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = ( (float) *(reinterpret_cast<const int32 *>(&vSrcA.m128_s32[0])) );
    SubFloat( retval, 1 ) = ( (float) *(reinterpret_cast<const int32 *>(&vSrcA.m128_s32[1])) );
    SubFloat( retval, 2 ) = ( (float) *(reinterpret_cast<const int32 *>(&vSrcA.m128_s32[2])) );
    SubFloat( retval, 3 ) = ( (float) *(reinterpret_cast<const int32 *>(&vSrcA.m128_s32[3])) );
    return retval;
}

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
  shift = vSrcB_i*32:(i*32)+4
  vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
{
    i32x4 retval;
    SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
    SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
    SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
    SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
    return retval;
}
#endif

#elif ( defined( _PS3 ) )

#define SN_IMPROVED_INTRINSICS ( (( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )) ||\
    (defined(__SN_VER__) && (__SN_VER__ > 25002)) )

//---------------------------------------------------------------------
// PS3 implementation
//---------------------------------------------------------------------

FORCEINLINE float FloatSIMD( fltx4 & a, int idx )
{
#if SN_IMPROVED_INTRINSICS
    return vec_extract(a,idx);
#else
    fltx4_union a_union;
    vec_st(a, 0, &a_union.vmxf);
    return a_union.m128_f32[idx];
#endif
}

FORCEINLINE unsigned int UIntSIMD( u32x4 & a, int idx )
{
#if SN_IMPROVED_INTRINSICS
    return vec_extract(a,idx);
#else
    fltx4_union a_union;
    vec_st(a, 0, &a_union.vmxui);
    return a_union.m128_u32[idx];
#endif
}

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_add( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b )   // a-b
{
    return vec_sub( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b )   // a*b
{
    return vec_madd( a, b, _VEC_ZEROF );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )     // a*b + c
{
    return vec_madd( a, b, c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c )     // c - a*b
{
    return vec_nmsub( a, b, c );
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4& a, const fltx4& b)
{
    // oliviern: it seems that this code could be optimized
    //  (or maybe the latency will slow down if there is nothing to put in between)
    // Something like that (to verify on PS3 and SPU):
    //   result2 = vec_madd(a, b, _VEC_ZEROF);                      // a0 * b0, a1 * b1, a2 * b2, a3 * b3
    //   result = vec_add(vec_sld(result2, result2, 4), result2);   // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
    //   result = vec_add(vec_sld(result2, result2, 8), result);    // (a0 * b0) + (a1 * b1) + (a2 * b2), (a1 * b1) + (a2 * b2) + (a3 * b3), (a2 * b2) + (a3 * b3) + (a0 * b0), (a3 * b3) + (a0 * b0) + ...
    //   result = vec_splat(result, 0);                             // DotProduct3...
    // 6 SIMD instructions instead of 8 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
    // It may still be a bit faster in the worst case.
    fltx4 result;
    result = vec_madd( a, b, _VEC_ZEROF );
    result = vec_madd( vec_sld(a,a,4), vec_sld(b,b,4), result );
    result = vec_madd( vec_sld(a,a,8), vec_sld(b,b,8), result );
    // replicate across all
    result = vec_splat(result,0);
    return result;
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4& a, const fltx4& b)
{
    // See comment in Dot3SIMD, we could reduce to 6 SIMD instructions instead of 7 (but again with potentially one more latency).
    //   result = vec_madd(a, b, _VEC_ZEROF);                       // a0 * b0, a1 * b1, a2 * b2, a3 * b3
    //   result = vec_add(vec_sld(result, result, 4), result);      // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
    //   result = vec_add(vec_sld(result, result, 8), result);      // (a0 * b0) + (a1 * b1) + (a2 * b2) + (a3 * b3), ...
    //   result = vec_splat(result, 0);                             // DotProduct4...
    // 6 SIMD instructions instead of 7 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
    // It may be a wash in the worst case.
    fltx4 result;
    result = vec_madd( a, b, _VEC_ZEROF );
    result = vec_madd( vec_sld(a,a,4), vec_sld(b,b,4), result );
    result = vec_add( vec_sld(result,result,8), result );
    // replicate across all
    result = vec_splat(result,0);
    return result;
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
    return sinf4( radians );
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    sincosf4( radians, &sine, &cosine );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    sincosf4( radians, &sine, &cosine );
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    return acosf4( cs );
}

FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    return atan2f4( a, b );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    return asinf4( sine );
}

// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )   // max(a,b)
{
    return vec_max( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )   // min(a,b)
{
    return vec_min( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )   // a & b
{
    return vec_and( a, b );
}

FORCEINLINE fltx4 AndSIMD( const bi32x4 & a, const fltx4 & b )  // a & b
{
    return vec_and( (fltx4)a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const bi32x4 & b )  // a & b
{
    return vec_and( a, (fltx4)b );
}

FORCEINLINE bi32x4 AndSIMD( const bi32x4 & a, const bi32x4 & b )    // a & b
{
    return vec_and( a, b );
}

#if 0
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )    // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return vec_andc( b, a);
}

FORCEINLINE fltx4 AndNotSIMD( const bi32x4 & a, const fltx4 & b )   // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return vec_andc( b, (fltx4)a);
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const bi32x4 & b )   // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return (fltx4)vec_andc( b, (bi32x4)a);
}

FORCEINLINE bi32x4 AndNotSIMD( const bi32x4 & a, const bi32x4 & b ) // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return vec_andc( b, a );
}
#else
template< typename T, typename U >
FORCEINLINE T AndNotSIMD( const T &a, const U &b )  // ~a & b
{
    return vec_andc( b, (T)a );
}

// specialize for the case of bi, flt
FORCEINLINE fltx4 AndNotSIMD( const bi32x4 &a, const fltx4 &b ) // ~a & b
{
    return vec_andc( b, (fltx4)a );
}
#endif

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b )   // a ^ b
{
    return vec_xor( a, b );
}

FORCEINLINE fltx4 XorSIMD( const bi32x4 & a, const fltx4 & b )  // a ^ b
{
    return vec_xor( (fltx4)a, b );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const bi32x4 & b )  // a ^ b
{
    return vec_xor( a, (fltx4)b );
}

FORCEINLINE bi32x4 XorSIMD( const bi32x4 & a, const bi32x4 & b )    // a ^ b
{
    return vec_xor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b )    // a | b
{
    return vec_or( a, b );
}

FORCEINLINE fltx4 OrSIMD( const bi32x4 & a, const fltx4 & b )   // a | b
{
    return vec_or( (fltx4)a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const bi32x4 & b )   // a | b
{
    return vec_or( a, (fltx4)b );
}

FORCEINLINE i32x4 OrSIMD( const i32x4 & a, const i32x4 & b )    // a | b
{
    return vec_or( a, b );
}

FORCEINLINE u32x4 OrSIMD( const u32x4 & a, const u32x4 & b )    // a | b
{
    return vec_or( a, b );
}

#if !defined(__SPU__) // bi32x4 typedef to same as u32x4 on SPU
FORCEINLINE bi32x4 OrSIMD( const bi32x4 & a, const bi32x4 & b ) // a | b
{
    return vec_or( a, b );
}
#endif

FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
    return( SubSIMD( _VEC_ZEROF, a ) );

    // untested
    // vec_float4 signMask;
    // vec_float4 result;
    // signMask = vec_splat_s32(-1);
    // signMask = vec_sll(signMask, signMask);
    // result = vec_xor(a, signMask);
    // return result;
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a )  // any floats are zero?
{
    return vec_any_eq( a, _VEC_ZEROF );
}

FORCEINLINE bool IsAnyZeros( const bi32x4 & a ) // any floats are zero?
{
    return vec_any_eq( (u32x4)a, _VEC_ZERO );
}

FORCEINLINE bool IsAllZeros( const bi32x4 & a ) // all floats of a zero?
{
    return vec_all_eq( (u32x4)a, _VEC_ZERO );
}
FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
{
#if SN_IMPROVED_INTRINSICS
    // push 1.0 into w (NON-ZERO)
    fltx4 b = vec_insert(1.0f,a,3);
    return vec_any_eq( b, _VEC_ZEROF );
#else
    fltx4 b = vec_perm(a,_VEC_ONEF,_VEC_PERMUTE_XYZ0W1);
    return vec_any_eq( b, _VEC_ZEROF );
#endif
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    return vec_all_gt( a, b );
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    return vec_all_ge(a,b);
}

FORCEINLINE bool IsAllEqual( const fltx4 &a, const fltx4 &b )
{
    return vec_all_eq(a,b);
}

FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
    int nRet = 0;

    fltx4_union a_union;
    vec_st(a,0,&a_union.vmxf);

    nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3

    return nRet;
}

FORCEINLINE int TestSignSIMD( const bi32x4 & a )    // mask of which floats have the high bit set
{
    // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
    int nRet = 0;

    fltx4_union a_union;
    vec_st(a,0,&a_union.vmxbi);

    nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3

    return nRet;
}

FORCEINLINE bool IsAnyNegative( const bi32x4 & a )  // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD( a ));
}

// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    return (fltx4)vec_and( (u32x4)a, _VEC_CLEAR_WMASK );
}

FORCEINLINE bi32x4 SetWToZeroSIMD( const bi32x4 & a )
{
    return (bi32x4)vec_and( (u32x4)a, _VEC_CLEAR_WMASK );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a )   // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    // NOTE: this tests the top bits of each vector element using integer math
    // (so it ignores NaNs - it will return true for "-NaN")
    return vec_any_lt( a, _VEC_ZEROF );
}

FORCEINLINE bool IsAnyTrue( const fltx4 & a )
{
    return vec_any_ne( a, _VEC_ZEROF );
}

#ifdef DIFFERENT_NATIVE_VECTOR_TYPES
FORCEINLINE bool IsAnyTrue( const bi32x4 & a )
{
    return vec_any_ne( (vector unsigned int) a, _VEC_0L );
}
#endif

FORCEINLINE bi32x4 CmpEqSIMD( const fltx4 & a, const fltx4 & b )    // (a==b) ? ~0:0
{
    return (bi32x4)vec_cmpeq( a, b );
}

FORCEINLINE bi32x4 CmpEqSIMD( const i32x4 & a, const i32x4 & b )    // (a==b) ? ~0:0
{
    return (bi32x4)vec_cmpeq( a, b );
}

FORCEINLINE bi32x4 CmpEqSIMD( const u32x4 & a, const u32x4 & b )    // (a==b) ? ~0:0
{
    return (bi32x4)vec_cmpeq( a, b );
}

FORCEINLINE bi32x4 CmpGtSIMD( const fltx4 & a, const fltx4 & b )    // (a>b) ? ~0:0
{
    return (bi32x4)vec_cmpgt( a, b );
}

FORCEINLINE bi32x4 CmpGtSIMD( const i32x4 & a, const i32x4 & b )    // (a>b) ? ~0:0
{
    return (bi32x4)vec_cmpgt( a, b );
}

FORCEINLINE bi32x4 CmpGtSIMD( const u32x4 & a, const u32x4 & b )    // (a>b) ? ~0:0
{
    return (bi32x4)vec_cmpgt( a, b );
}

FORCEINLINE bi32x4 CmpGeSIMD( const fltx4 & a, const fltx4 & b )    // (a>=b) ? ~0:0
{
    return (bi32x4)vec_cmpge( a, b );
}
FORCEINLINE bi32x4 CmpLtSIMD( const fltx4 & a, const fltx4 & b )    // (a<b) ? ~0:0
{
    return (bi32x4)vec_cmplt( a, b );
}

FORCEINLINE bi32x4 CmpLeSIMD( const fltx4 & a, const fltx4 & b )    // (a<=b) ? ~0:0
{
    return (bi32x4)vec_cmple( a, b );
}

FORCEINLINE bi32x4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )  // (a <= b && a >= -b) ? ~0 : 0
{
    i32x4 control;
    control = vec_cmpb(a,b);
    return (bi32x4)vec_cmpeq( (u32x4)control, _VEC_ZERO );
}

FORCEINLINE int CmpAnyLeSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_any_le( a, b );
}

FORCEINLINE int CmpAnyGeSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_any_ge( a, b );
}

FORCEINLINE int CmpAnyLtSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_any_lt( a, b );
}

FORCEINLINE int CmpAnyLtSIMD( const bi32x4 & a, const i32x4 & b )
{
    return vec_any_lt( (i32x4)a, b );
}

FORCEINLINE int CmpAnyGtSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_any_gt( a, b );
}

FORCEINLINE int CmpAnyNeSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_any_ne( a, b );
}

FORCEINLINE int CmpAnyNeSIMD( const bi32x4 & a, const bi32x4 & b )
{
    return vec_any_ne( a, b );
}

FORCEINLINE int CmpAnyNeSIMD( const bi32x4 & a, const i32x4 & b )
{
    return vec_any_ne( a, (bi32x4)b );
}

FORCEINLINE int CmpAllLeSIMD( const fltx4 & a, const fltx4 & b )
{
    return vec_all_le( a, b );
}

FORCEINLINE fltx4 MaskedAssign( const bi32x4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return vec_sel( OldValue, NewValue, ReplacementMask );
}

FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return vec_sel( OldValue, NewValue, (const bi32x4) ReplacementMask );
}

FORCEINLINE vector signed short MaskedAssign( const vector unsigned short & ReplacementMask, const vector signed short & NewValue, const vector signed short & OldValue )
{
    return vec_sel( OldValue, NewValue, ReplacementMask );
}

// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue )  // a,a,a,a
{
#if SN_IMPROVED_INTRINSICS
    return vec_splats(flValue);
#else
    // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
    float * pValue = &flValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0);

    fltx4 result;

    result = vec_ld(0, pValue);
    result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );

    return result;
#endif
}

FORCEINLINE fltx4 ReplicateX4( const float *pValue )    // a,a,a,a
{
#if SN_IMPROVED_INTRINSICS
    return vec_splats(*pValue);
#else
    Assert( pValue );
    fltx4 result;

    result = vec_ld(0, pValue);
    result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );

    return result;
#endif
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE i32x4 ReplicateIX4( int nValue )
{
#if SN_IMPROVED_INTRINSICS
    return vec_splats(nValue);
#else
    // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
    int * pValue = &nValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0);

    i32x4 result;

    result = vec_ld(0, pValue);
    result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );

    return result;
#endif
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a )   // sqrt(a)
{
    return sqrtf4(a);
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a )    // sqrt(a), more or less
{
#if defined( _PS3 ) && !defined( SPU )
    // This is exactly what the Xbox 360 does in XMVectorSqrtEst
    fltx4 vRecipSquareRoot = vec_rsqrte( a );
    i32x4 vOne = vec_splat_s32( 1 );
    i32x4 vAllOnes = vec_splat_s32( -1 );
    i32x4 vShiftLeft24 = vec_splat_s32( -8 ); // -8 is the same bit pattern as 24 with a 5-bit mask
    fltx4 vZero = (fltx4)vec_splat_s32( 0 );
    u32x4 vInputShifted = vec_sl( (u32x4)a, (u32x4)vOne );
    u32x4 vInfinityShifted = vec_sl( (u32x4)vAllOnes, (u32x4)vShiftLeft24 );
    bi32x4 vEqualsZero = vec_vcmpeqfp( a, vZero );
    bi32x4 vEqualsInfinity = vec_vcmpequw( vInputShifted, vInfinityShifted );
    fltx4 vSquareRoot = vec_madd( a, vRecipSquareRoot, _VEC_ZEROF );
    bi32x4 vResultMask = vec_vcmpequw( (u32x4)vEqualsInfinity, (u32x4)vEqualsZero ); // mask has 1s wherever the square root is valid
    fltx4 vCorrectedSquareRoot = vec_sel( a, vSquareRoot, vResultMask );

    return vCorrectedSquareRoot;
#else
    return SqrtSIMD( a );
#endif
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a )  // 1/sqrt(a), more or less
{
    return vec_rsqrte( a );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    // This matches standard library function rsqrtf4
    fltx4 result;
    vmathV4RsqrtPerElem( (VmathVector4 *)&result, (const VmathVector4 *)&a );

    return result;
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a )  // 1/a, more or less
{
    return vec_re( a );
}

/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
    bi32x4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    ret = ReciprocalEstSIMD( ret );
    return ret;
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    // This matches standard library function recipf4
    fltx4 result;
    vmathV4RecipPerElem ( (VmathVector4 *)&result, (const VmathVector4 *)&a );

    return result;
}

FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b )   // a/b
{
    return MulSIMD( ReciprocalSIMD( b ), a );
}

FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b )    // Est(a/b)
{
    return MulSIMD( ReciprocalEstSIMD( b ), a );
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
    // Convert zeros to epsilons
    bi32x4 zero_mask = CmpEqSIMD( a, _VEC_ZEROF );
    fltx4 a_safe = OrSIMD( a, AndSIMD( _VEC_EPSILONF, zero_mask ) );
    return ReciprocalSIMD( a_safe );

    // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
    // fltx4 zeroMask = CmpEqSIMD( gFour_Zeros, a );
    // fltx4 a_safe = XMVectorSelect( a, gFour_Epsilons, zeroMask );
    // return ReciprocalSIMD( a_safe );
}

// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
    return exp2f4(toPower);
}
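
// --- Illustrative sketch (hypothetical): a divide that saturates instead of
// producing infinities on zero denominators, composed from the pieces above.
FORCEINLINE fltx4 DivSaturate_Example( const fltx4 & a, const fltx4 & b )
{
    return MulSIMD( a, ReciprocalSaturateSIMD( b ) );   // 1/0 becomes huge but finite
}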
// a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point",
// which is accurate to four bits of mantissa.
FORCEINLINE fltx4 Exp2EstSIMD( const fltx4 &f )
{
    return exp2f4fast( f );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
    fltx4 result = vec_max(min, in);
    return vec_min(max, result);
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
#if SN_IMPROVED_INTRINSICS
    fltx4 v0, v1;

    Assert( pSIMD );

    v0 = (fltx4)vec_lvlx( 0, (float*)pSIMD );
    v1 = (fltx4)vec_lvrx( 16, (float*)pSIMD );
    return vec_or(v0, v1);
#else
    fltx4 v0, v1;
    vector unsigned char permMask;

    Assert( pSIMD );

    v0 = vec_ld( 0, pSIMD );
    permMask = vec_lvsl( 0, pSIMD );
    v1 = vec_ld( 15, pSIMD );

    return vec_perm(v0, v1, permMask);
#endif
}

FORCEINLINE fltx4 LoadUnsignedByte4SIMD( unsigned char *pBytes )    // unpack contiguous 4 bytes into vec float 4
{
#if SN_IMPROVED_INTRINSICS
    __vector unsigned char res_uc;
    __vector unsigned short res_us;

    __vector unsigned char vZero8 = (__vector unsigned char)vec_splat_u8(0);
    __vector unsigned short vZero16 = (__vector unsigned short)vec_splat_u16(0);

    res_uc = (__vector unsigned char)vec_lvlx(0, pBytes);
    res_uc = vec_mergeh( vZero8, res_uc );
    res_us = vec_mergeh( vZero16, (__vector unsigned short)res_uc );
    return vec_ctf( (__vector unsigned int)res_us, 0);
#else
    vector unsigned char v0, v1;
    vector bool char res_uc;
    vector unsigned char permMask;
    vector bool short res_us;

    vector bool char vZero8 = (vector bool char)vec_splat_u8(0);
    vector bool short vZero16 = (vector bool short)vec_splat_u16(0);

    v0 = vec_ld(0, pBytes);
    permMask = vec_lvsl(0, pBytes);
    v1 = vec_ld(3, pBytes);
    res_uc = (vector bool char)vec_perm(v0, v1, permMask);
    res_uc = vec_mergeh( vZero8, res_uc );
    res_us = vec_mergeh( vZero16, (vector bool short)res_uc );
    return vec_ctf( (vector unsigned int)res_us, 0);
#endif
}

FORCEINLINE fltx4 LoadSignedByte4SIMD( signed char *pBytes )    // unpack contiguous 4 bytes into vec float 4
{
#if SN_IMPROVED_INTRINSICS
    vector signed char res_uc;
    vector signed short res_us;
    vector signed int res_ui;

    res_uc = (vector signed char)vec_lvlx(0, pBytes);
    res_us = vec_unpackh( res_uc );
    res_ui = vec_unpackh( res_us );
    return vec_ctf( res_ui, 0);
#else
    vector signed char v0, v1, res_uc;
    vector unsigned char permMask;
    vector signed short res_us;
    vector signed int res_ui;

    v0 = vec_ld(0, pBytes);
    permMask = vec_lvsl(0, pBytes);
    v1 = vec_ld(3, pBytes);
    res_uc = vec_perm(v0, v1, permMask);
    res_us = vec_unpackh( res_uc );
    res_ui = vec_unpackh( res_us );
    return vec_ctf( res_ui, 0);
#endif
}

// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    Assert( pSIMD );

    fltx4 v0 = vec_ld( 0, ( float * )( pSIMD ) );
    vector unsigned char permMask = vec_lvsl( 0, ( float * ) ( pSIMD ) );
    fltx4 v1 = vec_ld( 11, ( float * )( pSIMD ) );

    return vec_perm( v0, v1, permMask );
}

// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
    fltx4 v0 = vec_lde( 0, const_cast<float *>(pFlt) );
    vector unsigned char permMask = vec_lvsl( 0, const_cast<float *>(pFlt) );
    return vec_perm( v0, v0, permMask );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return vec_ld( 0, ( float * )pSIMD );
}

#ifndef SPU
// No reason to support VectorAligned on SPU.
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned &pSIMD )
{
    fltx4 out;
    out = vec_ld( 0, pSIMD.Base() );

    // squelch w
    return (fltx4)vec_and( (u32x4)out, _VEC_CLEAR_WMASK );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
{
    fltx4 out;
    out = vec_ld( 0, pSIMD->Base() );

    // squelch w
    return (fltx4)vec_and( (u32x4)out, _VEC_CLEAR_WMASK );
}

// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
    vec_st(a, 0, pSIMD->Base());
}
#endif

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
    vec_st(a, 0, pSIMD);
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
#if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
    vec_stvlx( a, 0, pSIMD);
    vec_stvrx( a, 16, pSIMD);
#else
    fltx4_union a_union;
    vec_st(a, 0, &a_union.vmxf);
    pSIMD[0] = a_union.m128_f32[0];
    pSIMD[1] = a_union.m128_f32[1];
    pSIMD[2] = a_union.m128_f32[2];
    pSIMD[3] = a_union.m128_f32[3];
#endif
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    fltx4_union a_union;
    vec_st(a, 0, &a_union.vmxf);
    pSIMD[0] = a_union.m128_f32[0];
    pSIMD[1] = a_union.m128_f32[1];
    pSIMD[2] = a_union.m128_f32[2];
}

#ifndef SPU
// No reason to support unaligned Vectors on SPU
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d );

// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
{
#if USING_POINTLESSLY_SLOW_SONY_CODE
    return vmathV4MakeFromElems_V( x,y,z,w ).vec128;
#else
    // load the float into the low word of each vector register (this exploits the unaligned load op)
    fltx4 vx = vec_lvlx( 0, &x );
    fltx4 vy = vec_lvlx( 0, &y );
    fltx4 vz = vec_lvlx( 0, &z );
    fltx4 vw = vec_lvlx( 0, &w );
    return Compress4SIMD( vx, vy, vz, vw );
#endif
}

// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
//    pDestination[0],  pDestination[1],  pDestination[2],  pDestination[3]
// The Vectors are assumed to be unaligned.
FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
    Vector * const pDestination )
{
    StoreUnaligned3SIMD( pDestination->Base(), a );
    StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
    StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
    StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
}

// Store the x,y,z components of the four FLTX4 parameters
// into the four consecutive Vectors:
//    pDestination ,  pDestination + 1,  pDestination + 2,  pDestination + 3
// The Vectors are assumed to start on an ALIGNED address, that is,
// pDestination is 16-byte aligned (though obviously pDestination+1 is not).
FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
    Vector * const pDestination )
{
    StoreUnaligned3SIMD( pDestination->Base(), a );
    StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
    StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
    StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
}
#endif
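
#ifndef SPU
// --- Illustrative sketch (hypothetical type and helper): LoadGatherSIMD
// collects four scalars from unrelated addresses, e.g. the same field out of
// four structures.
struct GatherExample_t
{
    float m_flWeight;
    int m_nId;
};

FORCEINLINE fltx4 GatherWeights_Example( const GatherExample_t &a, const GatherExample_t &b,
    const GatherExample_t &c, const GatherExample_t &d )
{
    return LoadGatherSIMD( a.m_flWeight, b.m_flWeight, c.m_flWeight, d.m_flWeight );
}
#endif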
// Fixed-point conversion and save as SIGNED INTS. // pDest->x = Int (vSrc.x) // note: some architectures have means of doing // fixed point conversion when the fix depth is // specified as an immediate.. but there is no way // to guarantee an immediate as a parameter to function // like this. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) { i32x4 asInt = vec_cts( vSrc, 0 ); vec_st(asInt, 0, pDest->Base()); } FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w ) { fltx4 p0, p1, p2, p3; p0 = vec_mergeh(x, z); p1 = vec_mergeh(y, w); p2 = vec_mergel(x, z); p3 = vec_mergel(y, w); x = vec_mergeh(p0, p1); y = vec_mergel(p0, p1); z = vec_mergeh(p2, p3); w = vec_mergel(p2, p3); } // Return zero in the fastest way -- faster even than loading. FORCEINLINE fltx4 LoadZeroSIMD( void ) { return _VEC_ZEROF; } FORCEINLINE i32x4 LoadZeroISIMD( void ) { return vec_splat_s32(0); } // Return one in the fastest way -- faster even than loading. FORCEINLINE fltx4 LoadOneSIMD( void ) { return _VEC_ONEF; } FORCEINLINE i32x4 LoadOneISIMD( void ) { return vec_splat_s32(1); } FORCEINLINE fltx4 SplatXSIMD( fltx4 a ) { return vec_splat(a,0); } FORCEINLINE fltx4 SplatYSIMD( fltx4 a ) { return vec_splat(a,1); } FORCEINLINE fltx4 SplatZSIMD( fltx4 a ) { return vec_splat(a,2); } FORCEINLINE fltx4 SplatWSIMD( fltx4 a ) { return vec_splat(a,3); } FORCEINLINE bi32x4 SplatXSIMD( bi32x4 a ) { return vec_splat(a,0); } FORCEINLINE bi32x4 SplatYSIMD( bi32x4 a ) { return vec_splat(a,1); } FORCEINLINE bi32x4 SplatZSIMD( bi32x4 a ) { return vec_splat(a,2); } FORCEINLINE bi32x4 SplatWSIMD( bi32x4 a ) { return vec_splat(a,3); } FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x ) { return vec_sel(a,x, _VEC_COMPONENT_MASK_0); } FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) { return vec_sel(a,y, _VEC_COMPONENT_MASK_1); } FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) { return vec_sel(a,z, _VEC_COMPONENT_MASK_2); } FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) { return vec_sel(a,w, _VEC_COMPONENT_MASK_3); } FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) { #if SN_IMPROVED_INTRINSICS return vec_insert( flValue, a, nComponent ); #else fltx4_union a_union; a_union.vmxf = vec_ld(0,&a); a_union.m128_f32[nComponent] = flValue; return a_union.vmxf; #endif } FORCEINLINE float GetComponentSIMD( const fltx4& a, int nComponent ) { #if SN_IMPROVED_INTRINSICS return vec_extract( a, nComponent ); #else fltx4_union a_union; a_union.vmxf = vec_ld(0,&a); return a_union.m128_f32[nComponent]; #endif } FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) { return vec_sld(a,a,4); } FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) { return vec_sld(a,a,8); } FORCEINLINE fltx4 RotateRight( const fltx4 & a ) { return vec_sld(a,a,12); } FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) { return vec_sld(a,a,8); } // rotate a vector left by an arbitrary number of // bits known at compile time. The bit parameter // is template because it's actually used as an // immediate field in an instruction, eg it absolutely // must be known at compile time. nBits>127 leads // to doom. // zeroes are shifted in from the right template < uint nBits, typename T > FORCEINLINE T ShiftLeftByBits(const T &a) { // hopefully the compiler, seeing nBits as a const immediate, elides these ifs if ( nBits >= 128 ) // WTF are you doing?!
{ return (T) LoadZeroSIMD(); } else if ( nBits == 0 ) { return a; } else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first { T t = vec_sld( a, ((T)LoadZeroSIMD()), (nBits >> 3) ); // rotated left by octets return ShiftLeftByBits< (nBits & 0x7) >( t ); } else // we need to rotate by <= 7 bits { // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. // the splat, however, does require an immediate. Go IBM! vector unsigned int shifter = (vector unsigned int) (vec_splat_s8( ((signed char)(nBits & 0x7)) )); return (T) vec_sll( (vector signed int) a, shifter ); } } // as above, but shift right template < uint nBits, typename T > FORCEINLINE T ShiftRightByBits(const T &a) { // hopefully the compiler, seeing nBits as a const immediate, elides these ifs if ( nBits >= 128 ) // WTF are you doing?! { return (T) LoadZeroSIMD(); } else if ( nBits == 0 ) { return a; } else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first { T t = vec_sld( ((T)LoadZeroSIMD()), a, 16 - (nBits >> 3) ); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. return ShiftRightByBits< (nBits & 0x7) >( t ); } else // we need to rotate by <= 7 bits { // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. // the splat, however, does require an immediate. Go IBM! vector unsigned int shifter = (vector unsigned int) (vec_splat_s8( ((signed char)(nBits & 0x7)) )); return (T) vec_srl( (vector unsigned int) a, shifter ); } } /**** an example of ShiftLeftByBits: fltx4 ShiftByTwentyOne( fltx4 foo ) { return ShiftLeftByBits<21>(foo); } compiles to: ShiftByTwentyOne(float __vector): 0x000059FC: 0x1060038C vspltisw v3,0 PIPE 0x00005A00: 0x1085030C vspltisb v4,5 0x00005A04: 0x104218AC vsldoi v2,v2,v3,2 02 (000059FC) REG PIPE 0x00005A08: 0x104221C4 vsl v2,v2,v4 03 (00005A04) REG 0x00005A0C: 0x4E800020 blr *****/ // find the lowest component of a.x, a.y, a.z, // and replicate it to the whole return value. // ignores a.w. // Forcing this inline should hopefully help with scheduling. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a ) { fltx4 result; fltx4 x = vec_splat( a, 0 ); fltx4 y = vec_splat( a, 1 ); fltx4 z = vec_splat( a, 2 ); if ( vec_any_nan( a ) ) { x = vec_all_nan( x ) ? _VEC_FLTMAX : x; y = vec_all_nan( y ) ? _VEC_FLTMAX : y; z = vec_all_nan( z ) ? _VEC_FLTMAX : z; } result = vec_min( y, x ); result = vec_min( z, result ); return result; } // find the highest component of a.x, a.y, a.z, // and replicate it to the whole return value. // ignores a.w. // Though this is only five instructions long, // they are all dependent, making this stall city. // Forcing this inline should hopefully help with scheduling. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a ) { fltx4 result; fltx4 x = vec_splat( a, 0 ); fltx4 y = vec_splat( a, 1 ); fltx4 z = vec_splat( a, 2 ); if ( vec_any_nan( a ) ) { x = vec_all_nan( x ) ? _VEC_FLTMIN : x; y = vec_all_nan( y ) ? _VEC_FLTMIN : y; z = vec_all_nan( z ) ? _VEC_FLTMIN : z; } result = vec_max( y, x ); result = vec_max( z, result ); return result; } // ------------------------------------ // INTEGER SIMD OPERATIONS. 
// ------------------------------------ // Load 4 aligned words into a SIMD register FORCEINLINE i32x4 LoadAlignedIntSIMD(const int32 * RESTRICT pSIMD) { return vec_ld(0, const_cast<int32 *>(pSIMD)); } // Load 4 unaligned words into a SIMD register FORCEINLINE i32x4 LoadUnalignedIntSIMD(const int32 * RESTRICT pSIMD) { i32x4 v0, v1; vector unsigned char permMask; Assert( pSIMD ); v0 = vec_ld( 0, const_cast<int32 *>(pSIMD) ); permMask = vec_lvsl( 0, const_cast<int32 *>(pSIMD) ); v1 = vec_ld( 15, const_cast<int32 *>(pSIMD) ); return vec_perm(v0, v1, permMask); } // save into four words, 16-byte aligned FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const i32x4 & a ) { vec_st(a,0,pSIMD); } FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) { vec_st((i32x4)a,0,pSIMD); } FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const i32x4 & a ) { vec_st(a,0,pSIMD.Base()); } FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const i32x4 & a ) { #if SN_IMPROVED_INTRINSICS // NOTE : NOT TESTED vec_stvlx(a,0,pSIMD); vec_stvrx(a,16,pSIMD); #else fltx4_union tmp; vec_st(a,0,&tmp.vmxi); pSIMD[0] = tmp.m128_u32[0]; pSIMD[1] = tmp.m128_u32[1]; pSIMD[2] = tmp.m128_u32[2]; pSIMD[3] = tmp.m128_u32[3]; #endif } // a={ a.x, a.z, b.x, b.z } // combine two fltx4s by throwing away every other field. FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b ) { const int32 ALIGN16 n4shuffleACXZ[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x10111213, 0x18191A1B }; return vec_perm( a, b, (vec_uchar16)LoadAlignedIntSIMD( n4shuffleACXZ ) ); } // a={ a.x, b.x, c.x, d.x } // combine 4 fltx4s by throwing away 3/4s of the fields // TODO: make more efficient by doing this in a parallel way at the caller // Compress4SIMD(FourVectors.. ) FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d ) { fltx4 ab = vec_mergeh( a, b ); // a.x, b.x, a.y, b.y fltx4 cd = vec_mergeh( c, d ); // c.x, d.x... static const int32 ALIGN16 shuffleABXY[4] ALIGN16_POST = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 }; return vec_perm( ab, cd, (vec_uchar16)LoadAlignedIntSIMD( shuffleABXY ) ); } // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No // fixed point conversion is done. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { return vec_ctf(vSrcA,0); } // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { return vec_ctf(vSrcA,0); } // Take a fltx4 containing fixed-point uints and // return them as single precision floats. Each uint // will be divided by 2^immed after conversion // (eg, this is fixed point math). /* as if: FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) { return vec_ctf(vSrcA,uImmed); } */ #define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) )) // Take a fltx4 containing fixed-point sints and // return them as single precision floats. Each int // will be divided by 2^immed (eg, this is fixed point // math). /* as if: FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) { return vec_ctf(vSrcA,uImmed); } */ #define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) ))
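/**** a minimal usage sketch of the fixed-point converts (pFixed is a
 hypothetical 16-byte aligned int32[4] holding 16.16 fixed-point values):
	fltx4 FixedToFloat( const int32 *pFixed )
	{
		i32x4 fixed = LoadAlignedIntSIMD( pFixed );
		// one op converts each lane and divides it by 2^16
		return SignedFixedIntConvertToFltSIMD( fixed, 16 );
	}
****/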
// set all components of a vector to a signed immediate int number. /* as if: FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) { return vec_splat_s32( toImmediate ); } */ #define IntSetImmediateSIMD(x) (vec_splat_s32(x)) /* works on fltx4's as if they are four uints. the first parameter contains the words to be shifted, the second contains the amount to shift by AS INTS for i = 0 to 3 shift = vSrcB_i*32:(i*32)+4 vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift */ FORCEINLINE u32x4 IntShiftLeftWordSIMD(u32x4 vSrcA, u32x4 vSrcB) { return vec_sl(vSrcA, vSrcB); } FORCEINLINE float SubFloat( const fltx4 & a, int idx ) { #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) return( vec_extract( a, idx ) ); #else // GCC 4.1.1 // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) fltx4_union a_union; vec_st(a, 0, &a_union.vmxf); return a_union.m128_f32[idx]; #endif // GCC 4.1.1 } FORCEINLINE float & SubFloat( fltx4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_f32[idx]; } FORCEINLINE uint32 SubInt( const u32x4 & a, int idx ) { #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) return( vec_extract( a, idx ) ); #else // GCC 4.1.1 fltx4_union a_union; vec_st(a, 0, &a_union.vmxui); return a_union.m128_u32[idx]; #endif // GCC 4.1.1 } FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) { #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) return( vec_extract( (u32x4)a, idx ) ); #else fltx4_union a_union; vec_st(a, 0, &a_union.vmxf); return a_union.m128_u32[idx]; #endif } FORCEINLINE uint32 & SubInt( u32x4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_u32[idx]; } FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) { #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 ) return( vec_extract( vec_ctu( a, 0 ), idx ) ); #else u32x4 t = vec_ctu( a, 0 ); return SubInt(t,idx); #endif } // perform an Altivec permute op. There is no corresponding SSE op, so // this function is missing from that fork. This is deliberate, because // permute-based algorithms simply need to be abandoned and rebuilt // differently for SSE. // (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) template< typename T, typename U > FORCEINLINE T PermuteVMX( T a, T b, U swizzleMask ) { return vec_perm( a, b, (vec_uchar16) swizzleMask ); } // __fsel(double fComparand, double fValGE, double fLT) == fComparand >= 0 ? fValGE : fLT // this is much faster than if ( aFloat > 0 ) { x = .. } #if !defined(__SPU__) #define fsel __fsel #endif inline bool IsVector3LessThan(const fltx4 &v1, const fltx4 &v2 ) { return vec_any_lt( v1, v2 ); } inline bool IsVector3GreaterOrEqual(const fltx4 &v1, const fltx4 &v2 ) { return !IsVector3LessThan( v1, v2 ); }
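/**** a minimal usage sketch of fsel (flDist and flThreshold are hypothetical
 inputs; any branchless step function works the same way):
	float StepAbove( float flDist, float flThreshold )
	{
		// selects on the sign of the first argument: yields 1.0f when
		// flDist >= flThreshold, else 0.0f, with no mispredictable branch
		return fsel( flDist - flThreshold, 1.0f, 0.0f );
	}
****/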
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) { fltx4 retVal; SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON ); SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON ); SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON ); SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON ); return retVal; } // Round towards negative infinity FORCEINLINE fltx4 FloorSIMD( const fltx4 &a ) { fltx4 retVal; SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) ); SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) ); SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) ); SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) ); return retVal; } #elif ( defined( _X360 ) ) //--------------------------------------------------------------------- // X360 implementation //--------------------------------------------------------------------- inline bool IsVector3LessThan(const fltx4 &v1, const fltx4 &v2 ) { return !XMVector3GreaterOrEqual( v1, v2 ); } inline BOOL IsVector3GreaterOrEqual(const fltx4 &v1, const fltx4 &v2 ) { return XMVector3GreaterOrEqual( v1, v2 ); } FORCEINLINE float & FloatSIMD( fltx4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_f32[idx]; } FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_u32[idx]; } FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) { return __vaddfp( a, b ); } FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b { return __vsubfp( a, b ); } FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b { return __vmulfp( a, b ); } FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c { return __vmaddfp( a, b, c ); } FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b { return __vnmsubfp( a, b, c ); }; FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) { return __vmsum3fp( a, b ); } FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) { return __vmsum4fp( a, b ); } FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) { return XMVectorSin( radians ); } FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { XMVectorSinCos( &sine, &cosine, radians ); } FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { XMVectorSinCos( &sine, &cosine, radians ); } FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians ) { cosine = XMVectorCos( radians ); } FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) { return XMVectorASin( sine ); } FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) { return XMVectorACos( cs ); } // tan^-1(a/b) .. ie, pass sin in as a and cos in as b FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) { return XMVectorATan2( a, b ); } // DivSIMD defined further down, since it uses ReciprocalSIMD FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) { return __vmaxfp( a, b ); } FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) { return __vminfp( a, b ); } FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b { return __vand( a, b ); } FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b { // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second return __vandc( b, a ); } FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b { return __vxor( a, b ); } FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b { return __vor( a, b ); } FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a { return XMVectorNegate(a); } FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
{ unsigned int equalFlags = 0; __vcmpeqfpR( a, Four_Zeros, &equalFlags ); return XMComparisonAllTrue( equalFlags ); } FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero? { unsigned int conditionregister; XMVectorEqualR(&conditionregister, a, XMVectorZero()); return XMComparisonAnyTrue(conditionregister); } FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero? { // copy a's x component into w, in case w was zero. fltx4 temp = __vrlimi(a, a, 1, 1); unsigned int conditionregister; XMVectorEqualR(&conditionregister, temp, XMVectorZero()); return XMComparisonAnyTrue(conditionregister); } // for branching when a.xyzw > b.xyzw FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) { unsigned int cr; XMVectorGreaterR(&cr,a,b); return XMComparisonAllTrue(cr); } // for branching when a.xyzw >= b.xyzw FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) { unsigned int cr; XMVectorGreaterOrEqualR(&cr,a,b); return XMComparisonAllTrue(cr); } // for branching when a.xyzw > b.xyzw FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b ) { unsigned int cr; XMVectorGreaterR(&cr,a,b); return XMComparisonAnyTrue(cr); } // for branching when a.xyzw >= b.xyzw FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) { unsigned int cr; XMVectorGreaterOrEqualR(&cr,a,b); return XMComparisonAnyTrue(cr); } // For branching if all a.xyzw == b.xyzw FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) { unsigned int cr; XMVectorEqualR(&cr,a,b); return XMComparisonAllTrue(cr); } FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set { // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though) int nRet = 0; const fltx4_union & a_union = (const fltx4_union &)a; nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0 nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1 nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2 nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3 return nRet; } // Squelch the w component of a vector to +0.0. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) { return __vrlimi( a, __vzero(), 1, 0 ); } FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) { // NOTE: this tests the top bits of each vector element using integer math // (so it ignores NaNs - it will return true for "-NaN") unsigned int equalFlags = 0; fltx4 signMask = __vspltisw( -1 ); // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31) signMask = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000 __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags ); return !XMComparisonAllTrue( equalFlags ); } FORCEINLINE bool IsAnyTrue( const fltx4 & a ) { unsigned int equalFlags = 0; __vcmpequwR( Four_Zeros, a, &equalFlags ); // compare to zero return XMComparisonAnyFalse( equalFlags ); // at least one element was not zero, eg was true } FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 { return __vcmpeqfp( a, b ); } FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 { return __vcmpgtfp( a, b ); } FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? 
~0:0 { return __vcmpgefp( a, b ); } FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 { return __vcmpgtfp( b, a ); } FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0 { return __vcmpgefp( b, a ); } FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 { return XMVectorInBounds( a, b ); } // returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) { return __vsel( OldValue, NewValue, ReplacementMask ); } // perform an Altivec permute op. There is no corresponding SSE op, so // this function is missing from that fork. This is deliberate, because // permute-based algorithms simply need to be abandoned and rebuilt // differently for SSE. // (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm ) template< typename T, typename U > FORCEINLINE T PermuteVMX( T a, T b, U swizzleMask ) { return __vperm( a, b, swizzleMask ); } // AKA "Broadcast", "Splat" FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a { // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) float * pValue = &flValue; Assert( pValue ); Assert( ((unsigned int)pValue & 3) == 0); return __vspltw( __lvlx( pValue, 0 ), 0 ); } FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a { Assert( pValue ); return __vspltw( __lvlx( pValue, 0 ), 0 ); } /// replicate a single 32 bit integer value to all 4 components of an m128 FORCEINLINE fltx4 ReplicateIX4( int nValue ) { // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!) int * pValue = &nValue; Assert( pValue ); Assert( ((unsigned int)pValue & 3) == 0); return __vspltw( __lvlx( pValue, 0 ), 0 ); } // Round towards positive infinity FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) { return __vrfip(a); } // Round towards nearest integer FORCEINLINE fltx4 RoundSIMD( const fltx4 &a ) { return __vrfin(a); } // Round towards negative infinity FORCEINLINE fltx4 FloorSIMD( const fltx4 &a ) { return __vrfim(a); } FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less { // This is emulated from rsqrt return XMVectorSqrtEst( a ); } FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) { // This is emulated from rsqrt return XMVectorSqrt( a ); } FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less { return __vrsqrtefp( a ); } FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) { // Convert zeros to epsilons fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); return ReciprocalSqrtEstSIMD( a_safe ); } FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) { // This uses Newton-Raphson to improve the HW result return XMVectorReciprocalSqrt( a ); } FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less { return __vrefp( a ); } /// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. /// No error checking! FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a { // This uses Newton-Raphson to improve the HW result return XMVectorReciprocal( a ); } FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b { return MulSIMD( ReciprocalSIMD( b ), a ); } FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // Est(a/b) { return MulSIMD( ReciprocalEstSIMD( b ), a ); } /// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) { // Convert zeros to epsilons fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); return ReciprocalEstSIMD( a_safe ); } FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) { // Convert zeros to epsilons fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); return ReciprocalSIMD( a_safe ); // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does) // fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a ); // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask ); // return ReciprocalSIMD( a_safe ); } // CHRISG: is it worth doing integer bitfiddling for this? // 2^x for all values (the antilog) FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) { return XMVectorExp(toPower); } // a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point", // which is accurate to four bits of mantissa. FORCEINLINE fltx4 Exp2EstSIMD( const fltx4 &f ) { return XMVectorExpEst( f ); } // Clamps the components of a vector to a specified minimum and maximum range. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) { return XMVectorClamp(in, min, max); } FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) { return XMLoadVector4( pSIMD ); } // load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec). FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) { return XMLoadVector3( pSIMD ); } // load a single unaligned float into the x component of a SIMD word FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt ) { return __lvlx( pFlt, 0 ); } FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) { return *( reinterpret_cast< const fltx4 *> ( pSIMD ) ); } // for the transitional class -- load a 3-by VectorAligned and squash its w component FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) { fltx4 out = XMLoadVector3A(pSIMD.Base()); // squelch w return __vrlimi( out, __vzero(), 1, 0 ); } // for the transitional class -- load a 3-by VectorAligned and squash its w component FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD ) { fltx4 out = XMLoadVector3A(pSIMD); // squelch w return __vrlimi( out, __vzero(), 1, 0 ); } FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a ) { *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a; } FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a ) { XMStoreVector4( pSIMD, a ); } FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) { XMStoreVector3( pSIMD, a ); } // strongly typed -- for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) { XMStoreVector3A(pSIMD->Base(),a); } // Store the x,y,z components of the four FLTX4 parameters // into the four consecutive Vectors: // pDestination[0], pDestination[1], pDestination[2], pDestination[3] // The Vectors are assumed to be unaligned. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) Vector * const pDestination ) { // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as // we arrange the data properly first. // The vrlimi ops trash the destination param which is why we require // pass-by-copy. I'm counting on the compiler to schedule these properly. 
b = __vrlimi( b, b, 15, 1 ); // b = y1z1__x1 c = __vrlimi( c, c, 15, 2 ); // c = z2__x2y2 a = __vrlimi( a, b, 1, 0 ); // a = x0y0z0x1 b = __vrlimi( b, c, 2|1, 0 ); // b = y1z1x2y2 c = __vrlimi( c, d, 4|2|1, 3 ); // c = z2x3y3z3 float *RESTRICT pOut = pDestination->Base(); StoreUnalignedSIMD( pOut + 0, a ); StoreUnalignedSIMD( pOut + 4, b ); StoreUnalignedSIMD( pOut + 8, c ); } // Store the x,y,z components of the four FLTX4 parameters // into the four consecutive Vectors: // pDestination , pDestination + 1, pDestination + 2, pDestination + 3 // The Vectors are assumed to start on an ALIGNED address, that is, // pDestination is 16-byte aligned (though obviously pDestination+1 is not). FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) Vector * const pDestination ) { // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as // we arrange the data properly first. // The vrlimi ops trash the destination param which is why we require // pass-by-copy. I'm counting on the compiler to schedule these properly. b = __vrlimi( b, b, 15, 1 ); // b = y1z1__x1 c = __vrlimi( c, c, 15, 2 ); // c = z2__x2y2 a = __vrlimi( a, b, 1, 0 ); // a = x0y0z0x1 b = __vrlimi( b, c, 2|1, 0 ); // b = y1z1x2y2 c = __vrlimi( c, d, 4|2|1, 3 ); // c = z2x3y3z3 float *RESTRICT pOut = pDestination->Base(); StoreAlignedSIMD( pOut + 0, a ); StoreAlignedSIMD( pOut + 4, b ); StoreAlignedSIMD( pOut + 8, c ); } // Fixed-point conversion and save as SIGNED INTS. // pDest->x = Int (vSrc.x) // note: some architectures have means of doing // fixed point conversion when the fix depth is // specified as an immediate.. but there is no way // to guarantee an immediate as a parameter to function // like this. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc) { fltx4 asInt = __vctsxs( vSrc, 0 ); XMStoreVector4A(pDest->Base(), asInt); } FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w ) { XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w ); xyzwMatrix = XMMatrixTranspose( xyzwMatrix ); x = xyzwMatrix.r[0]; y = xyzwMatrix.r[1]; z = xyzwMatrix.r[2]; w = xyzwMatrix.r[3]; } // Return zero in the fastest way -- faster even than loading. FORCEINLINE fltx4 LoadZeroSIMD( void ) { return XMVectorZero(); } // Return one in the fastest way.
FORCEINLINE fltx4 LoadOneSIMD( void ) { return XMVectorSplatOne(); } FORCEINLINE fltx4 SplatXSIMD( fltx4 a ) { return XMVectorSplatX( a ); } FORCEINLINE fltx4 SplatYSIMD( fltx4 a ) { return XMVectorSplatY( a ); } FORCEINLINE fltx4 SplatZSIMD( fltx4 a ) { return XMVectorSplatZ( a ); } FORCEINLINE fltx4 SplatWSIMD( fltx4 a ) { return XMVectorSplatW( a ); } FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x ) { fltx4 result = __vrlimi(a, x, 8, 0); return result; } FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) { fltx4 result = __vrlimi(a, y, 4, 0); return result; } FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) { fltx4 result = __vrlimi(a, z, 2, 0); return result; } FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) { fltx4 result = __vrlimi(a, w, 1, 0); return result; } FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) { static int s_nVrlimiMask[4] = { 8, 4, 2, 1 }; fltx4 val = ReplicateX4( flValue ); fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0); return result; } FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) { fltx4 compareOne = a; return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 ); } FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) { fltx4 compareOne = a; return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 ); } FORCEINLINE fltx4 RotateRight( const fltx4 & a ) { fltx4 compareOne = a; return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 ); } FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) { fltx4 compareOne = a; return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 ); } // rotate a vector left by an arbitrary number of // bits known at compile time. The bit parameter // is template because it's actually used as an // immediate field in an instruction, eg it absolutely // must be known at compile time. nBits>127 leads // to doom. // zeroes are shifted in from the right template < uint nBits > FORCEINLINE fltx4 ShiftLeftByBits(const fltx4 &a) { // hopefully the compiler, seeing nBits as a const immediate, elides these ifs if ( nBits >= 128 ) // WTF are you doing?! { return LoadZeroSIMD(); } else if ( nBits == 0 ) { return a; } else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first { fltx4 t = __vsldoi( a, (LoadZeroSIMD()), (nBits >> 3) ); // rotated left by octets return ShiftLeftByBits< (nBits & 0x7) >( t ); } else // we need to rotate by <= 7 bits { // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. // the splat, however, does require an immediate. Go IBM! u32x4 shifter = u32x4 (__vspltisb( ((signed char)(nBits & 0x7)) )); return __vsl( a, shifter ); } } // as above, but shift right template < uint nBits > FORCEINLINE fltx4 ShiftRightByBits(const fltx4 &a) { // hopefully the compiler, seeing nBits as a const immediate, elides these ifs if ( nBits >= 128 ) // WTF are you doing?! { return LoadZeroSIMD(); } else if ( nBits == 0 ) { return a; } else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first { fltx4 t = __vsldoi( (LoadZeroSIMD()), a, 16 - (nBits >> 3) ); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen. return ShiftRightByBits< (nBits & 0x7) >( t ); } else // we need to rotate by <= 7 bits { // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift. // the splat, however, does require an immediate. Go IBM! 
u32x4 shifter = u32x4 (__vspltisb( ((signed char)(nBits & 0x7)) )); return __vsr( a, shifter ); } } // find the lowest component of a.x, a.y, a.z, // and replicate it to the whole return value. // ignores a.w. // Though this is only five instructions long, // they are all dependent, making this stall city. // Forcing this inline should hopefully help with scheduling. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a ) { // a is [x,y,z,G] (where G is garbage) // rotate left by one fltx4 compareOne = a ; compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 ); // compareOne is [y,z,G,G] fltx4 retval = MinSIMD( a, compareOne ); // retVal is [min(x,y), min(y,z), G, G] compareOne = __vrlimi( compareOne, a, 8 , 2); // compareOne is [z, G, G, G] retval = MinSIMD( retval, compareOne ); // retVal = [ min(min(x,y),z), G, G, G ] // splat the x component out to the whole vector and return return SplatXSIMD( retval ); } // find the highest component of a.x, a.y, a.z, // and replicate it to the whole return value. // ignores a.w. // Though this is only five instructions long, // they are all dependent, making this stall city. // Forcing this inline should hopefully help with scheduling. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a ) { // a is [x,y,z,G] (where G is garbage) // rotate left by one fltx4 compareOne = a ; compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 ); // compareOne is [y,z,G,G] fltx4 retval = MaxSIMD( a, compareOne ); // retVal is [max(x,y), max(y,z), G, G] compareOne = __vrlimi( compareOne, a, 8 , 2); // compareOne is [z, G, G, G] retval = MaxSIMD( retval, compareOne ); // retVal = [ max(max(x,y),z), G, G, G ] // splat the x component out to the whole vector and return return SplatXSIMD( retval ); } // Transform many (horizontal) points in-place by a 3x4 matrix, // here already loaded onto three fltx4 registers. // The points must be stored as 16-byte aligned. They are points // and not vectors because we assume the w-component to be 1. // To spare yourself the annoyance of loading the matrix yourself, // use one of the overloads below. void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3); // Transform many (horizontal) points in-place by a 3x4 matrix. // The points must be stored as 16-byte aligned. They are points // and not vectors because we assume the w-component to be 1. // In this function, the matrix need not be aligned. FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) { return TransformManyPointsBy(pVectors, numVectors, LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) ); } // Transform many (horizontal) points in-place by a 3x4 matrix. // The points must be stored as 16-byte aligned. They are points // and not vectors because we assume the w-component to be 1. // In this function, the matrix must itself be aligned on a 16-byte // boundary. FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix) { return TransformManyPointsBy(pVectors, numVectors, LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) ); } // ------------------------------------ // INTEGER SIMD OPERATIONS. 
// ------------------------------------ // Load 4 aligned words into a SIMD register FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD) { return XMLoadVector4A(pSIMD); } // Load 4 unaligned words into a SIMD register FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD) { return XMLoadVector4( pSIMD ); } // save into four words, 16-byte aligned FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a ) { *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a; } FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a ) { *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a; } FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a ) { XMStoreVector4(pSIMD, a); } // Load four consecutive uint16's, and turn them into floating point numbers. // This function isn't especially fast and could be made faster if anyone is // using it heavily. FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts ) { return XMLoadUShort4(reinterpret_cast<const XMUSHORT4 *>(pInts)); } // a={ a.x, a.z, b.x, b.z } // combine two fltx4s by throwing away every other field. FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b ) { return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) ); } // a={ a.x, b.x, c.x, d.x } // combine 4 fltx4s by throwing away 3/4s of the fields // TODO: make more efficient by doing this in a parallel way at the caller // Compress4SIMD(FourVectors.. ) FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d ) { fltx4 abcd = __vrlimi( a, b, 4, 3 ); // a.x, b.x, a.z, a.w abcd = __vrlimi( abcd, c, 2, 2 ); // ax, bx, cx, aw abcd = __vrlimi( abcd, d, 1, 1 ); // ax, bx, cx, dx return abcd; } // construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w ) { // load the float into the low word of each vector register (this exploits the unaligned load op) fltx4 vx = __lvlx( &x, 0 ); fltx4 vy = __lvlx( &y, 0 ); fltx4 vz = __lvlx( &z, 0 ); fltx4 vw = __lvlx( &w, 0 ); return Compress4SIMD( vx, vy, vz, vw ); } // Take a fltx4 containing fixed-point uints and // return them as single precision floats. No // fixed point conversion is done. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { return __vcfux( vSrcA, 0 ); } // Take a fltx4 containing fixed-point sints and // return them as single precision floats. No // fixed point conversion is done. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA ) { return __vcfsx( vSrcA, 0 ); } // Take a fltx4 containing fixed-point uints and // return them as single precision floats. Each uint // will be divided by 2^immed after conversion // (eg, this is fixed point math). /* as if: FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) { return __vcfux( vSrcA, uImmed ); } */ #define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) )) // Take a fltx4 containing fixed-point sints and // return them as single precision floats. Each int // will be divided by 2^immed (eg, this is fixed point // math). /* as if: FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed ) { return __vcfsx( vSrcA, uImmed ); } */ #define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
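/**** a minimal usage sketch of ConvertStoreAsIntsSIMD (fl4Color is a
 hypothetical register of 0..1 floats):
	void QuantizeColor( const fltx4 &fl4Color, intx4 *pOut )
	{
		// scale to 0..255, then convert to ints and store in one step
		fltx4 fl4Scaled = MulSIMD( fl4Color, ReplicateX4( 255.0f ) );
		ConvertStoreAsIntsSIMD( pOut, fl4Scaled );
	}
****/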
// set all components of a vector to a signed immediate int number. /* as if: FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate) { return __vspltisw( toImmediate ); } */ #define IntSetImmediateSIMD(x) (__vspltisw(x)) /* works on fltx4's as if they are four uints. the first parameter contains the words to be shifted, the second contains the amount to shift by AS INTS for i = 0 to 3 shift = vSrcB_i*32:(i*32)+4 vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift */ FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB) { return __vslw(vSrcA, vSrcB); } FORCEINLINE float SubFloat( const fltx4 & a, int idx ) { // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) const fltx4_union & a_union = (const fltx4_union &)a; return a_union.m128_f32[ idx ]; } FORCEINLINE float & SubFloat( fltx4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_f32[idx]; } FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) { fltx4 t = __vctuxs( a, 0 ); const fltx4_union & a_union = (const fltx4_union &)t; return a_union.m128_u32[idx]; } FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) { const fltx4_union & a_union = (const fltx4_union &)a; return a_union.m128_u32[idx]; } FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) { fltx4_union & a_union = (fltx4_union &)a; return a_union.m128_u32[idx]; } #else //--------------------------------------------------------------------- // Intel/SSE implementation //--------------------------------------------------------------------- FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) { _mm_store_ps( pSIMD, a ); } FORCEINLINE void StoreAlignedSIMD( short * RESTRICT pSIMD, const shortx8 & a ) { _mm_store_si128( (shortx8 *)pSIMD, a ); } FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a ) { _mm_storeu_ps( pSIMD, a ); } FORCEINLINE void StoreUnalignedSIMD(short* RESTRICT pSIMD, const shortx8& a) { _mm_storeu_si128((shortx8*)pSIMD, a); } FORCEINLINE void StoreUnalignedFloat( float *pSingleFloat, const fltx4 & a ) { _mm_store_ss( pSingleFloat, a ); } FORCEINLINE fltx4 RotateLeft( const fltx4 & a ); FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ); FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a ) { _mm_store_ss(pSIMD, a); _mm_store_ss(pSIMD+1, RotateLeft(a)); _mm_store_ss(pSIMD+2, RotateLeft2(a)); } // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a ) { StoreAlignedSIMD( pSIMD->Base(),a ); } // Store the x,y,z components of the four FLTX4 parameters // into the four consecutive Vectors: // pDestination[0], pDestination[1], pDestination[2], pDestination[3] // The Vectors are assumed to be unaligned. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) Vector * const pDestination ) { StoreUnaligned3SIMD( pDestination->Base(), a ); StoreUnaligned3SIMD( (pDestination+1)->Base(), b ); StoreUnaligned3SIMD( (pDestination+2)->Base(), c ); StoreUnaligned3SIMD( (pDestination+3)->Base(), d ); }
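/**** a minimal usage sketch of StoreUnaligned3SIMD (fl4Result is a
 hypothetical register; pOut points at a plain Vector):
	void WriteXYZ( const fltx4 &fl4Result, Vector *pOut )
	{
		// writes only x,y,z -- whatever float follows the Vector in memory
		// is left untouched, unlike a full 4-wide store
		StoreUnaligned3SIMD( pOut->Base(), fl4Result );
	}
****/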
// Store the x,y,z components of the four FLTX4 parameters // into the four consecutive Vectors: // pDestination , pDestination + 1, pDestination + 2, pDestination + 3 // The Vectors are assumed to start on an ALIGNED address, that is, // pDestination is 16-byte aligned (though obviously pDestination+1 is not). FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate) Vector * const pDestination ) { StoreUnaligned3SIMD( pDestination->Base(), a ); StoreUnaligned3SIMD( (pDestination+1)->Base(), b ); StoreUnaligned3SIMD( (pDestination+2)->Base(), c ); StoreUnaligned3SIMD( (pDestination+3)->Base(), d ); } FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD ) { return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) ); } FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD ) { return _mm_load_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) ); } FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD ) { return _mm_loadu_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) ); } FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b { return _mm_and_ps( a, b ); } FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b { return _mm_andnot_ps( a, b ); } FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b { return _mm_xor_ps( a, b ); } FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b { return _mm_or_ps( a, b ); } // Squelch the w component of a vector to +0.0. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy) FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a ) { return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) ); } // for the transitional class -- load a 3-by VectorAligned and squash its w component FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD ) { return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) ); } FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD ) { return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); } FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD ) { return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) ); } // load a single unaligned float into the x component of a SIMD word FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt ) { return _mm_load_ss(pFlt); } /// replicate a single 32 bit integer value to all 4 components of an m128 FORCEINLINE fltx4 ReplicateIX4( int i ) { fltx4 value = _mm_set_ss( * ( ( float *) &i ) ); return _mm_shuffle_ps( value, value, 0); } FORCEINLINE fltx4 ReplicateX4( float flValue ) { __m128 value = _mm_set_ss( flValue ); return _mm_shuffle_ps( value, value, 0 ); } FORCEINLINE fltx4 ReplicateX4( const float * flValue ) { __m128 value = _mm_set_ss( *flValue ); return _mm_shuffle_ps( value, value, 0 ); } FORCEINLINE float SubFloat( const fltx4 & a, int idx ) { // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) #ifndef POSIX return a.m128_f32[ idx ]; #else return (reinterpret_cast<float const *>(&a))[idx]; #endif } FORCEINLINE float & SubFloat( fltx4 & a, int idx ) { #ifndef POSIX return a.m128_f32[ idx ]; #else return (reinterpret_cast<float *>(&a))[idx]; #endif } FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx ) { return (uint32)SubFloat(a,idx); } FORCEINLINE uint32 SubInt( const fltx4 & a, int idx ) { #ifndef POSIX return a.m128_u32[idx]; #else return (reinterpret_cast<uint32 const *>(&a))[idx]; #endif } FORCEINLINE uint32 & SubInt( fltx4 & a, int idx ) { #ifndef POSIX return a.m128_u32[idx]; #else return (reinterpret_cast<uint32 *>(&a))[idx]; #endif } // Return zero in the fastest way -- on the x360, faster even than loading. FORCEINLINE fltx4 LoadZeroSIMD( void ) { return Four_Zeros; } // Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void ) { return Four_Ones; } FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue ) { return OrSIMD( AndSIMD( ReplacementMask, NewValue ), AndNotSIMD( ReplacementMask, OldValue ) ); } // remember, the SSE numbers its words 3 2 1 0 // The way we want to specify shuffles is backwards from the default // MM_SHUFFLE_REV is in array index order (default is reversed) #define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a) FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) ); } FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) ); } FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) ); } FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 3, 3, 3 ) ); } FORCEINLINE fltx4 ShuffleXXYY( const fltx4 &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) ); } FORCEINLINE fltx4 ShuffleXYXY( const fltx4 &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); } FORCEINLINE fltx4 ShuffleZZWW( const fltx4 &a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) ); } FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x ) { fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a ); return result; } FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y ) { fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a ); return result; } FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z ) { fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a ); return result; } FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w ) { fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a ); return result; } FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue ) { fltx4 val = ReplicateX4( flValue ); fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] ), val, a ); return result; } // a b c d -> b c d a FORCEINLINE fltx4 RotateLeft( const fltx4 & a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) ); } // a b c d -> c d a b FORCEINLINE fltx4 RotateLeft2( const fltx4 & a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) ); } // a b c d -> d a b c FORCEINLINE fltx4 RotateRight( const fltx4 & a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) ); } // a b c d -> c d a b FORCEINLINE fltx4 RotateRight2( const fltx4 & a ) { return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) ); } FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b { return _mm_add_ps( a, b ); } FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b { return _mm_sub_ps( a, b ); }; FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b { return _mm_mul_ps( a, b ); }; FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b { return _mm_div_ps( a, b ); }; fltx4 ReciprocalEstSIMD( const fltx4 & a ); FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // Est(a/b) { return MulSIMD( ReciprocalEstSIMD( b ), a ); }; FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c { return AddSIMD( MulSIMD(a,b), c ); } FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b { return SubSIMD( c, MulSIMD(a,b) ); }; 
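/**** a minimal usage sketch of MaskedAssign:
	fltx4 ClampNegativeToZero( const fltx4 &fl4In )
	{
		// branchless per-component select: ~0 lanes take fl4In, 0 lanes take zero
		fltx4 fl4Mask = CmpGtSIMD( fl4In, Four_Zeros );	// defined just below
		return MaskedAssign( fl4Mask, fl4In, Four_Zeros );
	}
****/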
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b ) { fltx4 m = MulSIMD( a, b ); return AddSIMD( AddSIMD( SplatXSIMD(m), SplatYSIMD(m) ), SplatZSIMD(m) ); } FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b ) { // 4 instructions, serial, order of addition varies so individual elements may differ in the LSB on some CPUs fltx4 fl4Product = MulSIMD( a, b ); fltx4 fl4YXWZ = _mm_shuffle_ps( fl4Product, fl4Product, MM_SHUFFLE_REV(1,0,3,2) ); fltx4 fl4UUVV = AddSIMD( fl4Product, fl4YXWZ ); // U = X+Y; V = Z+W fltx4 fl4VVUU = RotateLeft2( fl4UUVV ); return AddSIMD( fl4UUVV, fl4VVUU ); } //TODO: implement as four-way Taylor series (see xbox implementation) FORCEINLINE fltx4 SinSIMD( const fltx4 &radians ) { fltx4 result; SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) ); SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) ); SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) ); SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) ); return result; } FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); } FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) { // FIXME: Make a fast SSE version SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) ); SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) ); SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) ); SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) ); } //TODO: implement as four-way Taylor series (see xbox implementation) FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine ) { // FIXME: Make a fast SSE version fltx4 result; SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) ); SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) ); SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) ); SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) ); return result; } FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs ) { fltx4 result; SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) ); SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) ); SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) ); SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) ); return result; } // tan^-1(a/b) .. ie, pass sin in as a and cos in as b FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b ) { fltx4 result; SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) ); SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) ); SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) ); SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) ); return result; } FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a { return SubSIMD(LoadZeroSIMD(),a); } FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set { return _mm_movemask_ps( a ); } FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0) { return (0 != TestSignSIMD( a )); } FORCEINLINE bool IsAnyTrue( const fltx4 & a ) { return (0 != TestSignSIMD( a )); } FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0 { return _mm_cmpeq_ps( a, b ); }
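/**** a minimal usage sketch of Dot3SIMD (fl4Dir holds a direction in x,y,z
 with garbage in w):
	fltx4 NormalizeDir( const fltx4 &fl4Dir )
	{
		// the squared length comes back splatted across all four lanes, so a
		// single multiply rescales x,y,z (and the garbage w)
		fltx4 fl4LenSq = Dot3SIMD( fl4Dir, fl4Dir );
		return MulSIMD( fl4Dir, ReciprocalSqrtSIMD( fl4LenSq ) );	// defined below
	}
****/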
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0 { return _mm_cmpgt_ps( a, b ); } FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0 { return _mm_cmpge_ps( a, b ); } FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0 { return _mm_cmplt_ps( a, b ); } FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0 { return _mm_cmple_ps( a, b ); } // for branching when a.xyzw > b.xyzw FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b ) { return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0; } // for branching when a.xyzw >= b.xyzw FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b ) { return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0; } // For branching if all a.xyzw == b.xyzw FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b ) { return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf; } FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0 { return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) ); } FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b) { return _mm_min_ps( a, b ); } FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b) { return _mm_max_ps( a, b ); } // SSE lacks rounding operations. // Really. // You can emulate them by setting the rounding mode for the // whole processor and then converting to int, and then back again. // But every time you set the rounding mode, you clear out the // entire pipeline. So, I can't do them per operation. You // have to do it once, before the loop that would call these. // Round towards positive infinity FORCEINLINE fltx4 CeilSIMD( const fltx4 &a ) { fltx4 retVal; SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) ); SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) ); SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) ); SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) ); return retVal; } fltx4 AbsSIMD( const fltx4 & x ); // To make it more coherent with the whole API (the whole SIMD API is postfixed with SIMD except a couple of methods. Well...) fltx4 fabs( const fltx4 & x ); // Round towards negative infinity // This is the implementation that was here before; it assumes // you are in round-to-floor mode, which I guess is usually the // case for us vis-a-vis SSE. It's totally unnecessary on // VMX, which has a native floor op. FORCEINLINE fltx4 FloorSIMD( const fltx4 &val ) { fltx4 fl4Abs = fabs( val ); fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits } FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
{ return TestSignSIMD( CmpEqSIMD( a, Four_Zeros ) ) != 0; } inline bool IsAllZeros( const fltx4 & var ) { return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF; } FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less { return _mm_sqrt_ps( a ); } FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a) { return _mm_sqrt_ps( a ); } FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less { return _mm_rsqrt_ps( a ); } FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a ) { fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); ret = ReciprocalSqrtEstSIMD( ret ); return ret; } /// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a) { fltx4 guess = ReciprocalSqrtEstSIMD( a ); // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2)); guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess )))); guess = MulSIMD( Four_PointFives, guess); return guess; } FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less { return _mm_rcp_ps( a ); } /// 1/x for all 4 values, more or less /// 1/0 will result in a big but NOT infinite result FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a ) { fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); ret = ReciprocalEstSIMD( ret ); return ret; } /// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration. /// No error checking! FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a { fltx4 ret = ReciprocalEstSIMD( a ); // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2 ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) ); return ret; } /// 1/x for all 4 values. /// 1/0 will result in a big but NOT infinite result FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a ) { fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros ); fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) ); ret = ReciprocalSIMD( ret ); return ret; } // CHRISG: is it worth doing integer bitfiddling for this? // 2^x for all values (the antilog) FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower ) { fltx4 retval; SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) ); SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) ); SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) ); SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) ); return retval; } // Clamps the components of a vector to a specified minimum and maximum range. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max) { return MaxSIMD( min, MinSIMD( max, in ) ); } FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w) { _MM_TRANSPOSE4_PS( x, y, z, w ); } FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a ) { // a is [x,y,z,G] (where G is garbage) // rotate left by one fltx4 compareOne = RotateLeft( a ); // compareOne is [y,z,G,x] fltx4 retval = MinSIMD( a, compareOne ); // retVal is [min(x,y), ... ] compareOne = RotateLeft2( a ); // compareOne is [z, G, x, y] retval = MinSIMD( retval, compareOne ); // retVal = [ min(min(x,y),z)..] 
	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = RotateLeft( a );
	// compareOne is [y,z,G,x]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), ... ]
	compareOne = RotateLeft2( a );
	// compareOne is [z, G, x, y]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z)..]
	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}

inline bool IsVector3LessThan( const fltx4 &v1, const fltx4 &v2 )
{
	bi32x4 isOut = CmpLtSIMD( v1, v2 );
	return IsAnyNegative( isOut );
}

inline bool IsVector4LessThan( const fltx4 &v1, const fltx4 &v2 )
{
	bi32x4 isOut = CmpLtSIMD( v1, v2 );
	return IsAnyNegative( isOut );
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

#if 0				/* pc does not have these ops */
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD( int to )
{
	//CHRISG: SSE2 has this, but not SSE1. What to do?
	fltx4 retval;
	SubInt( retval, 0 ) = to;
	SubInt( retval, 1 ) = to;
	SubInt( retval, 2 ) = to;
	SubInt( retval, 3 ) = to;
	return retval;
}
#endif

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD )
{
	return _mm_load_ps( reinterpret_cast<const float *>( pSIMD ) );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD )
{
	return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_store_ps( reinterpret_cast<float *>( pSIMD ), a );
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	_mm_store_ps( reinterpret_cast<float *>( pSIMD.Base() ), a );
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_storeu_ps( reinterpret_cast<float *>( pSIMD ), a );
}

// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
{
	return _mm_shuffle_ps( a, b, MM_SHUFFLE_REV( 0, 2, 0, 2 ) );
}

// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
#ifdef POSIX
	fltx4 retval;
	SubFloat( retval, 0 ) = pInts[0];
	SubFloat( retval, 1 ) = pInts[1];
	SubFloat( retval, 2 ) = pInts[2];
	SubFloat( retval, 3 ) = pInts[3];
	return retval;
#else
	__m128i inA = _mm_loadl_epi64( (__m128i const*) pInts ); // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result.
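	// The unpack below interleaves each uint16 with zeros, producing
	// non-negative 32-bit ints; _mm_cvtepi32_ps can then convert them
	// exactly, since values in 0..65535 fit in a float's mantissa.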
	inA = _mm_unpacklo_epi16( inA, _mm_setzero_si128() ); // unpack unsigned 16's to signed 32's
	return _mm_cvtepi32_ps( inA );
#endif
}

// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
{
	fltx4 aacc = _mm_shuffle_ps( a, c, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
	fltx4 bbdd = _mm_shuffle_ps( b, d, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
	return MaskedAssign( LoadAlignedSIMD( g_SIMD_EveryOtherMask ), bbdd, aacc );
}

// outa={ a.x, a.x, a.y, a.y }, outb={ a.z, a.z, a.w, a.w }
FORCEINLINE void ExpandSIMD( fltx4 const &a, fltx4 &fl4OutA, fltx4 &fl4OutB )
{
	fl4OutA = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) );
	fl4OutB = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) );
}

// construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
{
	// load the float into the low word of each vector register (this exploits the unaligned load op)
	fltx4 vx = _mm_load_ss( &x );
	fltx4 vy = _mm_load_ss( &y );
	fltx4 vz = _mm_load_ss( &z );
	fltx4 vw = _mm_load_ss( &w );
	return Compress4SIMD( vx, vy, vz, vw );
}

// CHRISG: the conversion functions all seem to operate on m64's only...
// how do we make them work here?

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
	SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
	SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
	SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
	return retval;
}

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	return _mm_cvtepi32_ps( (const __m128i &)vSrcA );
}

FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const shortx8 &vSrcA )
{
	return _mm_cvtepi32_ps( vSrcA );
}

#if 0
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) ( reinterpret_cast<const int32 *>( &vSrcA )[0] ) );
	SubFloat( retval, 1 ) = ( (float) ( reinterpret_cast<const int32 *>( &vSrcA )[1] ) );
	SubFloat( retval, 2 ) = ( (float) ( reinterpret_cast<const int32 *>( &vSrcA )[2] ) );
	SubFloat( retval, 3 ) = ( (float) ( reinterpret_cast<const int32 *>( &vSrcA )[3] ) );
	return retval;
}
#endif

/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
    shift = vSrcB_i*32:(i*32)+4
    vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD( const i32x4 &vSrcA, const i32x4 &vSrcB )
{
	i32x4 retval;
	SubInt( retval, 0 ) = SubInt( vSrcA, 0 ) << SubInt( vSrcB, 0 );
	SubInt( retval, 1 ) = SubInt( vSrcA, 1 ) << SubInt( vSrcB, 1 );
	SubInt( retval, 2 ) = SubInt( vSrcA, 2 ) << SubInt( vSrcB, 2 );
	SubInt( retval, 3 ) = SubInt( vSrcA, 3 ) << SubInt( vSrcB, 3 );
	return retval;
}
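// Usage sketch (illustrative): round-trip four ints through SIMD registers with
// the helpers above and ConvertStoreAsIntsSIMD (defined just below):
//     intx4 nIn;
//     nIn[0] = 1; nIn[1] = 2; nIn[2] = 3; nIn[3] = 4;
//     fltx4 fl4 = SignedIntConvertToFltSIMD( LoadAlignedIntSIMD( nIn.Base() ) );
//     intx4 nOut;
//     ConvertStoreAsIntsSIMD( &nOut, fl4 );		// nOut == { 1, 2, 3, 4 }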
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing fixed point conversion
// when the fix depth is specified as an immediate, but there is no way
// to guarantee an immediate as a parameter to a function like this.
FORCEINLINE void ConvertStoreAsIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &vSrc )
{
#if defined(_MSC_VER) && _MSC_VER >= 1900 && defined(COMPILER_MSVC64)
	(*pDest)[0] = (int)SubFloat( vSrc, 0 );
	(*pDest)[1] = (int)SubFloat( vSrc, 1 );
	(*pDest)[2] = (int)SubFloat( vSrc, 2 );
	(*pDest)[3] = (int)SubFloat( vSrc, 3 );
#else
	__m64 bottom = _mm_cvttps_pi32( vSrc );
	__m64 top = _mm_cvttps_pi32( _mm_movehl_ps( vSrc, vSrc ) );

	*reinterpret_cast<__m64 *>( &(*pDest)[0] ) = bottom;
	*reinterpret_cast<__m64 *>( &(*pDest)[2] ) = top;

	_mm_empty();
#endif
}

#endif

// a={ a.y, a.z, a.w, b.x } b={ b.y, b.z, b.w, b.x }
FORCEINLINE void RotateLeftDoubleSIMD( fltx4 &a, fltx4 &b )
{
	a = SetWSIMD( RotateLeft( a ), SplatXSIMD( b ) );
	b = RotateLeft( b );
}

//
// Some convenience operator overloads, which are just aliasing the functions above.
// Unnecessary on 360, as you already have them from xboxmath.h (same for PS3 PPU and SPU)
#if !defined(PLATFORM_PPC) && !defined( POSIX ) && !defined(SPU)
#if 1  // TODO: verify generation of non-bad code.
// Componentwise add
FORCEINLINE fltx4 operator+( FLTX4 a, FLTX4 b )
{
	return AddSIMD( a, b );
}

// Componentwise subtract
FORCEINLINE fltx4 operator-( FLTX4 a, FLTX4 b )
{
	return SubSIMD( a, b );
}

// Componentwise multiply
FORCEINLINE fltx4 operator*( FLTX4 a, FLTX4 b )
{
	return MulSIMD( a, b );
}

// No divide. You need to think carefully about whether you want a reciprocal
// or a reciprocal estimate.

// bitwise and
FORCEINLINE fltx4 operator&( FLTX4 a, FLTX4 b )
{
	return AndSIMD( a, b );
}

// bitwise or
FORCEINLINE fltx4 operator|( FLTX4 a, FLTX4 b )
{
	return OrSIMD( a, b );
}

// bitwise xor
FORCEINLINE fltx4 operator^( FLTX4 a, FLTX4 b )
{
	return XorSIMD( a, b );
}

// unary negate
FORCEINLINE fltx4 operator-( FLTX4 a )
{
	return NegSIMD( a );
}
#endif // 1
#endif

#if defined(_X360) || defined(_PS3)
FORCEINLINE fltx4 VectorMergeHighSIMD( fltx4 fl4SrcA, fltx4 fl4SrcB )
{
#if defined( _X360 )
	return __vmrghw( fl4SrcA, fl4SrcB );
#else
	return vec_mergeh( fl4SrcA, fl4SrcB );
#endif
}

FORCEINLINE fltx4 VectorMergeLowSIMD( fltx4 fl4SrcA, fltx4 fl4SrcB )
{
#if defined( _X360 )
	return __vmrglw( fl4SrcA, fl4SrcB );
#else
	return vec_mergel( fl4SrcA, fl4SrcB );
#endif
}
#endif

#ifndef SPU
// fourplanes_t, Frustum_t are not supported on SPU
// It would make sense to support FourVectors on SPU at some point.
struct ALIGN16 fourplanes_t { fltx4 nX; fltx4 nY; fltx4 nZ; fltx4 dist; bi32x4 xSign; bi32x4 ySign; bi32x4 zSign; fltx4 nXAbs; fltx4 nYAbs; fltx4 nZAbs; void ComputeSignbits(); // fast SIMD loads void Set4Planes( const VPlane *pPlanes ); void Set2Planes( const VPlane *pPlanes ); void Get4Planes( VPlane *pPlanesOut ) const; void Get2Planes( VPlane *pPlanesOut ) const; // not-SIMD, much slower void GetPlane( int index, Vector *pNormal, float *pDist ) const; void SetPlane( int index, const Vector &vecNormal, float planeDist ); }; class ALIGN16 Frustum_t { public: Frustum_t(); void SetPlane( int i, const Vector &vecNormal, float dist ); void GetPlane( int i, Vector *pNormalOut, float *pDistOut ) const; void SetPlanes( const VPlane *pPlanes ); void GetPlanes( VPlane *pPlanesOut ) const; // returns false if the box is within the frustum, true if it is outside bool CullBox( const Vector &mins, const Vector &maxs ) const; bool CullBoxCenterExtents( const Vector ¢er, const Vector &extents ) const; bool CullBox( const fltx4 &fl4Mins, const fltx4 &fl4Maxs ) const; bool CullBoxCenterExtents( const fltx4 &fl4Center, const fltx4 &fl4Extents ) const; // Return true if frustum contains this bounding volume, false if any corner is outside bool Contains( const Vector &mins, const Vector &maxs ) const; // Return true if this frustum intersects the frustum, false if it is outside bool Intersects( Frustum_t &otherFrustum ) const; // Return true if this bounding volume intersects the frustum, false if it is outside bool Intersects( const Vector &mins, const Vector &maxs ) const; bool IntersectsCenterExtents( const Vector ¢er, const Vector &extents ) const; bool Intersects( const fltx4 &fl4Mins, const fltx4 &fl4Maxs ) const; bool IntersectsCenterExtents( const fltx4 &fl4Center, const fltx4 &fl4Extents ) const; void CreatePerspectiveFrustum( const Vector& origin, const Vector &forward, const Vector &right, const Vector &up, float flZNear, float flZFar, float flFovX, float flAspect ); void CreatePerspectiveFrustumFLU( const Vector& vOrigin, const Vector &vForward, const Vector &vLeft, const Vector &vUp, float flZNear, float flZFar, float flFovX, float flAspect ); // Version that accepts angles instead of vectors void CreatePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear, float flZFar, float flFovX, float flAspectRatio ); // Generate a frustum based on orthographic parameters void CreateOrthoFrustum( const Vector &origin, const Vector &forward, const Vector &right, const Vector &up, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar ); void CreateOrthoFrustumFLU( const Vector &vOrigin, const Vector &vForward, const Vector &vLeft, const Vector &vUp, float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar ); // The points returned correspond to the corners of the frustum faces // Points 0 to 3 correspond to the near face // Points 4 to 7 correspond to the far face // Returns points in a face in this order: // 2--3 // | | // 0--1 // Returns false if a corner couldn't be generated for some reason. bool GetCorners( Vector *pPoints ) const; fourplanes_t planes[2]; }; #endif class FourQuaternions; /// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are /// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated. 
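/// For example (illustrative): loading the four vectors (1,2,3), (4,5,6),
/// (7,8,9) and (10,11,12) gives
///     x = [ 1 4 7 10 ], y = [ 2 5 8 11 ], z = [ 3 6 9 12 ]
/// so a single fltx4 operation on x advances the x component of all four
/// vectors at once.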
class ALIGN16 FourVectors { public: fltx4 x, y, z; FourVectors(void) { } FourVectors( FourVectors const &src ) { x=src.x; y=src.y; z=src.z; } explicit FORCEINLINE FourVectors( float a ) { fltx4 aReplicated = ReplicateX4( a ); x = y = z = aReplicated; } FORCEINLINE void Init( void ) { x = Four_Zeros; y = Four_Zeros; z = Four_Zeros; } FORCEINLINE void Init( float flX, float flY, float flZ ) { x = ReplicateX4( flX ); y = ReplicateX4( flY ); z = ReplicateX4( flZ ); } FORCEINLINE FourVectors( float flX, float flY, float flZ ) { Init( flX, flY, flZ ); } FORCEINLINE void Init( fltx4 const &fl4X, fltx4 const &fl4Y, fltx4 const &fl4Z ) { x = fl4X; y = fl4Y; z = fl4Z; } FORCEINLINE FourVectors( fltx4 const &fl4X, fltx4 const &fl4Y, fltx4 const &fl4Z ) { Init( fl4X, fl4Y, fl4Z ); } /// construct a FourVectors from 4 separate Vectors FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d) { LoadAndSwizzle(a,b,c,d); } /// construct a FourVectors from 4 separate Vectors FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d) { LoadAndSwizzleAligned(a,b,c,d); } // construct from twelve floats; really only useful for static const constructors. // input arrays must be aligned, and in the fourvectors' native format // (eg in xxxx,yyyy,zzzz form) // each pointer should be to an aligned array of four floats FORCEINLINE FourVectors( const float *xs , const float *ys, const float *zs ) : x( LoadAlignedSIMD(xs) ), y( LoadAlignedSIMD(ys) ), z( LoadAlignedSIMD(zs) ) {}; FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value { x=ReplicateX4(v.x); y=ReplicateX4(v.y); z=ReplicateX4(v.z); } FORCEINLINE fltx4 const & operator[](int idx) const { return *((&x)+idx); } FORCEINLINE fltx4 & operator[](int idx) { return *((&x)+idx); } FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors { x=AddSIMD(x,b.x); y=AddSIMD(y,b.y); z=AddSIMD(z,b.z); } FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4 { x=SubSIMD(x,b.x); y=SubSIMD(y,b.y); z=SubSIMD(z,b.z); } FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale { x=MulSIMD(x,b.x); y=MulSIMD(y,b.y); z=MulSIMD(z,b.z); } FORCEINLINE void operator*=(const fltx4 & scale) //< scale { x=MulSIMD(x,scale); y=MulSIMD(y,scale); z=MulSIMD(z,scale); } FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors { fltx4 scalepacked = ReplicateX4(scale); *this *= scalepacked; } FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products { fltx4 dot=MulSIMD(x,b.x); dot=MaddSIMD(y,b.y,dot); dot=MaddSIMD(z,b.z,dot); return dot; } FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector { fltx4 dot=MulSIMD(x,ReplicateX4(b.x)); dot=MaddSIMD(y,ReplicateX4(b.y), dot); dot=MaddSIMD(z,ReplicateX4(b.z), dot); return dot; } FORCEINLINE FourVectors operator*(float b) const //< scale { fltx4 scalepacked = ReplicateX4( b ); FourVectors res; res.x = MulSIMD( x, scalepacked ); res.y = MulSIMD( y, scalepacked ); res.z = MulSIMD( z, scalepacked ); return res; } FORCEINLINE FourVectors operator*( FLTX4 fl4Scale ) const //< scale { FourVectors res; res.x = MulSIMD( x, fl4Scale ); res.y = MulSIMD( y, fl4Scale ); res.z = MulSIMD( z, fl4Scale ); return res; } FORCEINLINE void VProduct( FourVectors const &b ) //< component by component mul { x=MulSIMD(x,b.x); y=MulSIMD(y,b.y); z=MulSIMD(z,b.z); } 
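	// Usage sketch (illustrative): operator* on two FourVectors yields four
	// dot products packed into one fltx4:
	//     fltx4 fl4Dots = v4A * v4B;	// SubFloat( fl4Dots, i ) is the dot product of the i'th pair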
FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z) { x=ReciprocalSIMD(x); y=ReciprocalSIMD(y); z=ReciprocalSIMD(z); } FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23 { x=ReciprocalSaturateSIMD(x); y=ReciprocalSaturateSIMD(y); z=ReciprocalSaturateSIMD(z); } // Assume the given matrix is a rotation, and rotate these vectors by it. // If you have a long list of FourVectors structures that you all want // to rotate by the same matrix, use FourVectors::RotateManyBy() instead. inline void RotateBy(const matrix3x4_t& matrix); /***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function: // rotate these vectors ( in place ) by the corresponding quaternions: inline void RotateBy( const FourQuaternions &quats ); ******/ /// You can use this to rotate a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the /// number of vectors to rotate. The third is the matrix. static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ); /// Assume the vectors are points, and transform them in place by the matrix. inline void TransformBy(const matrix3x4_t& matrix); /// You can use this to Transform a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the /// number of vectors to rotate. The third is the matrix. The fourth is the /// output buffer, which must not overlap the pVectors buffer. This is not /// an in-place transformation. static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut ); /// You can use this to Transform a long array of FourVectors all by the same /// matrix. The first parameter is the head of the array. The second is the /// number of vectors to rotate. The third is the matrix. The fourth is the /// output buffer, which must not overlap the pVectors buffer. /// This is an in-place transformation. static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix ); static void CalcClosestPointOnLineSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vClosest, fltx4 *outT = 0); static fltx4 CalcClosestPointToLineTSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vDir ); // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector. FORCEINLINE const float & X(int idx) const { // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!) 
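		// (On the PPC consoles, a vector register has to round-trip through
		// memory to be read as a scalar float, so extract components sparingly
		// and keep the math in SIMD registers where possible.)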
	return SubFloat( (fltx4 &)x, idx );
}

FORCEINLINE const float & Y(int idx) const
{
	return SubFloat( (fltx4 &)y, idx );
}

FORCEINLINE const float & Z(int idx) const
{
	return SubFloat( (fltx4 &)z, idx );
}

FORCEINLINE float & X(int idx)
{
	return SubFloat( x, idx );
}

FORCEINLINE float & Y(int idx)
{
	return SubFloat( y, idx );
}

FORCEINLINE float & Z(int idx)
{
	return SubFloat( z, idx );
}

FORCEINLINE Vector Vec(int idx) const						//< unpack one of the vectors
{
	return Vector( X(idx), Y(idx), Z(idx) );
}

FORCEINLINE void operator=( FourVectors const &src )
{
	x=src.x;
	y=src.y;
	z=src.z;
}

/// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
{
	// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
	// use an unfolded implementation here
#if defined( _X360 ) || defined(_PS3)
	fltx4 tx = LoadUnalignedSIMD( &a.x );
	fltx4 ty = LoadUnalignedSIMD( &b.x );
	fltx4 tz = LoadUnalignedSIMD( &c.x );
	fltx4 tw = LoadUnalignedSIMD( &d.x );
	fltx4 r0 = VectorMergeHighSIMD(tx, tz);
	fltx4 r1 = VectorMergeHighSIMD(ty, tw);
	fltx4 r2 = VectorMergeLowSIMD(tx, tz);
	fltx4 r3 = VectorMergeLowSIMD(ty, tw);
	x = VectorMergeHighSIMD(r0, r1);
	y = VectorMergeLowSIMD(r0, r1);
	z = VectorMergeHighSIMD(r2, r3);
#else
	x = LoadUnalignedSIMD( &( a.x ));
	y = LoadUnalignedSIMD( &( b.x ));
	z = LoadUnalignedSIMD( &( c.x ));
	fltx4 w = LoadUnalignedSIMD( &( d.x ));
	// now, matrix is:
	// x y z ?
	// x y z ?
	// x y z ?
	// x y z ?
	TransposeSIMD(x, y, z, w);
#endif
}

FORCEINLINE void LoadAndSwizzle(Vector const &a)
{
	LoadAndSwizzle( a, a, a, a );
}

// Broadcasts a, b, c, and d into the four vectors
// This is only performant if the floats are ALREADY IN MEMORY
// and not on registers -- eg,
// .Load( &fltArray[0], &fltArray[1], &fltArray[2], &fltArray[3] ) is okay,
// .Load( fltArray[0] * 0.5f, fltArray[1] * 0.5f, fltArray[2] * 0.5f, fltArray[3] * 0.5f ) is not.
FORCEINLINE void Load( const float &a, const float &b, const float &c, const float &d )
{
#if defined( _X360 ) || defined( _PS3 )
	fltx4 temp[4];
	temp[0] = LoadUnalignedFloatSIMD( &a );
	temp[1] = LoadUnalignedFloatSIMD( &b );
	temp[2] = LoadUnalignedFloatSIMD( &c );
	temp[3] = LoadUnalignedFloatSIMD( &d );
	y = VectorMergeHighSIMD( temp[0], temp[2] );	// ac__
	z = VectorMergeHighSIMD( temp[1], temp[3] );	// bd__
	x = VectorMergeHighSIMD( y, z );				// abcd
	y = x;
	z = x;
#else
	ALIGN16 float temp[4];
	temp[0] = a;
	temp[1] = b;
	temp[2] = c;
	temp[3] = d;
	fltx4 v = LoadAlignedSIMD( temp );
	x = v;
	y = v;
	z = v;
#endif
}

// transform four horizontal vectors into the internal vertical ones
FORCEINLINE void LoadAndSwizzle( FLTX4 a, FLTX4 b, FLTX4 c, FLTX4 d )
{
#if defined( _X360 ) || defined( _PS3 )
	fltx4 tx = a;
	fltx4 ty = b;
	fltx4 tz = c;
	fltx4 tw = d;
	fltx4 r0 = VectorMergeHighSIMD(tx, tz);
	fltx4 r1 = VectorMergeHighSIMD(ty, tw);
	fltx4 r2 = VectorMergeLowSIMD(tx, tz);
	fltx4 r3 = VectorMergeLowSIMD(ty, tw);
	x = VectorMergeHighSIMD(r0, r1);
	y = VectorMergeLowSIMD(r0, r1);
	z = VectorMergeHighSIMD(r2, r3);
#else
	x = a;
	y = b;
	z = c;
	fltx4 w = d;
	// now, matrix is:
	// x y z ?
	// x y z ?
	// x y z ?
	// x y z ?
	TransposeSIMD(x, y, z, w);
#endif
}
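	// Usage sketch (illustrative):
	//     FourVectors v4;
	//     v4.LoadAndSwizzle( vecA, vecB, vecC, vecD );
	//     // v4.x == [ vecA.x vecB.x vecC.x vecD.x ], likewise for y and z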
/// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
/// all 4 vectors must be on 128-bit boundaries
FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
{
#if defined( _X360 ) || defined( _PS3 )
	fltx4 tx = LoadAlignedSIMD(a);
	fltx4 ty = LoadAlignedSIMD(b);
	fltx4 tz = LoadAlignedSIMD(c);
	fltx4 tw = LoadAlignedSIMD(d);
	fltx4 r0 = VectorMergeHighSIMD(tx, tz);
	fltx4 r1 = VectorMergeHighSIMD(ty, tw);
	fltx4 r2 = VectorMergeLowSIMD(tx, tz);
	fltx4 r3 = VectorMergeLowSIMD(ty, tw);
	x = VectorMergeHighSIMD(r0, r1);
	y = VectorMergeLowSIMD(r0, r1);
	z = VectorMergeHighSIMD(r2, r3);
#else
	x = LoadAlignedSIMD( a );
	y = LoadAlignedSIMD( b );
	z = LoadAlignedSIMD( c );
	fltx4 w = LoadAlignedSIMD( d );
	// now, matrix is:
	// x y z ?
	// x y z ?
	// x y z ?
	// x y z ?
	TransposeSIMD( x, y, z, w );
#endif
}

FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
{
	LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x );
}

/// Unpack a FourVectors back into four horizontal fltx4s.
/// Since the FourVectors doesn't store a w row, you can optionally
/// specify your own; otherwise it will be 0.
/// This function ABSOLUTELY MUST be inlined or the reference parameters will
/// induce a severe load-hit-store.
FORCEINLINE void TransposeOnto( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3, FLTX4 w = Four_Zeros ) const
{
	// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
	// use an unfolded implementation here
#if defined( _X360 ) || defined(_PS3)
	fltx4 r0 = VectorMergeHighSIMD(x, z);
	fltx4 r1 = VectorMergeHighSIMD(y, w);
	fltx4 r2 = VectorMergeLowSIMD(x, z);
	fltx4 r3 = VectorMergeLowSIMD(y, w);
	out0 = VectorMergeHighSIMD(r0, r1);
	out1 = VectorMergeLowSIMD(r0, r1);
	out2 = VectorMergeHighSIMD(r2, r3);
	out3 = VectorMergeLowSIMD(r2, r3);
#else
	out0 = x;
	out1 = y;
	out2 = z;
	out3 = w;
	TransposeSIMD(out0, out1, out2, out3);
#endif
}

#if !defined(__SPU__)
	/// Store a FourVectors into four NON-CONTIGUOUS Vector*'s.
	FORCEINLINE void StoreUnalignedVector3SIMD( Vector * RESTRICT out0, Vector * RESTRICT out1, Vector * RESTRICT out2, Vector * RESTRICT out3 ) const;
#endif

	/// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s.
	FORCEINLINE void StoreAlignedVectorSIMD( VectorAligned * RESTRICT out0, VectorAligned * RESTRICT out1, VectorAligned * RESTRICT out2, VectorAligned * RESTRICT out3 ) const;

#if !defined(__SPU__)
	/// Store a FourVectors into four CONSECUTIVE Vectors in memory,
	/// where the first vector IS NOT aligned on a 16-byte boundary.
	FORCEINLINE void StoreUnalignedContigVector3SIMD( Vector * RESTRICT pDestination )
	{
		fltx4 a,b,c,d;
		TransposeOnto(a,b,c,d);
		StoreFourUnalignedVector3SIMD( a, b, c, d, pDestination );
	}
#endif
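	// Usage sketch (illustrative): TransposeOnto turns the vertical x/y/z rows
	// back into four horizontal vectors:
	//     fltx4 a, b, c, d;
	//     v4.TransposeOnto( a, b, c, d );	// a == ( x0, y0, z0, 0 ), etc.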
	/// Store a FourVectors into four CONSECUTIVE Vectors in memory,
	/// where the first vector IS aligned on a 16-byte boundary.
	/// (since four Vector3s = 48 bytes, groups of four can be said
	/// to be 16-byte aligned though obviously the 2nd, 3rd, and 4th
	/// vectors in the group individually are not)
#if !defined(__SPU__)
	FORCEINLINE void StoreAlignedContigVector3SIMD( Vector * RESTRICT pDestination )
	{
		fltx4 a,b,c,d;
		TransposeOnto(a,b,c,d);
		StoreFourAlignedVector3SIMD( a, b, c, d, pDestination );
	}

	/// Store a FourVectors into four CONSECUTIVE VectorAligneds in memory
	FORCEINLINE void StoreAlignedContigVectorASIMD( VectorAligned * RESTRICT pDestination )
	{
		StoreAlignedVectorSIMD( pDestination, pDestination + 1, pDestination + 2, pDestination + 3 );
	}
#endif

	/// return the squared length of all 4 vectors, the same name as used on Vector
	FORCEINLINE fltx4 LengthSqr( void ) const
	{
		const FourVectors &a = *this;
		return a * a;
	}

	/// return the squared length of all 4 vectors
	FORCEINLINE fltx4 length2(void) const
	{
		return (*this)*(*this);
	}

	/// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
	FORCEINLINE fltx4 length(void) const
	{
		return SqrtEstSIMD(length2());
	}

	/// full precision square root. The mixed casing is an artifact: the lower-case
	/// length() should be renamed to reflect its lower accuracy; the mixed-case
	/// Length() was added for compatibility with Vector.
	FORCEINLINE fltx4 Length( void ) const
	{
		return SqrtSIMD( length2() );
	}

	/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
	FORCEINLINE void VectorNormalizeFast(void)
	{
		fltx4 mag_sq=(*this)*(*this);				// length^2
		(*this) *= ReciprocalSqrtEstSIMD(mag_sq);	// *(1.0/sqrt(length^2))
	}

	/// normalize all 4 vectors in place.
	FORCEINLINE void VectorNormalize(void)
	{
		fltx4 mag_sq=(*this)*(*this);				// length^2
		(*this) *= ReciprocalSqrtSIMD(mag_sq);		// *(1.0/sqrt(length^2))
	}

	FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt )
	{
		fltx4 fl4dX = SubSIMD( pnt.x, x );
		fltx4 fl4dY = SubSIMD( pnt.y, y );
		fltx4 fl4dZ = SubSIMD( pnt.z, z );
		return AddSIMD( MulSIMD( fl4dX, fl4dX ), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) );
	}

	FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const
	{
		FourVectors lineDelta = p1;
		lineDelta -= p0;
		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( p1 * p1 );
		FourVectors v4OurPnt = *this;
		v4OurPnt -= p0;
		return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
	}

	FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const
	{
		FourVectors lineDelta = p1;
		FourVectors v4OurPnt = *this;
		v4OurPnt -= p0;
		lineDelta -= p0;

		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );

		fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );

		fl4T = MinSIMD( fl4T, Four_Ones );
		fl4T = MaxSIMD( fl4T, Four_Zeros );
		lineDelta *= fl4T;
		return v4OurPnt.DistToSqr( lineDelta );
	}

	FORCEINLINE FourVectors Normalized() const
	{
		fltx4 fl4LengthInv = ReciprocalSqrtSIMD( LengthSqr() );
		FourVectors out;
		out.x = x * fl4LengthInv;
		out.y = y * fl4LengthInv;
		out.z = z * fl4LengthInv;
		return out;
	}

	FORCEINLINE FourVectors NormalizedSafeX() const
	{
		fltx4 f4LenSqr = LengthSqr();
		fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
		fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
		FourVectors out;
		out.x = MaskedAssign( isBigEnough, x * fl4LengthInv, Four_Ones );
		out.y = AndSIMD( y * fl4LengthInv, isBigEnough );
		out.z = AndSIMD( z * fl4LengthInv, isBigEnough );
		return out;
	}

	FORCEINLINE FourVectors NormalizedSafeY() const
	{
		fltx4 f4LenSqr = LengthSqr();
		fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
		fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
		FourVectors out;
		out.x = AndSIMD( x * fl4LengthInv, isBigEnough );
		out.y = MaskedAssign( isBigEnough, y * fl4LengthInv, Four_Ones );
		out.z = AndSIMD( z * fl4LengthInv, isBigEnough );
		return out;
	}

	FORCEINLINE FourVectors NormalizedSafeZ() const
	{
		fltx4 f4LenSqr = LengthSqr();
		fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
		fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
		FourVectors out;
		out.x = AndSIMD( x * fl4LengthInv, isBigEnough );
		out.y = AndSIMD( y * fl4LengthInv, isBigEnough );
		out.z = MaskedAssign( isBigEnough, z * fl4LengthInv, Four_Ones );
		return out;
	}
};

inline FourVectors CrossProduct( const FourVectors& a, const FourVectors& b )
{
	return FourVectors( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x );
}

inline fltx4 DotProduct( const FourVectors& a, const FourVectors& b )
{
	return a.x * b.x + a.y * b.y + a.z * b.z;
}

inline FourVectors operator * ( fltx4 left, const FourVectors &right )
{
	return right * left;
}

//
inline FourVectors Mul( const FourVectors &a, const fltx4 &b )
{
	FourVectors ret;
	ret.x = MulSIMD( a.x, b );
	ret.y = MulSIMD( a.y, b );
	ret.z = MulSIMD( a.z, b );
	return ret;
}

inline FourVectors Mul( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x = MulSIMD( a.x, b.x );
	ret.y = MulSIMD( a.y, b.y );
	ret.z = MulSIMD( a.z, b.z );
	return ret;
}

inline FourVectors Madd( const FourVectors &a, const fltx4 &b, const FourVectors &c )	// a*b + c
{
	FourVectors ret;
	ret.x = MaddSIMD( a.x, b, c.x );
	ret.y = MaddSIMD( a.y, b, c.y );
	ret.z = MaddSIMD( a.z, b, c.z );
	return ret;
}

/// form 4 cross products
inline FourVectors operator ^( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y));
	ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z));
	ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x));
	return ret;
}

inline FourVectors operator-( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x=SubSIMD(a.x,b.x);
	ret.y=SubSIMD(a.y,b.y);
	ret.z=SubSIMD(a.z,b.z);
	return ret;
}

inline FourVectors operator+( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x = AddSIMD( a.x, b.x );
	ret.y = AddSIMD( a.y, b.y );
	ret.z = AddSIMD( a.z, b.z );
	return ret;
}

/// component-by-component MAX operator
inline FourVectors maximum( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x=MaxSIMD(a.x,b.x);
	ret.y=MaxSIMD(a.y,b.y);
	ret.z=MaxSIMD(a.z,b.z);
	return ret;
}

/// component-by-component MIN operator
inline FourVectors minimum( const FourVectors &a, const FourVectors &b )
{
	FourVectors ret;
	ret.x=MinSIMD(a.x,b.x);
	ret.y=MinSIMD(a.y,b.y);
	ret.z=MinSIMD(a.z,b.z);
	return ret;
}

FORCEINLINE FourVectors RotateLeft( const FourVectors &src )
{
	FourVectors ret;
	ret.x = RotateLeft( src.x );
	ret.y = RotateLeft( src.y );
	ret.z = RotateLeft( src.z );
	return ret;
}

FORCEINLINE FourVectors RotateRight( const FourVectors &src )
{
	FourVectors ret;
	ret.x = RotateRight( src.x );
	ret.y = RotateRight( src.y );
	ret.z = RotateRight( src.z );
	return ret;
}

FORCEINLINE FourVectors MaskedAssign( const bi32x4 & ReplacementMask, const FourVectors & NewValue, const FourVectors & OldValue )
{
	FourVectors ret;
	ret.x = MaskedAssign( ReplacementMask, NewValue.x, OldValue.x );
	ret.y = MaskedAssign( ReplacementMask, NewValue.y, OldValue.y );
	ret.z = MaskedAssign( ReplacementMask, NewValue.z, OldValue.z );
	return ret;
}
/// calculate reflection vector. incident and normal dir assumed normalized
FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal )
{
	FourVectors ret = incident;
	fltx4 iDotNx2 = incident * normal;
	iDotNx2 = AddSIMD( iDotNx2, iDotNx2 );
	FourVectors nPart = normal;
	nPart *= iDotNx2;
	ret -= nPart;							// i-2(n*i)n
	return ret;
}

/// calculate slide vector. removes the component of the vector which lies
/// along the normal, leaving only the part perpendicular to it.
FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal )
{
	FourVectors ret = incident;
	fltx4 iDotN = incident * normal;
	FourVectors nPart = normal;
	nPart *= iDotN;
	ret -= nPart;							// i-(n*i)n
	return ret;
}

/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
FORCEINLINE FourVectors VectorNormalizeFast( const FourVectors &src )
{
	fltx4 fl4InvLen = ReciprocalSqrtEstSIMD( src * src );	// 1.0/sqrt(length^2)
	FourVectors result;
	result.x = MulSIMD( src.x, fl4InvLen );
	result.y = MulSIMD( src.y, fl4InvLen );
	result.z = MulSIMD( src.z, fl4InvLen );
	return result;
}

#if !defined(__SPU__)
/// Store a FourVectors into four NON-CONTIGUOUS Vector*'s.
FORCEINLINE void FourVectors::StoreUnalignedVector3SIMD( Vector * RESTRICT out0, Vector * RESTRICT out1, Vector * RESTRICT out2, Vector * RESTRICT out3 ) const
{
#ifdef _X360
	fltx4 x0,x1,x2,x3, y0,y1,y2,y3, z0,z1,z2,z3;
	x0 = SplatXSIMD(x); // all x0x0x0x0
	x1 = SplatYSIMD(x);
	x2 = SplatZSIMD(x);
	x3 = SplatWSIMD(x);
	y0 = SplatXSIMD(y);
	y1 = SplatYSIMD(y);
	y2 = SplatZSIMD(y);
	y3 = SplatWSIMD(y);
	z0 = SplatXSIMD(z);
	z1 = SplatYSIMD(z);
	z2 = SplatZSIMD(z);
	z3 = SplatWSIMD(z);

	__stvewx( x0, out0->Base(), 0 );	// store X word
	__stvewx( y0, out0->Base(), 4 );	// store Y word
	__stvewx( z0, out0->Base(), 8 );	// store Z word
	__stvewx( x1, out1->Base(), 0 );	// store X word
	__stvewx( y1, out1->Base(), 4 );	// store Y word
	__stvewx( z1, out1->Base(), 8 );	// store Z word
	__stvewx( x2, out2->Base(), 0 );	// store X word
	__stvewx( y2, out2->Base(), 4 );	// store Y word
	__stvewx( z2, out2->Base(), 8 );	// store Z word
	__stvewx( x3, out3->Base(), 0 );	// store X word
	__stvewx( y3, out3->Base(), 4 );	// store Y word
	__stvewx( z3, out3->Base(), 8 );	// store Z word
#else
	fltx4 a,b,c,d;
	TransposeOnto(a,b,c,d);
	StoreUnaligned3SIMD( out0->Base(), a );
	StoreUnaligned3SIMD( out1->Base(), b );
	StoreUnaligned3SIMD( out2->Base(), c );
	StoreUnaligned3SIMD( out3->Base(), d );
#endif
}

/// Store a FourVectors into four NON-CONTIGUOUS VectorAligned s.
FORCEINLINE void FourVectors::StoreAlignedVectorSIMD( VectorAligned * RESTRICT out0, VectorAligned * RESTRICT out1, VectorAligned * RESTRICT out2, VectorAligned * RESTRICT out3 ) const
{
	fltx4 a,b,c,d;
	TransposeOnto(a,b,c,d);
	StoreAligned3SIMD( out0, a );
	StoreAligned3SIMD( out1, b );
	StoreAligned3SIMD( out2, c );
	StoreAligned3SIMD( out3, d );
}
#endif

#if !defined(__SPU__)
// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures that you all want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
void FourVectors::RotateBy( const matrix3x4_t& matrix )
{
	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;

	// Load the matrix into local vectors. Sadly, matrix3x4_ts are
	// often unaligned.
	// The w components will be the transpose row of
	// the matrix, but we don't really care about that.
	fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
	fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
	fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );

	matSplat00 = SplatXSIMD( matCol0 );
	matSplat01 = SplatYSIMD( matCol0 );
	matSplat02 = SplatZSIMD( matCol0 );

	matSplat10 = SplatXSIMD( matCol1 );
	matSplat11 = SplatYSIMD( matCol1 );
	matSplat12 = SplatZSIMD( matCol1 );

	matSplat20 = SplatXSIMD( matCol2 );
	matSplat21 = SplatYSIMD( matCol2 );
	matSplat22 = SplatZSIMD( matCol2 );

	// Trust in the compiler to schedule these operations correctly:
	fltx4 outX, outY, outZ;
	outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) );
	outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) );
	outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) );

	x = outX;
	y = outY;
	z = outZ;
}

// Assume the given matrix is a transformation, and transform these points by it.
// If you have a long list of FourVectors structures that you all want
// to transform by the same matrix, use FourVectors::TransformManyBy() instead.
void FourVectors::TransformBy( const matrix3x4_t& matrix )
{
	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;

	// Load the matrix into local vectors. Sadly, matrix3x4_ts are
	// often unaligned. The w components will be the transpose row of
	// the matrix, but we don't really care about that.
	fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
	fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
	fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );

	matSplat00 = SplatXSIMD( matCol0 );
	matSplat01 = SplatYSIMD( matCol0 );
	matSplat02 = SplatZSIMD( matCol0 );

	matSplat10 = SplatXSIMD( matCol1 );
	matSplat11 = SplatYSIMD( matCol1 );
	matSplat12 = SplatZSIMD( matCol1 );

	matSplat20 = SplatXSIMD( matCol2 );
	matSplat21 = SplatYSIMD( matCol2 );
	matSplat22 = SplatZSIMD( matCol2 );

	// Trust in the compiler to schedule these operations correctly:
	fltx4 outX, outY, outZ;

	outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) );
	outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) );
	outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) );

	x = AddSIMD( outX, ReplicateX4( matrix[0][3] ) );
	y = AddSIMD( outY, ReplicateX4( matrix[1][3] ) );
	z = AddSIMD( outZ, ReplicateX4( matrix[2][3] ) );
}
#endif

fltx4 NoiseSIMD( FourVectors const &v );
// vector valued noise direction
FourVectors DNoiseSIMD( FourVectors const &v );
// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
FourVectors CurlNoiseSIMD( FourVectors const &v );
//#endif // !defined SPU
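// Usage sketch (illustrative; posA..posD are placeholder sample positions):
//     FourVectors v4Pos( posA, posB, posC, posD );	// four positions at once
//     fltx4 fl4Noise = NoiseSIMD( v4Pos );			// four -1..1 noise values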
/// quick, low quality perlin-style noise() function suitable for real time use.
/// return value is -1..1. Only reliable around +/- 1 million or so.
fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z );

/// calculate the absolute value of a packed single
inline fltx4 fabs( const fltx4 & x )
{
	return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) );
}

// Convenience version
inline fltx4 AbsSIMD( const fltx4 & x )
{
	return fabs( x );
}

/// negate all four components of a SIMD packed single
inline fltx4 fnegate( const fltx4 & x )
{
	return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) );
}

fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent );

// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some
// restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
// fractions of 0, .25, .5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25).
// negative and fractional powers are handled by the SIMD reciprocal and square root approximation
// instructions and so are not especially accurate. Note that this routine does not raise
// numeric exceptions because it uses SIMD. This routine is O(log2(exponent)).
inline fltx4 PowSIMD( const fltx4 & x, float exponent )
{
	return Pow_FixedPoint_Exponent_SIMD( x, (int)( 4.0 * exponent ) );
}

/// (x<1)?x^(1/2.2):1. Use a 4th order polynomial to approximate x^(1/2.2) over 0..1
inline fltx4 LinearToGammaSIMD( fltx4 x )
{
	// y = -3.7295x^4 + 8.9635x^3 - 7.7397x^2 + 3.443x + 0.048
	x = MaxSIMD( MinSIMD( Four_Ones, x ), Four_Zeros );
	return AddSIMD( Four_LinearToGammaCoefficients_E,
		MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_D,
			MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_C,
				MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_B,
					MulSIMD( x, Four_LinearToGammaCoefficients_A ) ) ) ) ) ) ) );
}

inline fltx4 GammaToLinearSIMD( fltx4 x )
{
	x = MaxSIMD( x, Four_Zeros );
	x = AddSIMD( Four_GammaToLinearCoefficients_D,
		MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_C,
			MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_B,
				MulSIMD( x, Four_GammaToLinearCoefficients_A ) ) ) ) ) );
	return MinSIMD( x, Four_Ones );
}

/// ( x > 1 ) ? x : x^2.2
inline fltx4 GammaToLinearExtendedSIMD( fltx4 x )
{
	x = MaxSIMD( x, Four_Zeros );
	fltx4 fl4Ret = AddSIMD( Four_GammaToLinearCoefficients_D,
		MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_C,
			MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_B,
				MulSIMD( x, Four_GammaToLinearCoefficients_A ) ) ) ) ) );
	return MaskedAssign( CmpGeSIMD( x, Four_Ones ), x, fl4Ret );
}

// random number generation - generate 4 random numbers quickly.
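// Usage sketch (illustrative): single-threaded code can seed once and then call
// RandSIMD() directly; worker threads should each grab their own context:
//     int nCtx = GetSIMDRandContext();
//     fltx4 fl4Rand = RandSIMD( nCtx );		// four floats in 0..1
//     ReleaseSIMDRandContext( nCtx );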
void SeedRandSIMD(uint32 seed); // seed the random # generator fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range // for multithreaded, you need to use these and use the argument form of RandSIMD: int GetSIMDRandContext( void ); void ReleaseSIMDRandContext( int nContext ); FORCEINLINE fltx4 RandSignedSIMD( void ) // -1..1 { return SubSIMD( MulSIMD( Four_Twos, RandSIMD() ), Four_Ones ); } FORCEINLINE fltx4 LerpSIMD ( const fltx4 &percent, const fltx4 &a, const fltx4 &b) { return AddSIMD( a, MulSIMD( SubSIMD( b, a ), percent ) ); } FORCEINLINE fltx4 RemapValClampedSIMD(const fltx4 &val, const fltx4 &a, const fltx4 &b, const fltx4 &c, const fltx4 &d) // Remap val from clamped range between a and b to new range between c and d { fltx4 range = MaskedAssign( CmpEqSIMD( a, b ), Four_Ones, SubSIMD( b, a ) ); //make sure range > 0 fltx4 cVal = MaxSIMD( Four_Zeros, MinSIMD( Four_Ones, DivSIMD( SubSIMD( val, a ), range ) ) ); //saturate return LerpSIMD( cVal, c, d ); } // SIMD versions of mathlib simplespline functions // hermite basis function for smooth interpolation // Similar to Gain() above, but very cheap to call // value should be between 0 & 1 inclusive inline fltx4 SimpleSpline( const fltx4 & value ) { // Arranged to avoid a data dependency between these two MULs: fltx4 valueDoubled = MulSIMD( value, Four_Twos ); fltx4 valueSquared = MulSIMD( value, value ); // Nice little ease-in, ease-out spline-like curve return SubSIMD( MulSIMD( Four_Threes, valueSquared ), MulSIMD( valueDoubled, valueSquared ) ); } // remaps a value in [startInterval, startInterval+rangeInterval] from linear to // spline using SimpleSpline inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val, const fltx4 & A, const fltx4 & BMinusA, const fltx4 & OneOverBMinusA, const fltx4 & C, const fltx4 & DMinusC ) { // if ( A == B ) // return val >= B ? D : C; fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA ); return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) ); } inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val, const fltx4 & A, const fltx4 & BMinusA, const fltx4 & OneOverBMinusA, const fltx4 & C, const fltx4 & DMinusC ) { // if ( A == B ) // return val >= B ? D : C; fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA ); cVal = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, cVal ) ); return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) ); } FORCEINLINE fltx4 FracSIMD( const fltx4 &val ) { fltx4 fl4Abs = fabs( val ); fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s ); ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival ); return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits } #ifndef SPU // Disable on SPU for the moment as it generates a warning // warning: dereferencing type-punned pointer will break strict-aliasing rules // This is related to LoadAlignedSIMD( (float *) g_SIMD_lsbmask ) // LoadAlignedSIMD() under the hood is dereferencing the variable. 
FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val ) { fltx4 fl4Abs = fabs( val ); fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s ); ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival ); return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits } #endif FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val ) { fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s ); ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival ); return SubSIMD( val, ival ); } // approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi. // no range reduction is done - for values outside of 0..1 you won't like the results FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val ) { // really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between. // sufficient for simple oscillation. return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) ); } FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val ) { // not a bad approximation : parabola always over-estimates. Squared parabola always // underestimates. So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin) fltx4 fl4BadEst = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) ); return AddSIMD( MulSIMD( Four_Point225s, SubSIMD( MulSIMD( fl4BadEst, fl4BadEst ), fl4BadEst ) ), fl4BadEst ); } // full range useable implementations FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val ) { fltx4 fl4Abs = fabs( val ); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs ); bi32x4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones ); fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) ); fltx4 fl4Sin = _SinEst01SIMD( fl4val ); fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) ); return fl4Sin; } FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val ) { fltx4 fl4Abs = fabs( val ); fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs ); bi32x4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones ); fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) ); fltx4 fl4Sin = _Sin01SIMD( fl4val ); fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) ); return fl4Sin; } FORCEINLINE fltx4 NatExpSIMD( const fltx4 &val ) // why is ExpSimd( x ) defined to be 2^x? { // need to write this. just stub with normal float implementation for now fltx4 fl4Result; SubFloat( fl4Result, 0 ) = exp( SubFloat( val, 0 ) ); SubFloat( fl4Result, 1 ) = exp( SubFloat( val, 1 ) ); SubFloat( fl4Result, 2 ) = exp( SubFloat( val, 2 ) ); SubFloat( fl4Result, 3 ) = exp( SubFloat( val, 3 ) ); return fl4Result; } // Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1) FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter ) { // convert perlin-style-bias parameter to the value right for the approximation return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos ); } FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param ) { // similar to bias function except pass precalced bias value from calling PreCalcBiasParameter. //!!speed!! use reciprocal est? //!!speed!! 
	// could save one op by precalcing _2_ values
	return DivSIMD( val, AddSIMD( MulSIMD( precalc_param, SubSIMD( Four_Ones, val ) ), Four_Ones ) );
}

//-----------------------------------------------------------------------------
// Box/plane test
// NOTE: The w component of emins + emaxs must be 1 for this to work
//-----------------------------------------------------------------------------
#ifndef SPU
// We don't need this on SPU right now
FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
{
	fltx4 corners[2];
	fltx4 normal = LoadUnalignedSIMD( p->normal.Base() );
	fltx4 dist = ReplicateX4( -p->dist );
	normal = SetWSIMD( normal, dist );

	fltx4 t4 = ReplicateX4( tolerance );
	fltx4 negt4 = ReplicateX4( -tolerance );

	bi32x4 cmp = CmpGeSIMD( normal, Four_Zeros );
	corners[0] = MaskedAssign( cmp, emaxs, emins );
	corners[1] = MaskedAssign( cmp, emins, emaxs );

	fltx4 dot1 = Dot4SIMD( normal, corners[0] );
	fltx4 dot2 = Dot4SIMD( normal, corners[1] );
	cmp = CmpGeSIMD( dot1, t4 );
	bi32x4 cmp2 = CmpGtSIMD( negt4, dot2 );

	fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros );
	fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros );
	result = AddSIMD( result, result2 );

	intx4 sides;
	ConvertStoreAsIntsSIMD( &sides, result );
	return sides[0];
}

// k-dop bounding volume. 26-dop bounds with 13 plane-pairs plus 3 other "arbitrary bounds".
// The arbitrary values could be used to hold type info, etc.,
// which can be compared against "for free".
class KDop32_t
{
public:
	fltx4 m_Mins[4];
	fltx4 m_Maxes[4];

	FORCEINLINE bool Intersects( KDop32_t const &other ) const;

	FORCEINLINE void operator|=( KDop32_t const & other );

	FORCEINLINE bool IsEmpty( void ) const;

	FORCEINLINE void Init( void )
	{
		for( int i = 0; i < ARRAYSIZE( m_Mins ); i++ )
		{
			m_Mins[i] = Four_FLT_MAX;
			m_Maxes[i] = Four_Negative_FLT_MAX;
		}
	}

	// given a set of points, expand the kdop to contain them
	void AddPointSet( Vector const *pPoints, int nPnts );

	void CreateFromPointSet( Vector const *pPoints, int nPnts );
};

FORCEINLINE void KDop32_t::operator|=( KDop32_t const & other )
{
	m_Mins[0] = MinSIMD( m_Mins[0], other.m_Mins[0] );
	m_Mins[1] = MinSIMD( m_Mins[1], other.m_Mins[1] );
	m_Mins[2] = MinSIMD( m_Mins[2], other.m_Mins[2] );
	m_Mins[3] = MinSIMD( m_Mins[3], other.m_Mins[3] );

	m_Maxes[0] = MaxSIMD( m_Maxes[0], other.m_Maxes[0] );
	m_Maxes[1] = MaxSIMD( m_Maxes[1], other.m_Maxes[1] );
	m_Maxes[2] = MaxSIMD( m_Maxes[2], other.m_Maxes[2] );
	m_Maxes[3] = MaxSIMD( m_Maxes[3], other.m_Maxes[3] );
}

FORCEINLINE bool KDop32_t::Intersects( KDop32_t const &other ) const
{
	bi32x4 c00 = CmpLeSIMD( m_Mins[0], other.m_Maxes[0] );
	bi32x4 c01 = CmpLeSIMD( m_Mins[1], other.m_Maxes[1] );
	bi32x4 c02 = CmpLeSIMD( m_Mins[2], other.m_Maxes[2] );
	bi32x4 c03 = CmpLeSIMD( m_Mins[3], other.m_Maxes[3] );

	bi32x4 c10 = CmpGeSIMD( m_Maxes[0], other.m_Mins[0] );
	bi32x4 c11 = CmpGeSIMD( m_Maxes[1], other.m_Mins[1] );
	bi32x4 c12 = CmpGeSIMD( m_Maxes[2], other.m_Mins[2] );
	bi32x4 c13 = CmpGeSIMD( m_Maxes[3], other.m_Mins[3] );

	bi32x4 a0 = AndSIMD( AndSIMD( c00, c01 ), AndSIMD( c02, c03 ) );
	bi32x4 a1 = AndSIMD( AndSIMD( c10, c11 ), AndSIMD( c12, c13 ) );

	return !( IsAnyZeros( AndSIMD( a1, a0 ) ) );
}
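// Intersects() is the standard interval-overlap test run on all slab
// directions at once: the volumes overlap only if, along every direction,
// each one's min does not exceed the other's max.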
FORCEINLINE bool KDop32_t::IsEmpty( void ) const
{
	bi32x4 c00 = CmpLtSIMD( m_Maxes[0], m_Mins[0] );
	bi32x4 c01 = CmpLtSIMD( m_Maxes[1], m_Mins[1] );
	bi32x4 c02 = CmpLtSIMD( m_Maxes[2], m_Mins[2] );
	bi32x4 c03 = CmpLtSIMD( m_Maxes[3], m_Mins[3] );

	return IsAnyTrue( OrSIMD( OrSIMD( c00, c01 ), OrSIMD( c02, c03 ) ) );
}

extern const fltx4 g_KDop32XDirs[4];
extern const fltx4 g_KDop32YDirs[4];
extern const fltx4 g_KDop32ZDirs[4];
#endif

#if 0
// FIXME!!! If we need a version of this that runs on 360, this is a work-in-progress version that hasn't been debugged.

#define _VEC_SWIZZLE_QUAT48_UNPACK (__vector unsigned char)		{ 16, 17, 0, 1, 16, 17, 2, 3, 16, 17, 4, 5, 16, 17, 6, 7 }
#define _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT (__vector unsigned int )	{ 0, 0, 1, 0 }

// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
	// A quaternion 48 stores the x and y components as 0..65535, which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5.
	// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5.
	// w is inferred from 1 - the dot product of the other three components. the top bit of what would otherwise be the 16-bit z is
	// w's sign bit.
	// fltx4 q16s = XMLoadVector3((const void *)pVec);
	fltx4 q16s = LoadUnaligned3SIMD( (const float *)pVec );
	// fltx4 shift = *( fltx4 * )&g_SIMD_Quat48_Unpack_Shift;		// load the aligned shift mask that we use to shuffle z.
	// fltx4 permute = *( fltx4 * )&g_SIMD_Quat48_Unpack_Permute0;	// load the permute word that shuffles x,y,z into their own words
	bool wneg = pVec->wneg;	// loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.

	// q16s = __vperm( q16s, Four_Threes, permute );	// permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
	q16s = vec_perm( q16s, Four_Threes, _VEC_SWIZZLE_QUAT48_UNPACK );	// permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f

	// q16s = __vslh(q16s, shift);	// shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	// q16s = vec_sl( *( u32x4 * )( void * )( &q16s ), _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT );	// shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	u32x4 tmp = IntShiftLeftWordSIMD( *( u32x4 * )&q16s, _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT );
	q16s = *( fltx4 * )&tmp;

	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD( g_SIMD_Quat48_Unpack_Magic_Constants );	// { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD( g_SIMD_Quat48_Unpack_Magic_Constants );

	/*
	fltx4 ret = __vcfux( q16s, 0 );	// convert from uint16 to floats.

	// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
	*/
	// fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );
	fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );

	// now, work out what w must be.
	fltx4 dotxyz = Dot3SIMD( ret, ret );	// all components are dot product of ret w/ self.
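	// For a unit quaternion, x^2 + y^2 + z^2 + w^2 == 1, so |w| = sqrt( 1 - dotxyz );
	// the clamp below guards against quantization error pushing dotxyz past 1.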
dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones ); fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz) if ( wneg ) { ret = SetWSIMD( ret, NegSIMD( ww ) ); // ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret } else { ret = SetWSIMD( ret, ww ); // ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret } return ret; } #endif // These are not optimized right now for some platforms. We should be able to shuffle the values in some platforms. // As the methods are hard-coded we can actually avoid loading memory to do the transfer. // We should be able to create all versions. FORCEINLINE fltx4 SetWFromXSIMD( const fltx4 & a, const fltx4 & x ) { fltx4 value = SplatXSIMD( x ); return SetWSIMD( a, value ); } FORCEINLINE fltx4 SetWFromYSIMD( const fltx4 & a, const fltx4 & y ) { fltx4 value = SplatYSIMD( y ); return SetWSIMD( a, value ); } FORCEINLINE fltx4 SetWFromZSIMD( const fltx4 & a, const fltx4 & z ) { fltx4 value = SplatZSIMD( z ); return SetWSIMD( a, value ); } FORCEINLINE fltx4 CrossProductSIMD( const fltx4 &A, const fltx4 &B ) { #if defined( _X360 ) return XMVector3Cross( A, B ); #elif defined( _WIN32 ) fltx4 A1 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) ); fltx4 B1 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) ); fltx4 Result1 = MulSIMD( A1, B1 ); fltx4 A2 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) ); fltx4 B2 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) ); fltx4 Result2 = MulSIMD( A2, B2 ); return SubSIMD( Result1, Result2 ); #elif defined(_PS3) /* fltx4 perm1 = (vector unsigned char){0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f}; fltx4 perm2 = (vector unsigned char){0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f}; fltx4 A1 = __vpermwi( A, A, perm1 ); fltx4 A2 = __vpermwi( B, B, perm2 ); fltx4 Result1 = MulSIMD( A1, B1 ); fltx4 A2 = __vpermwi( A, A, perm2 ); fltx4 B2 = __vpermwi( B, B, perm1 ); return MsubSIMD( A2, B2, Result1 ); */ return _vmathVfCross( A, B ); #else fltx4 CrossVal; SubFloat( CrossVal, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 ); SubFloat( CrossVal, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 ); SubFloat( CrossVal, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 ); SubFloat( CrossVal, 3 ) = 0; return CrossVal; #endif } inline const fltx4 Length3SIMD(const fltx4 vec) { fltx4 scLengthSqr = Dot3SIMD(vec,vec); bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); return AndSIMD(isSignificant, MulSIMD(scLengthInv, scLengthSqr)); } inline const fltx4 Normalized3SIMD (const fltx4 vec) { fltx4 scLengthSqr = Dot3SIMD(vec,vec); bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons); fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr); return AndSIMD(isSignificant, MulSIMD(vec, scLengthInv)); } // Some convenience operator overloads, which are just aliasing the functions above. 
// Unnecessary on 360, as you already have them from xboxmath.h

// Componentwise add
#ifndef COMPILER_GCC
FORCEINLINE fltx4 operator+=( fltx4 &a, FLTX4 b )
{
	a = AddSIMD( a, b );
	return a;
}

FORCEINLINE fltx4 operator-=( fltx4 &a, FLTX4 b )
{
	a = SubSIMD( a, b );
	return a;
}

FORCEINLINE fltx4 operator*=( fltx4 &a, FLTX4 b )
{
	a = MulSIMD( a, b );
	return a;
}
#endif

#endif // SSEMATH_H