tf2/tf2_src/public/mathlib/ssequaternion.h


								//========= Copyright Valve Corporation, All rights reserved. ============//

								//

								// Purpose: - defines SIMD "structure of arrays" classes and functions.

								//

								//===========================================================================//

								#ifndef SSEQUATMATH_H

								#define SSEQUATMATH_H


								#ifdef _WIN32

								#pragma once

								#endif


								#include "mathlib/ssemath.h"


								// Use this #define to allow SSE versions of Quaternion math

								// to exist on PC.

								// On PC, certain horizontal vector operations are not supported.

								// This causes the SSE implementation of quaternion math to mix the

								// vector and scalar floating point units, which is extremely

								// performance negative if you don't compile to native SSE2 (which

								// we don't as of Sept 1, 2007). So, it's best not to allow these

								// functions to exist at all. It's not good enough to simply replace

								// the contents of the functions with scalar math, because each call

								// to LoadAligned and StoreAligned will result in an unnecessary copy

								// of the quaternion, and several moves to and from the XMM registers.

								//

								// Basically, the problem you run into is that for efficient SIMD code,

								// you need to load the quaternions and vectors into SIMD registers and

								// keep them there as long as possible while doing only SIMD math,

								// whereas for efficient scalar code, each time you copy onto or ever

								// use a fltx4, it hoses your pipeline. So the difference has to be

								// in the management of temporary variables in the calling function,

								// not inside the math functions.

								//

								// If you compile assuming the presence of SSE2, the MSVC will abandon

								// the traditional x87 FPU operations altogether and make everything use

								// the SSE2 registers, which lessens this problem a little.


								// permitted only on 360, as we've done careful tuning on its Altivec math:

								#ifdef _X360

								#define ALLOW_SIMD_QUATERNION_MATH 1  // not on PC!

								#endif


								//---------------------------------------------------------------------

								// Load/store quaternions

								//---------------------------------------------------------------------

								#ifndef _X360

								#if ALLOW_SIMD_QUATERNION_MATH

								// Using STDC or SSE

								// Loads an aligned quaternion (passed by reference) into a fltx4 by
								// forwarding the storage returned by Base() to the pointer-based loader.
								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
								{
									return LoadAlignedSIMD( pSIMD.Base() );
								}


								// Loads an aligned quaternion (passed by pointer) into a fltx4.
								//
								// The load is forwarded through Base() so that the underlying pointer-based
								// loader is selected. Passing pSIMD itself would be an exact match for this
								// very overload and therefore recurse without end.
								//
								// pSIMD must be non-null; it is dereferenced unconditionally.
								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
								{
									fltx4 retval = LoadAlignedSIMD( pSIMD->Base() );
									return retval;
								}


								// Writes the SIMD value 'a' into the storage of the aligned quaternion
								// that pSIMD points at, using the pointer-based store.
								FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
								{
									QuaternionAligned &dest = *pSIMD;
									StoreAlignedSIMD( dest.Base(), a );
								}

								#endif

								#else


								// for the transitional class -- load a QuaternionAligned

								// X360 path: loads an aligned quaternion (passed by reference) into a
								// fltx4 by handing its Base() storage to XMLoadVector4A.
								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
								{
									return XMLoadVector4A( pSIMD.Base() );
								}


								// X360 path: loads an aligned quaternion (passed by pointer) into a
								// fltx4 by handing the pointer straight to XMLoadVector4A.
								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
								{
									return XMLoadVector4A( pSIMD );
								}


								// X360 path: writes the SIMD value 'a' into the storage of the aligned
								// quaternion that pSIMD points at, via XMStoreVector4A.
								FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
								{
									QuaternionAligned &dest = *pSIMD;
									XMStoreVector4A( dest.Base(), a );
								}


								#endif


								#if ALLOW_SIMD_QUATERNION_MATH

								//---------------------------------------------------------------------

								// Make sure quaternions are within 180 degrees of one another, if not, reverse q

								//---------------------------------------------------------------------

								// Returns q, or -q when |p - q|^2 is greater than |p + q|^2, so that the
								// returned quaternion is the representation of q that lies nearer to p.
								FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
								{
									const fltx4 difference = SubSIMD( p, q );
									const fltx4 sum = AddSIMD( p, q );

									// Squared 4D lengths of (p - q) and (p + q).
									const fltx4 differenceLenSq = Dot4SIMD( difference, difference );
									const fltx4 sumLenSq = Dot4SIMD( sum, sum );

									// Where the difference is the longer of the two, q is "backwards": flip it.
									const fltx4 flipMask = CmpGtSIMD( differenceLenSq, sumLenSq );
									return MaskedAssign( flipMask, NegSIMD( q ), q );
								}


								//---------------------------------------------------------------------

								// Normalize Quaternion

								//---------------------------------------------------------------------

								#if USE_STDC_FOR_SIMD


								// STDC implementation: scales q by 1 / sqrt( dot( q, q ) ).
								// When the squared length is exactly zero, q is returned unchanged.
								FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
								{
									fltx4 lengthSq = Dot4SIMD( q, q );

									// Nothing to normalize: avoid dividing by zero and hand q back as-is.
									if ( SubFloat( lengthSq, 0 ) == 0.0f )
									{
										return q;
									}

									float invLength = 1.0f / sqrt( SubFloat( lengthSq, 0 ) );
									return MulSIMD( ReplicateX4( invLength ), q );
								}


								#else


								// SSE + X360 implementation

								// SSE / X360 implementation: multiplies q by the reciprocal square root of
								// dot( q, q ). Lanes whose squared length compares equal to zero keep the
								// original value of q instead of the scaled one.
								FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
								{
									const fltx4 lengthSq = Dot4SIMD( q, q );

									// Set wherever the squared length equals zero.
									const fltx4 zeroLengthMask = CmpEqSIMD( lengthSq, Four_Zeros );

									const fltx4 normalized = MulSIMD( ReciprocalSqrtSIMD( lengthSq ), q );

									// Fall back to the untouched input where the length was zero.
									return MaskedAssign( zeroLengthMask, q, normalized );
								}


								#endif


								//---------------------------------------------------------------------

								// t = 0.0 returns p, t = 1.0 returns q.

								//---------------------------------------------------------------------

								// Normalized linear blend of two quaternions: computes (1 - t) * p + t * q
								// and normalizes the result. No alignment of q against p is performed.
								FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
								{
									const fltx4 weightQ = ReplicateX4( t );
									const fltx4 weightP = SubSIMD( Four_Ones, weightQ );

									// weightQ * q + ( weightP * p )
									const fltx4 blended = MaddSIMD( weightQ, q, MulSIMD( weightP, p ) );

									return QuaternionNormalizeSIMD( blended );
								}


								//---------------------------------------------------------------------

								// Blend Quaternions

								//---------------------------------------------------------------------

								// Blends p toward q by t after first passing q through QuaternionAlignSIMD,
								// which may negate it relative to p.
								FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
								{
									const fltx4 alignedQ = QuaternionAlignSIMD( p, q );
									return QuaternionBlendNoAlignSIMD( p, alignedQ, t );
								}


								//---------------------------------------------------------------------

								// Multiply Quaternions

								//---------------------------------------------------------------------

								#ifndef _X360


								// SSE and STDC

								// SSE / STDC implementation: quaternion product of p and q, computed with
								// scalar math on the individual components. q is first passed through
								// QuaternionAlignSIMD, which may negate it relative to p.
								FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
								{
									const fltx4 q2 = QuaternionAlignSIMD( p, q );

									// Pull the components out once; index 3 is the last (fourth) component.
									const float p0 = SubFloat( p, 0 );
									const float p1 = SubFloat( p, 1 );
									const float p2 = SubFloat( p, 2 );
									const float p3 = SubFloat( p, 3 );

									const float q0 = SubFloat( q2, 0 );
									const float q1 = SubFloat( q2, 1 );
									const float q2c = SubFloat( q2, 2 );
									const float q3 = SubFloat( q2, 3 );

									fltx4 product;
									SubFloat( product, 0 ) =  p0 * q3 + p1 * q2c - p2 * q1 + p3 * q0;
									SubFloat( product, 1 ) = -p0 * q2c + p1 * q3 + p2 * q0 + p3 * q1;
									SubFloat( product, 2 ) =  p0 * q1 - p1 * q0 + p2 * q3 + p3 * q2c;
									SubFloat( product, 3 ) = -p0 * q0 - p1 * q1 - p2 * q2c + p3 * q3;
									return product;
								}


								#else


								// X360

								// Per-row multipliers applied to the swizzled copy of q before each dot
								// product below. Defined elsewhere; presumably +/-1 sign patterns matching
								// the scalar formula -- confirm against the definition.
								extern const fltx4 g_QuatMultRowSign[4];

								// X360 implementation: quaternion product of p and q.
								// q is first passed through QuaternionAlignSIMD. Each of the four output
								// components is then produced as Dot4SIMD( row, p ), where 'row' is a
								// swizzled copy of q2 multiplied by one entry of g_QuatMultRowSign.
								FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )

								{

									fltx4 q2, row, result;

									q2 = QuaternionAlignSIMD( p, q );


									// First row: q2 reordered as (3, 2, 1, 0). The dot product seeds 'result'.
									row = XMVectorSwizzle( q2, 3, 2, 1, 0 );

									row = MulSIMD( row, g_QuatMultRowSign[0] );

									result = Dot4SIMD( row, p );


									// Second row: q2 reordered as (2, 3, 0, 1).
									row = XMVectorSwizzle( q2, 2, 3, 0, 1 );

									row = MulSIMD( row, g_QuatMultRowSign[1] );

									row = Dot4SIMD( row, p );

									// NOTE(review): __vrlimi merges lanes of 'row' into 'result' as selected
									// by the mask; mask 4 presumably selects the second lane -- confirm.
									result = __vrlimi( result, row, 4, 0 );


									// Third row: q2 reordered as (1, 0, 3, 2).
									row = XMVectorSwizzle( q2, 1, 0, 3, 2 );

									row = MulSIMD( row, g_QuatMultRowSign[2] );

									row = Dot4SIMD( row, p );

									// Mask 2 presumably selects the third lane -- confirm.
									result = __vrlimi( result, row, 2, 0 );


									// Fourth row: q2 in its original order, so no swizzle is needed.
									row = MulSIMD( q2, g_QuatMultRowSign[3] );

									row = Dot4SIMD( row, p );

									// Mask 1 presumably selects the fourth lane -- confirm.
									result = __vrlimi( result, row, 1, 0 );

									return result;

								}


								#endif


								//---------------------------------------------------------------------

								// Quaternion scale

								//---------------------------------------------------------------------

								#ifndef _X360


								// SSE and STDC

								// SSE / STDC implementation: scales the rotation held in p by the factor t,
								// using scalar math on the individual components.
								//
								// The first three components are rescaled so their length becomes
								// sin( asin( len ) * t ); the fourth is rebuilt as sqrt( 1 - sin^2 ) and
								// takes its sign from the fourth component of p.
								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
								{
									const float p0 = SubFloat( p, 0 );
									const float p1 = SubFloat( p, 1 );
									const float p2 = SubFloat( p, 2 );

									// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
									// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
									float sinom = sqrt( p0 * p0 + p1 * p1 + p2 * p2 );
									sinom = min( sinom, 1.f );	// keep asin() inside its domain

									const float sinsom = sin( asin( sinom ) * t );

									// FLT_EPSILON keeps the division finite when the first three components are all zero.
									const float scale = sinsom / (sinom + FLT_EPSILON);

									fltx4 scaled;
									SubFloat( scaled, 0 ) = scale * p0;
									SubFloat( scaled, 1 ) = scale * p1;
									SubFloat( scaled, 2 ) = scale * p2;

									// Rebuild the fourth component, clamping at zero before the square root.
									float r = 1.0f - sinsom * sinsom;
									r = ( r < 0.0f ) ? 0.0f : r;
									r = sqrt( r );

									// Carry over the sign of the original fourth component.
									SubFloat( scaled, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
									return scaled;
								}


								#else


								// X360

								// X360 implementation: scales the rotation held in p by the factor t while
								// staying in SIMD registers. Follows the same steps as the scalar version:
								// clamp the Dot3 length to 1, take sin( asin( len ) * t ), rescale p by the
								// ratio, then rebuild the remaining component from sqrt( 1 - sin^2 ).
								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
								{
									const fltx4 scaleT = ReplicateX4( t );

									// Length according to Dot3SIMD, clamped so ArcSinSIMD stays in its domain.
									const fltx4 sinOmega = MinSIMD( SqrtSIMD( Dot3SIMD( p, p ) ), Four_Ones );

									const fltx4 sinScaled = SinSIMD( MulSIMD( ArcSinSIMD( sinOmega ), scaleT ) );

									// Four_Epsilons keeps the reciprocal finite when the length is zero.
									const fltx4 invSinOmega = ReciprocalSIMD( AddSIMD( sinOmega, Four_Epsilons ) );

									const fltx4 scaled = MulSIMD( p, MulSIMD( sinScaled, invSinOmega ) );

									// rescale rotation: sqrt( max( 1 - sin^2, 0 ) )
									const fltx4 sinScaledSq = MulSIMD( sinScaled, sinScaled );
									const fltx4 magnitude = SqrtSIMD( MaxSIMD( SubSIMD( Four_Ones, sinScaledSq ), Four_Zeros ) );

									// keep sign of rotation: negate wherever the matching lane of p is negative
									const fltx4 nonNegativeMask = CmpGeSIMD( p, Four_Zeros );
									const fltx4 signedMagnitude = MaskedAssign( nonNegativeMask, magnitude, NegSIMD( magnitude ) );

									// NOTE(review): mask 1 presumably merges only the last lane -- confirm.
									return __vrlimi( scaled, signedMagnitude, 1, 0 );
								}


								#endif


								//-----------------------------------------------------------------------------

								// Quaternion spherical linear interpolation

								//-----------------------------------------------------------------------------

								#ifndef _X360


								// SSE and STDC

								// SSE / STDC implementation: spherical linear interpolation from p (t = 0.0)
								// to q (t = 1.0), done with scalar math. q is used as given; no alignment
								// against p is performed here.
								FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
								{
									float sclp, sclq;
									fltx4 result;

									// Cosine of the angle between the two inputs (4D dot product).
									const float cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) +
										SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );

									if ( (1.0f + cosom ) > 0.000001f )
									{
										if ( (1.0f - cosom ) > 0.000001f )
										{
											// General case: weight both inputs by sines of the partial angles.
											float omega = acos( cosom );
											float sinom = sin( omega );
											sclp = sin( (1.0f - t)*omega) / sinom;
											sclq = sin( t*omega ) / sinom;
										}
										else
										{
											// Inputs are nearly identical: plain linear weights are used.
											// TODO: add short circuit for cosom == 1.0f?
											sclp = 1.0f - t;
											sclq = t;
										}

										for ( int i = 0; i < 4; ++i )
										{
											SubFloat( result, i ) = sclp * SubFloat( p, i ) + sclq * SubFloat( q, i );
										}
										return result;
									}

									// Inputs are nearly opposite: interpolate toward a quaternion built by
									// swapping and negating components of q, over a quarter turn.
									SubFloat( result, 0 ) = -SubFloat( q, 1 );
									SubFloat( result, 1 ) =  SubFloat( q, 0 );
									SubFloat( result, 2 ) = -SubFloat( q, 3 );
									SubFloat( result, 3 ) =  SubFloat( q, 2 );

									sclp = sin( (1.0f - t) * (0.5f * M_PI));
									sclq = sin( t * (0.5f * M_PI));

									// Only the first three components are blended; the fourth keeps the
									// value assigned above.
									for ( int i = 0; i < 3; ++i )
									{
										SubFloat( result, i ) = sclp * SubFloat( p, i ) + sclq * SubFloat( result, i );
									}
									return result;
								}


								#else


								// X360

								// X360 implementation: forwards directly to XMQuaternionSlerp. q is used
								// as given; no alignment against p is performed here.
								FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
								{
									fltx4 interpolated = XMQuaternionSlerp( p, q, t );
									return interpolated;
								}


								#endif


								// Spherical linear interpolation from p toward q by t, after first passing
								// q through QuaternionAlignSIMD, which may negate it relative to p.
								FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
								{
									const fltx4 alignedQ = QuaternionAlignSIMD( p, q );
									return QuaternionSlerpNoAlignSIMD( p, alignedQ, t );
								}


								#endif // ALLOW_SIMD_QUATERNION_MATH


								#endif // SSEQUATMATH_H