csgo/cstrike15_src/public/mathlib/ssequaternion.h


								//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//

								//

								// Purpose: - defines SIMD "structure of arrays" classes and functions.

								//

								//===========================================================================//

								#ifndef SSEQUATMATH_H

								#define SSEQUATMATH_H


								#ifdef _WIN32

								#pragma once

								#endif


								#include "mathlib/ssemath.h"


								// Use this #define to allow SSE versions of Quaternion math

								// to exist on PC.

								// On PC, certain horizontal vector operations are not supported.

								// This causes the SSE implementation of quaternion math to mix the

								// vector and scalar floating point units, which is extremely

								// performance negative if you don't compile to native SSE2 (which

								// we don't as of Sept 1, 2007). So, it's best not to allow these

								// functions to exist at all. It's not good enough to simply replace

								// the contents of the functions with scalar math, because each call

								// to LoadAligned and StoreAligned will result in an unnecssary copy

								// of the quaternion, and several moves to and from the XMM registers.

								//

								// Basically, the problem you run into is that for efficient SIMD code,

								// you need to load the quaternions and vectors into SIMD registers and

								// keep them there as long as possible while doing only SIMD math,

								// whereas for efficient scalar code, each time you copy onto or ever

								// use a fltx4, it hoses your pipeline. So the difference has to be

								// in the management of temporary variables in the calling function,

								// not inside the math functions.

								//

								// If you compile assuming the presence of SSE2, the MSVC will abandon

								// the traditional x87 FPU operations altogether and make everything use

								// the SSE2 registers, which lessens this problem a little.


								// permitted only on 360, as we've done careful tuning on its Altivec math.

								// FourQuaternions, however, are always allowed, because vertical ops are

								// fine on SSE.

								#ifdef PLATFORM_PPC

								#define ALLOW_SIMD_QUATERNION_MATH 1  // not on PC!

								#endif


								//---------------------------------------------------------------------

								// Load/store quaternions

								//---------------------------------------------------------------------

								#ifndef _X360

								// Using STDC or SSE

								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )

								{

									fltx4 retval = LoadAlignedSIMD( pSIMD.Base() );

									return retval;

								}


								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )

								{

									fltx4 retval = LoadAlignedSIMD( pSIMD->Base() );

									return retval;

								}


								FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )

								{

									StoreAlignedSIMD( pSIMD->Base(), a );

								}

								#else


								// for the transitional class -- load a QuaternionAligned

								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )

								{

									fltx4 retval = XMLoadVector4A( pSIMD.Base() );

									return retval;

								}


								FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )

								{

									fltx4 retval = XMLoadVector4A( pSIMD );

									return retval;

								}


								FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )

								{

									XMStoreVector4A( pSIMD->Base(), a );

								}


								// From a RadianEuler packed onto a fltx4, to a quaternion

								fltx4 AngleQuaternionSIMD( FLTX4 vAngles );


								#endif


								#if ALLOW_SIMD_QUATERNION_MATH

								//---------------------------------------------------------------------

								// Make sure quaternions are within 180 degrees of one another, if not, reverse q

								//---------------------------------------------------------------------

								FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )

								{

									// decide if one of the quaternions is backwards

									fltx4 a = SubSIMD( p, q );

									fltx4 b = AddSIMD( p, q );

									a = Dot4SIMD( a, a );

									b = Dot4SIMD( b, b );

									fltx4 cmp = (fltx4) CmpGtSIMD( a, b );

									fltx4 result = MaskedAssign( cmp, NegSIMD(q), q );

									return result;

								}


								//---------------------------------------------------------------------

								// Normalize Quaternion

								//---------------------------------------------------------------------

								#if USE_STDC_FOR_SIMD


								FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )

								{

									fltx4 radius, result;

									radius = Dot4SIMD( q, q );


									if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))

									{

										float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) );

										result = ReplicateX4( iradius );

										result = MulSIMD( result, q );

										return result;

									}

									return q;

								}


								#else


								// SSE + X360 implementation

								FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )

								{

									fltx4 radius, result, mask;

									radius = Dot4SIMD( q, q );

									mask = (fltx4) CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0

									result = ReciprocalSqrtSIMD( radius );

									result = MulSIMD( result, q );

									return MaskedAssign( mask, q, result );	// if radius was 0, just return q

								}


								#endif


								//---------------------------------------------------------------------

								// 0.0 returns p, 1.0 return q.

								//---------------------------------------------------------------------

								FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )

								{

									fltx4 sclp, sclq, result;

									sclq = ReplicateX4( t );

									sclp = SubSIMD( Four_Ones, sclq );

									result = MulSIMD( sclp, p );

									result = MaddSIMD( sclq, q, result );

									return QuaternionNormalizeSIMD( result );

								}


								//---------------------------------------------------------------------

								// Blend Quaternions

								//---------------------------------------------------------------------

								FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )

								{

									// decide if one of the quaternions is backwards

									fltx4 q2, result;

									q2 = QuaternionAlignSIMD( p, q );

									result = QuaternionBlendNoAlignSIMD( p, q2, t );

									return result;

								}


								//---------------------------------------------------------------------

								// Multiply Quaternions

								//---------------------------------------------------------------------

								#ifndef _X360


								// SSE and STDC

								FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )

								{

									// decide if one of the quaternions is backwards

									fltx4 q2, result;

									q2 = QuaternionAlignSIMD( p, q );

									SubFloat( result, 0 ) =  SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 );

									SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 );

									SubFloat( result, 2 ) =  SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 );

									SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 );

									return result;

								}


								#else


								// X360

								extern const fltx4 g_QuatMultRowSign[4];

								FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )

								{

									fltx4 q2, row, result;

									q2 = QuaternionAlignSIMD( p, q );


									row = XMVectorSwizzle( q2, 3, 2, 1, 0 );

									row = MulSIMD( row, g_QuatMultRowSign[0] );

									result = Dot4SIMD( row, p );


									row = XMVectorSwizzle( q2, 2, 3, 0, 1 );

									row = MulSIMD( row, g_QuatMultRowSign[1] );

									row = Dot4SIMD( row, p );

									result = __vrlimi( result, row, 4, 0 );


									row = XMVectorSwizzle( q2, 1, 0, 3, 2 );

									row = MulSIMD( row, g_QuatMultRowSign[2] );

									row = Dot4SIMD( row, p );

									result = __vrlimi( result, row, 2, 0 );


									row = MulSIMD( q2, g_QuatMultRowSign[3] );

									row = Dot4SIMD( row, p );

									result = __vrlimi( result, row, 1, 0 );

									return result;

								}


								#endif


								//---------------------------------------------------------------------

								// Quaternion scale

								//---------------------------------------------------------------------

								#ifdef _X360


								// X360

								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )

								{

									fltx4 sinom = Dot3SIMD( p, p );

									sinom = SqrtSIMD( sinom );

									sinom = MinSIMD( sinom, Four_Ones );

									fltx4 sinsom = ArcSinSIMD( sinom );

									fltx4 t4 = ReplicateX4( t );

									sinsom = MulSIMD( sinsom, t4 );

									sinsom = SinSIMD( sinsom );

									sinom = AddSIMD( sinom, Four_Epsilons );

									sinom = ReciprocalSIMD( sinom );

									t4 = MulSIMD( sinsom, sinom );

									fltx4 result = MulSIMD( p, t4 );


									// rescale rotation

									sinsom = MulSIMD( sinsom, sinsom );

									fltx4 r = SubSIMD( Four_Ones, sinsom );

									r = MaxSIMD( r, Four_Zeros );

									r = SqrtSIMD( r );


									// keep sign of rotation

									fltx4 cmp = CmpGeSIMD( p, Four_Zeros );

									r = MaskedAssign( cmp, r, NegSIMD( r ) );


									result = __vrlimi(result, r, 1, 0);

									return result;

								}


								// X360

								// assumes t4 contains a float replicated to each slot

								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, const fltx4 &t4 )

								{

									fltx4 sinom = Dot3SIMD( p, p );

									sinom = SqrtSIMD( sinom );

									sinom = MinSIMD( sinom, Four_Ones );

									fltx4 sinsom = ArcSinSIMD( sinom );

									sinsom = MulSIMD( sinsom, t4 );

									sinsom = SinSIMD( sinsom );

									sinom = AddSIMD( sinom, Four_Epsilons );

									sinom = ReciprocalSIMD( sinom );

									fltx4 result = MulSIMD( p, MulSIMD( sinsom, sinom ) );


									// rescale rotation

									sinsom = MulSIMD( sinsom, sinsom );

									fltx4 r = SubSIMD( Four_Ones, sinsom );

									r = MaxSIMD( r, Four_Zeros );

									r = SqrtSIMD( r );


									// keep sign of rotation

									fltx4 cmp = CmpGeSIMD( p, Four_Zeros );

									r = MaskedAssign( cmp, r, NegSIMD( r ) );


									result = __vrlimi(result, r, 1, 0);

									return result;

								}


								#elif defined(_PS3)


								// X360

								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )

								{

									fltx4 sinom = Dot3SIMD( p, p );

									sinom = SqrtSIMD( sinom );

									sinom = MinSIMD( sinom, Four_Ones );

									fltx4 sinsom = ArcSinSIMD( sinom );

									fltx4 t4 = ReplicateX4( t );

									sinsom = MulSIMD( sinsom, t4 );

									sinsom = SinSIMD( sinsom );

									sinom = AddSIMD( sinom, Four_Epsilons );

									sinom = ReciprocalSIMD( sinom );

									t4 = MulSIMD( sinsom, sinom );

									fltx4 result = MulSIMD( p, t4 );


									// rescale rotation

									sinsom = MulSIMD( sinsom, sinsom );

									fltx4 r = SubSIMD( Four_Ones, sinsom );

									r = MaxSIMD( r, Four_Zeros );

									r = SqrtSIMD( r );


									// keep sign of rotation

									r = MaskedAssign( CmpGeSIMD( p, Four_Zeros ), r, NegSIMD( r ) );

									// set just the w component of result

									result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), r, result );


									return result;

								}


								// X360

								// assumes t4 contains a float replicated to each slot

								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, const fltx4 &t4 )

								{

									fltx4 sinom = Dot3SIMD( p, p );

									sinom = SqrtSIMD( sinom );

									sinom = MinSIMD( sinom, Four_Ones );

									fltx4 sinsom = ArcSinSIMD( sinom );

									sinsom = MulSIMD( sinsom, t4 );

									sinsom = SinSIMD( sinsom );

									sinom = AddSIMD( sinom, Four_Epsilons );

									sinom = ReciprocalSIMD( sinom );

									fltx4 result = MulSIMD( p, MulSIMD( sinsom, sinom ) );


									// rescale rotation

									sinsom = MulSIMD( sinsom, sinsom );

									fltx4 r = SubSIMD( Four_Ones, sinsom );

									r = MaxSIMD( r, Four_Zeros );

									r = SqrtSIMD( r );


									// keep sign of rotation

									r = MaskedAssign( CmpGeSIMD( p, Four_Zeros ), r, NegSIMD( r ) );

									// set just the w component of result

									result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), r, result );


									return result;

								}


								#else


								// SSE and STDC

								FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )

								{

									float r;

									fltx4 q;


									// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to

									// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.

									float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );

									sinom = fmin( sinom, 1.f );


									float sinsom = sin( asin( sinom ) * t );


									t = sinsom / (sinom + FLT_EPSILON);

									SubFloat( q, 0 ) = t * SubFloat( p, 0 );

									SubFloat( q, 1 ) = t * SubFloat( p, 1 );

									SubFloat( q, 2 ) = t * SubFloat( p, 2 );


									// rescale rotation

									r = 1.0f - sinsom * sinsom;


									// Assert( r >= 0 );

									if (r < 0.0f)

										r = 0.0f;

									r = sqrt( r );


									// keep sign of rotation

									SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );

									return q;

								}


								#endif


								//-----------------------------------------------------------------------------

								// Quaternion sphereical linear interpolation

								//-----------------------------------------------------------------------------

								#ifndef _X360


								// SSE and STDC

								FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )

								{

									float omega, cosom, sinom, sclp, sclq;


									fltx4 result;


									// 0.0 returns p, 1.0 return q.

									cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) +

										SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );


									if ( (1.0f + cosom ) > 0.000001f )

									{

										if ( (1.0f - cosom ) > 0.000001f )

										{

											omega = acos( cosom );

											sinom = sin( omega );

											sclp = sin( (1.0f - t)*omega) / sinom;

											sclq = sin( t*omega ) / sinom;

										}

										else

										{

											// TODO: add short circuit for cosom == 1.0f?

											sclp = 1.0f - t;

											sclq = t;

										}

										SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 );

										SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 );

										SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 );

										SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 );

									}

									else

									{

										SubFloat( result, 0 ) = -SubFloat( q, 1 );

										SubFloat( result, 1 ) =  SubFloat( q, 0 );

										SubFloat( result, 2 ) = -SubFloat( q, 3 );

										SubFloat( result, 3 ) =  SubFloat( q, 2 );

										sclp = sin( (1.0f - t) * (0.5f * M_PI));

										sclq = sin( t * (0.5f * M_PI));

										SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 );

										SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 );

										SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 );

									}


									return result;

								}


								#else


								// X360

								FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )

								{

									return XMQuaternionSlerp( p, q, t );

								}


								#endif


								FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )

								{

									fltx4 q2, result;

									q2 = QuaternionAlignSIMD( p, q );

									result = QuaternionSlerpNoAlignSIMD( p, q2, t );

									return result;

								}


								#endif // ALLOW_SIMD_QUATERNION_MATH


								/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are

								/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.

								class ALIGN16 FourQuaternions

								{

								public:

									fltx4 x,y,z,w;


									FourQuaternions(void)

									{

									}


									FourQuaternions( const fltx4 &_x,

													 const fltx4 &_y,

													 const fltx4 &_z,

													 const fltx4 &_w )

													 : x(_x), y(_y), z(_z), w(_w)

									{}


								#if !defined(__SPU__)

									// four rotations around the same axis. angles should be in radians.

									FourQuaternions ( const fltx4 &axis,

										const float &angle0, const float &angle1, const float &angle2, const float &angle3)

									{

										FromAxisAndAngles( axis, angle0, angle1, angle2, angle3 );

									}

								#endif


									FourQuaternions( FourQuaternions const &src )

									{

										x=src.x;

										y=src.y;

										z=src.z;

										w=src.w;

									}


									FORCEINLINE void operator=( FourQuaternions const &src )

									{

										x=src.x;

										y=src.y;

										z=src.z;

										w=src.w;

									}


									/// this = this * q;

									FORCEINLINE FourQuaternions Mul( FourQuaternions const &q ) const;


									/// negate the vector part

									FORCEINLINE FourQuaternions Conjugate() const;


									/// for a quaternion representing a rotation of angle theta, return

									/// one of angle s*theta

									/// scale is four floats -- one for each quat

									FORCEINLINE FourQuaternions ScaleAngle( const fltx4 &scale ) const;


									/// ret = this * ( s * q )

									/// In other words, for a quaternion representing a rotation of angle theta, return

									/// one of angle s*theta

									/// s is four floats in a fltx4 -- one for each quaternion

									FORCEINLINE FourQuaternions MulAc( const fltx4 &s, const FourQuaternions &q ) const;


									/// ret = ( s * this ) * q

									FORCEINLINE FourQuaternions ScaleMul( const fltx4 &s, const FourQuaternions &q ) const;


									/// Slerp four quaternions at once, FROM me TO the specified out.

									FORCEINLINE FourQuaternions Slerp( const FourQuaternions &to, const fltx4 &t );


									FORCEINLINE FourQuaternions SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t );


								#if !defined(__SPU__)

									/// given an axis and four angles, populate this quaternion with the equivalent rotations

									/// (ie, make these four quaternions represent four different rotations around the same axis)

									/// angles should be in RADIANS

									FORCEINLINE FourQuaternions &FromAxisAndAngles( const fltx4 &axis,

										const float &angle0, const float &angle1,	const float &angle2, const float &angle3 );

									FORCEINLINE FourQuaternions &FromAxisAndAngles( const fltx4 &axis, const fltx4 &angles );

									// one convenience imp if you're doing this in degrees

									FORCEINLINE FourQuaternions &FromAxisAndAnglesInDegrees( const fltx4 &axis, const fltx4 &angles )

									{

										return FromAxisAndAngles( axis, MulSIMD(angles, Four_DegToRad));

									}

								#endif


									// rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors.

									FORCEINLINE void RotateFourVectors( FourVectors * RESTRICT vecs ) const RESTRICT ;


									/// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op.

									/// all 4 vectors must be 128 bit boundary

									FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)

									{

								#if defined( _X360 )

										fltx4 tx = LoadAlignedSIMD(a);

										fltx4 ty = LoadAlignedSIMD(b);

										fltx4 tz = LoadAlignedSIMD(c);

										fltx4 tw = LoadAlignedSIMD(d);

										fltx4 r0 = __vmrghw(tx, tz);

										fltx4 r1 = __vmrghw(ty, tw);

										fltx4 r2 = __vmrglw(tx, tz);

										fltx4 r3 = __vmrglw(ty, tw);


										x = __vmrghw(r0, r1);

										y = __vmrglw(r0, r1);

										z = __vmrghw(r2, r3);

										w = __vmrglw(r2, r3);

								#else

										x		= LoadAlignedSIMD(a);

										y		= LoadAlignedSIMD(b);

										z		= LoadAlignedSIMD(c);

										w		= LoadAlignedSIMD(d);

										// now, matrix is:

										// x y z w

										// x y z w

										// x y z w

										// x y z w

										TransposeSIMD(x, y, z, w);

								#endif

									}


									FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned * RESTRICT a,

										const QuaternionAligned * RESTRICT b,

										const QuaternionAligned * RESTRICT c,

										const QuaternionAligned * RESTRICT d)

									{

										LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base() );

									}


									/// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions,

									/// performing transpose op.

									/// all 4 vectors must be 128 bit boundary

									FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned *qs)

									{

								#if defined( _X360 )

										fltx4 tx = LoadAlignedSIMD(qs++);

										fltx4 ty = LoadAlignedSIMD(qs++);

										fltx4 tz = LoadAlignedSIMD(qs++);

										fltx4 tw = LoadAlignedSIMD(qs);

										fltx4 r0 = __vmrghw(tx, tz);

										fltx4 r1 = __vmrghw(ty, tw);

										fltx4 r2 = __vmrglw(tx, tz);

										fltx4 r3 = __vmrglw(ty, tw);


										x = __vmrghw(r0, r1);

										y = __vmrglw(r0, r1);

										z = __vmrghw(r2, r3);

										w = __vmrglw(r2, r3);

								#else

										x		= LoadAlignedSIMD(qs++);

										y		= LoadAlignedSIMD(qs++);

										z		= LoadAlignedSIMD(qs++);

										w		= LoadAlignedSIMD(qs++);

										// now, matrix is:

										// x y z w

										// x y z w

										// x y z w

										// x y z w

										TransposeSIMD(x, y, z, w);

								#endif

									}


									// Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory.

									FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *a, QuaternionAligned *b, QuaternionAligned *c, QuaternionAligned *d)

									{

								#if defined( _X360 )

										fltx4 r0 = __vmrghw(x, z);

										fltx4 r1 = __vmrghw(y, w);

										fltx4 r2 = __vmrglw(x, z);

										fltx4 r3 = __vmrglw(y, w);


										fltx4 rx = __vmrghw(r0, r1);

										fltx4 ry = __vmrglw(r0, r1);

										fltx4 rz = __vmrghw(r2, r3);

										fltx4 rw = __vmrglw(r2, r3);


										StoreAlignedSIMD(a, rx);

										StoreAlignedSIMD(b, ry);

										StoreAlignedSIMD(c, rz);

										StoreAlignedSIMD(d, rw);

								#else

										fltx4 dupes[4] = { x, y, z, w };

										TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]);

										StoreAlignedSIMD(a, dupes[0]);

										StoreAlignedSIMD(b, dupes[1]);

										StoreAlignedSIMD(c, dupes[2]);

										StoreAlignedSIMD(d, dupes[3]);

								#endif

									}


									// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.

									FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *qs)

									{

								#if defined( _X360 )

										fltx4 r0 = __vmrghw(x, z);

										fltx4 r1 = __vmrghw(y, w);

										fltx4 r2 = __vmrglw(x, z);

										fltx4 r3 = __vmrglw(y, w);


										fltx4 rx = __vmrghw(r0, r1);

										fltx4 ry = __vmrglw(r0, r1);

										fltx4 rz = __vmrghw(r2, r3);

										fltx4 rw = __vmrglw(r2, r3);


										StoreAlignedSIMD(qs, rx);

										StoreAlignedSIMD(++qs, ry);

										StoreAlignedSIMD(++qs, rz);

										StoreAlignedSIMD(++qs, rw);

								#else

										SwizzleAndStoreAligned(qs, qs+1, qs+2, qs+3);

								#endif

									}


									// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.

									// The mask specifies which of the quaternions are actually written out -- each

									// word in the fltx4 should be all binary ones or zeros. Ones means the corresponding

									// quat will be written.

									FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned * RESTRICT qs, const bi32x4 &controlMask)

									{

										fltx4 originals[4];

										originals[0] = LoadAlignedSIMD(qs);

										originals[1] = LoadAlignedSIMD(qs+1);

										originals[2] = LoadAlignedSIMD(qs+2);

										originals[3] = LoadAlignedSIMD(qs+3);


										bi32x4 masks[4] = { SplatXSIMD(controlMask),

											SplatYSIMD(controlMask),

											SplatZSIMD(controlMask),

											SplatWSIMD(controlMask)	};


								#if defined( _X360 )

										fltx4 r0 = __vmrghw(x, z);

										fltx4 r1 = __vmrghw(y, w);

										fltx4 r2 = __vmrglw(x, z);

										fltx4 r3 = __vmrglw(y, w);


										fltx4 rx = __vmrghw(r0, r1);

										fltx4 ry = __vmrglw(r0, r1);

										fltx4 rz = __vmrghw(r2, r3);

										fltx4 rw = __vmrglw(r2, r3);

								#else

										fltx4 rx = x;

										fltx4 ry = y;

										fltx4 rz = z;

										fltx4 rw = w;

										TransposeSIMD( rx, ry, rz, rw );

								#endif


										StoreAlignedSIMD( qs+0, MaskedAssign(masks[0], rx, originals[0]));

										StoreAlignedSIMD( qs+1, MaskedAssign(masks[1], ry, originals[1]));

										StoreAlignedSIMD( qs+2, MaskedAssign(masks[2], rz, originals[2]));

										StoreAlignedSIMD( qs+3, MaskedAssign(masks[3], rw, originals[3]));

									}

								};


								FORCEINLINE FourQuaternions FourQuaternions::Conjugate(  ) const

								{

									return FourQuaternions( NegSIMD(x),  NegSIMD(y),  NegSIMD(z),  w );

								}


								FORCEINLINE const fltx4 Dot(const FourQuaternions &a, const FourQuaternions &b)

								{

									return

										MaddSIMD(a.x, b.x,

											MaddSIMD(a.y, b.y,

												MaddSIMD(a.z,b.z, MulSIMD(a.w,b.w))

											)

										);

								}


								FORCEINLINE const FourQuaternions Madd(const FourQuaternions &a, const fltx4 &scale, const FourQuaternions &c)

								{

									FourQuaternions ret;

									ret.x = MaddSIMD(a.x,scale,c.x);

									ret.y = MaddSIMD(a.y,scale,c.y);

									ret.z = MaddSIMD(a.z,scale,c.z);

									ret.w = MaddSIMD(a.w,scale,c.w);

									return ret;

								}


								FORCEINLINE const FourQuaternions Mul(const FourQuaternions &a, const fltx4 &scale)

								{

									FourQuaternions ret;

									ret.x = MulSIMD(a.x,scale);

									ret.y = MulSIMD(a.y,scale);

									ret.z = MulSIMD(a.z,scale);

									ret.w = MulSIMD(a.w,scale);

									return ret;

								}


								FORCEINLINE const FourQuaternions Add(const FourQuaternions &a,const FourQuaternions &b)

								{

									FourQuaternions ret;

									ret.x = AddSIMD(a.x,b.x);

									ret.y = AddSIMD(a.y,b.y);

									ret.z = AddSIMD(a.z,b.z);

									ret.w = AddSIMD(a.w,b.w);

									return ret;

								}


								FORCEINLINE const FourQuaternions Sub(const FourQuaternions &a,const FourQuaternions &b)

								{

									FourQuaternions ret;

									ret.x = SubSIMD(a.x,b.x);

									ret.y = SubSIMD(a.y,b.y);

									ret.z = SubSIMD(a.z,b.z);

									ret.w = SubSIMD(a.w,b.w);

									return ret;

								}


								FORCEINLINE const FourQuaternions Neg(const FourQuaternions &q)

								{

									FourQuaternions ret;

									ret.x = NegSIMD(q.x);

									ret.y = NegSIMD(q.y);

									ret.z = NegSIMD(q.z);

									ret.w = NegSIMD(q.w);

									return ret;

								}


								FORCEINLINE const FourQuaternions MaskedAssign(const bi32x4 &mask, const FourQuaternions &a, const FourQuaternions &b)

								{

									FourQuaternions ret;

									ret.x = MaskedAssign(mask,a.x,b.x);

									ret.y = MaskedAssign(mask,a.y,b.y);

									ret.z = MaskedAssign(mask,a.z,b.z);

									ret.w = MaskedAssign(mask,a.w,b.w);

									return ret;

								}


								#ifdef DIFFERENT_NATIVE_VECTOR_TYPES

								FORCEINLINE const FourQuaternions MaskedAssign(const fltx4 &mask, const FourQuaternions &a, const FourQuaternions &b)

								{

									return MaskedAssign( ( bi32x4 )mask, a, b );

								}

								#endif


								FORCEINLINE FourQuaternions QuaternionAlign( const FourQuaternions &p, const FourQuaternions &q )

								{

									// decide if one of the quaternions is backwards

									bi32x4 cmp = CmpLtSIMD( Dot(p,q), Four_Zeros );

									return MaskedAssign( cmp, Neg(q), q );

								}


								FORCEINLINE const FourQuaternions QuaternionNormalize( const FourQuaternions &q )

								{

									fltx4 radius = Dot( q, q );

									bi32x4 mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0

									fltx4 invRadius = ReciprocalSqrtSIMD( radius );


									FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius));

									return ret;

								}


								#if !defined(__SPU__)

								FORCEINLINE FourQuaternions &FourQuaternions::FromAxisAndAngles( const fltx4 &axis,

																			   const float &angle0, const float &angle1,	const float &angle2, const float &angle3 )

								{

									return FromAxisAndAngles( axis, LoadGatherSIMD(angle0,angle1,angle2,angle3) );

								}


								FORCEINLINE FourQuaternions &FourQuaternions::FromAxisAndAngles( const fltx4 &axis,

																								const fltx4 &angles )

								{

									// compute the half theta

									fltx4 theta = MulSIMD( angles, Four_PointFives );

									// compute the sine and cosine of each angle simultaneously

									fltx4 vsines; fltx4 vcoses;

									SinCosSIMD( vsines, vcoses, theta );

									// now the sines and coses vectors contain the results for four angles.

									// for each of the angles, splat them out and then swizzle together so

									// as to get a < cos, sin, sin, sin > coefficient vector


									x = MulSIMD( vsines, SplatXSIMD( axis ) ); // sin(t0) * x, sin(t1) * x, etc

									y = MulSIMD( vsines, SplatYSIMD( axis ) );

									z = MulSIMD( vsines, SplatZSIMD( axis ) );

									w = vcoses;


									return *this;

								}

								#endif


								/// this = this * q;

								FORCEINLINE FourQuaternions FourQuaternions::Mul( FourQuaternions const &q ) const

								{

									// W = w1w2 - x1x2 - y1y2 - z1z2

									FourQuaternions ret;

									fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask );

									// as we do the multiplication, also do a dot product, so we know whether

									// one of the quats is backwards and if we therefore have to negate at the end

									fltx4 dotProduct = MulSIMD( w, q.w );


									ret.w = MulSIMD( w, q.w ); // W = w1w2

									ret.x = MulSIMD( w, q.x ); // X = w1x2

									ret.y = MulSIMD( w, q.y ); // Y = w1y2

									ret.z = MulSIMD( w, q.z ); // Z = w1z2


									dotProduct = MaddSIMD( x, q.x, dotProduct );

									ret.w = MsubSIMD( x, q.x, ret.w ); // W = w1w2 - x1x2

									ret.x = MaddSIMD( x, q.w, ret.x ); // X = w1x2 + x1w2

									ret.y = MsubSIMD( x, q.z, ret.y ); // Y = w1y2 - x1z2

									ret.z = MaddSIMD( x, q.y, ret.z ); // Z = w1z2 + x1y2


									dotProduct = MaddSIMD( y, q.y, dotProduct );

									ret.w = MsubSIMD( y, q.y, ret.w ); // W = w1w2 - x1x2 - y1y2

									ret.x = MaddSIMD( y, q.z, ret.x ); // X = w1x2 + x1w2 + y1z2

									ret.y = MaddSIMD( y, q.w, ret.y ); // Y = w1y2 - x1z2 + y1w2

									ret.z = MsubSIMD( y, q.x, ret.z ); // Z = w1z2 + x1y2 - y1x2


									dotProduct = MaddSIMD( z, q.z, dotProduct );

									ret.w = MsubSIMD( z, q.z, ret.w ); // W = w1w2 - x1x2 - y1y2 - z1z2

									ret.x = MsubSIMD( z, q.y, ret.x ); // X = w1x2 + x1w2 + y1z2 - z1y2

									ret.y = MaddSIMD( z, q.x, ret.y ); // Y = w1y2 - x1z2 + y1w2 + z1x2

									ret.z = MaddSIMD( z, q.w, ret.z ); // Z = w1z2 + x1y2 - y1x2 + z1w2


									fltx4 Zero = Four_Zeros;

									bi32x4 control = CmpLtSIMD( dotProduct, Four_Zeros );

									signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0

									ret.w = XorSIMD( signMask, ret.w );

									ret.x = XorSIMD( signMask, ret.x );

									ret.y = XorSIMD( signMask, ret.y );

									ret.z = XorSIMD( signMask, ret.z );


									return ret;

								}


								FORCEINLINE void FourQuaternions::RotateFourVectors( FourVectors * RESTRICT vecs ) const RESTRICT

								{

									fltx4 tmpX, tmpY, tmpZ, tmpW;

									fltx4 outX, outY, outZ;


									tmpX = SubSIMD( MaddSIMD( w, vecs->x , MulSIMD( y, vecs->z ) ),

													MulSIMD( z, vecs->y ) );


									tmpY = SubSIMD( MaddSIMD( w, vecs->y, MulSIMD( z, vecs->x ) ),

													MulSIMD( x, vecs->z ) );


									tmpZ = SubSIMD( MaddSIMD( w, vecs->z, MulSIMD( x, vecs->y ) ),

													MulSIMD( y, vecs->x ) );


									tmpW = AddSIMD( MaddSIMD( x, vecs->x, MulSIMD( y, vecs->y ) ),

												    MulSIMD( z, vecs->z ) );


									outX = AddSIMD( SubSIMD( MaddSIMD( tmpW, x, MulSIMD( tmpX, w ) ),

															 MulSIMD( tmpY, z ) ),

													MulSIMD( tmpZ, y ) );


									outY = AddSIMD( SubSIMD( MaddSIMD( tmpW, y, MulSIMD( tmpY, w ) ),

															 MulSIMD( tmpZ, x ) ),

													MulSIMD( tmpX, z ) );


									outZ = AddSIMD( SubSIMD( MaddSIMD( tmpW, z, MulSIMD( tmpZ, w ) ),

															 MulSIMD( tmpX, y ) ),

													MulSIMD( tmpY, x ) );


									// although apparently redundant, assigning the results to intermediate local variables

									// seems to improve code scheduling slightly in SN.

									vecs->x = outX;

									vecs->y = outY;

									vecs->z = outZ;

								}


								/*


								void QuaternionScale( const Quaternion &p, float t, Quaternion &q )

								{

									Assert( s_bMathlibInitialized );


									float r;


									// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to

									// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.

									float sinom = sqrt( DotProduct( &p.x, &p.x ) );

									sinom = min( sinom, 1.f );


									float sinsom = sin( asin( sinom ) * t );


									t = sinsom / (sinom + FLT_EPSILON);

									VectorScale( &p.x, t, &q.x );


									// rescale rotation

									r = 1.0f - sinsom * sinsom;


									// Assert( r >= 0 );

									if (r < 0.0f)

									r = 0.0f;

									r = sqrt( r );


									// keep sign of rotation

									if (p.w < 0)

									q.w = -r;

									else

									q.w = r;


									Assert( q.IsValid() );


									return;

								}


								*/


								FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle( const fltx4 &scale ) const

								{

									FourQuaternions ret;

									static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };

									const fltx4 Zero = Four_Zeros;

									fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask );

									// work out if there are any tiny scales or angles, which are unstable

									bi32x4 tinyAngles = CmpGtSIMD(w,OneMinusEpsilon);

									bi32x4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down


									// figure out the theta

									fltx4 angles = ArcCosSIMD( w );


									// test also if w > -1

									fltx4 negativeWs = XorSIMD(signMask, w);

									tinyAngles = OrSIMD( CmpGtSIMD(negativeWs, OneMinusEpsilon ), tinyAngles );


									// meanwhile start working on computing the dot product of the

									// vector component, and trust in the scheduler to interleave them

									fltx4 vLenSq = MulSIMD( x, x );

									vLenSq = MaddSIMD( y, y, vLenSq );

									vLenSq = MaddSIMD( z, z, vLenSq );


									// scale the angles

									angles = MulSIMD( angles, scale );


									// clear out the sign mask where w>=0

									signMask = MaskedAssign( negativeRotations, signMask, Zero);


									// work out the new w component and vector length

									fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies

									fltx4 sine;

									SinCosSIMD( sine, ret.w, angles );

									ret.x = MulSIMD( x, vLenRecip ); // renormalize so the vector length + w = 1

									ret.y = MulSIMD( y, vLenRecip ); // renormalize so the vector length + w = 1

									ret.z = MulSIMD( z, vLenRecip ); // renormalize so the vector length + w = 1

									ret.x = MulSIMD( ret.x, sine );

									ret.y = MulSIMD( ret.y, sine );

									ret.z = MulSIMD( ret.z, sine );


									// negate where necessary

									ret.x = XorSIMD(ret.x, signMask);

									ret.y = XorSIMD(ret.y, signMask);

									ret.z = XorSIMD(ret.z, signMask);

									ret.w = XorSIMD(ret.w, signMask);


									// finally, toss results from where cos(theta) is close to 1 -- these are non rotations.

									ret.x = MaskedAssign(tinyAngles, x, ret.x);

									ret.y = MaskedAssign(tinyAngles, y, ret.y);

									ret.z = MaskedAssign(tinyAngles, z, ret.z);

									ret.w = MaskedAssign(tinyAngles, w, ret.w);


									return ret;

								}


								//-----------------------------------------------------------------------------

								// Purpose: return = this * ( s * q )

								// In other words, for a quaternion representing a rotation of angle theta, return

								// one of angle s*theta

								// s is four floats in a fltx4 -- one for each quaternion

								//-----------------------------------------------------------------------------


								FORCEINLINE FourQuaternions FourQuaternions::MulAc( const fltx4 &s, const FourQuaternions &q ) const

								{

									/*

									void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt )

									{

										Quaternion p1, q1;


										QuaternionScale( q, s, q1 );

										QuaternionMult( p, q1, p1 );

										QuaternionNormalize( p1 );

										qt[0] = p1[0];

										qt[1] = p1[1];

										qt[2] = p1[2];

										qt[3] = p1[3];

									}

									*/


									return Mul(q.ScaleAngle(s));

								}


								FORCEINLINE FourQuaternions FourQuaternions::ScaleMul( const fltx4 &s, const FourQuaternions &q ) const

								{

									return ScaleAngle(s).Mul(q);

								}


								FORCEINLINE FourQuaternions FourQuaternions::Slerp( const FourQuaternions &originalto, const fltx4 &t )

								{

									FourQuaternions ret;

									static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };


									// align if necessary.


									// actually, before we even do that, start by computing the dot product of

									// the quaternions. it has lots of dependent ops and we can sneak it into

									// the pipeline bubbles as we figure out alignment. Of course we don't know

									// yet if we need to realign, so compute them both -- there's plenty of

									// space in the bubbles. They're roomy, those bubbles.

									fltx4 cosineOmega;

								#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique...

									FourQuaternions to;

									{

										fltx4 diffs[4], sums[4], originalToNeg[4];

										fltx4 dotIfAligned, dotIfNotAligned;


										// compute negations of the TO quaternion.

										originalToNeg[0] = NegSIMD(originalto.x);

										originalToNeg[1] = NegSIMD(originalto.y);

										originalToNeg[2] = NegSIMD(originalto.z);

										originalToNeg[3] = NegSIMD(originalto.w);


											dotIfAligned    = MulSIMD(x, originalto.x);

											dotIfNotAligned = MulSIMD(x, originalToNeg[0]);


										diffs[0] = SubSIMD(x, originalto.x);

										diffs[1] = SubSIMD(y, originalto.y);

										diffs[2] = SubSIMD(z, originalto.z);

										diffs[3] = SubSIMD(w, originalto.w);


										sums[0] = AddSIMD(x, originalto.x);

										sums[1] = AddSIMD(y, originalto.y);

										sums[2] = AddSIMD(z, originalto.z);

										sums[3] = AddSIMD(w, originalto.w);


											dotIfAligned    = MaddSIMD(y, originalto.y,     dotIfAligned);

											dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned);


										fltx4 diffsDot, sumsDot;


										diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2

										sumsDot  = MulSIMD(sums[0],  sums[0] ); // x^2

											// do some work on the dot products while letting the multiplies cook

											dotIfAligned    = MaddSIMD(z, originalto.z,     dotIfAligned);

											dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned);


										diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2

										sumsDot  = MaddSIMD(sums[1],  sums[1],  sumsDot );

										diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2

										sumsDot  = MaddSIMD(sums[2],  sums[2],  sumsDot );

										diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2

										sumsDot  = MaddSIMD(sums[3],  sums[3],  sumsDot );

											// do some work on the dot products while letting the multiplies cook

											dotIfAligned    = MaddSIMD(w, originalto.w,     dotIfAligned);

											dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned);


										// are the differences greater than the sums?

										// if so, we need to negate that quaternion

										fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere

										to.x = MaskedAssign(mask, originalToNeg[0], originalto.x);

										to.y = MaskedAssign(mask, originalToNeg[1], originalto.y);

										to.z = MaskedAssign(mask, originalToNeg[2], originalto.z);

										to.w = MaskedAssign(mask, originalToNeg[3], originalto.w);


										cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned);

									}


									// right, now to is aligned to be the short way round, and we computed

									// the dot product while we were figuring all that out.

								#else

									const FourQuaternions &to = originalto;

									cosineOmega = MulSIMD(x, to.x);

									cosineOmega = MaddSIMD(y, to.y, cosineOmega);

									cosineOmega = MaddSIMD(z, to.z, cosineOmega);

									cosineOmega = MaddSIMD(w, to.w, cosineOmega);

								#endif


									fltx4 Zero = Four_Zeros;

									bi32x4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero);

									// fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones );

									fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); // contains a one in the sign bit -- xor against a number to negate it

									fltx4 sinOmega = Four_Ones;


									// negate cosineOmega where necessary

									cosineOmega = MaskedAssign( cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega );

									fltx4 oneMinusT = SubSIMD(Four_Ones,t);

									bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps


									// figure out the sin component of the diff quaternion.

									// since sin^2(t) + cos^2(t) = 1...

									sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)

									fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega );  // 1/sin(t)

									sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)


									// use the arctangent technique to work out omega from  tan^-1(sin/cos)

									fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);


									// alpha = sin(omega * (1-T))/sin(omega)

									// beta  = sin(omega * T)/sin(omega)

									fltx4 alpha = MulSIMD(omega, oneMinusT);  // w(1-T)

									fltx4 beta  = MulSIMD(omega, t);		  // w(T)

									signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero);


									alpha = SinSIMD(alpha);  // sin(w(1-T))

									beta  = SinSIMD(beta);   // sin(wT)


									alpha = MulSIMD(alpha, invSinOmega);

									beta  = MulSIMD(beta, invSinOmega);


									// depending on whether the dot product was less than zero, negate beta, or not

									beta  = XorSIMD(beta, signMask);


									// mask out singularities (where omega = 1)

									alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );

									beta  = MaskedAssign( bCosOmegaLessThanOne, beta , t );


								 	ret.x = MulSIMD(x, alpha);

									ret.y = MulSIMD(y, alpha);

									ret.z = MulSIMD(z, alpha);

									ret.w = MulSIMD(w, alpha);


									ret.x = MaddSIMD(to.x, beta, ret.x);

									ret.y = MaddSIMD(to.y, beta, ret.y);

									ret.z = MaddSIMD(to.z, beta, ret.z);

									ret.w = MaddSIMD(to.w, beta, ret.w);


									return ret;

								}


								FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t )

								{

									FourQuaternions ret;

									static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };


									// align if necessary.


									// actually, before we even do that, start by computing the dot product of

									// the quaternions. it has lots of dependent ops and we can sneak it into

									// the pipeline bubbles as we figure out alignment. Of course we don't know

									// yet if we need to realign, so compute them both -- there's plenty of

									// space in the bubbles. They're roomy, those bubbles.

									fltx4 cosineOmega;


									const FourQuaternions &to = originalto;

									cosineOmega = MulSIMD(x, to.x);

									cosineOmega = MaddSIMD(y, to.y, cosineOmega);

									cosineOmega = MaddSIMD(z, to.z, cosineOmega);

									cosineOmega = MaddSIMD(w, to.w, cosineOmega);


									fltx4 sinOmega = Four_Ones;


									fltx4 oneMinusT = SubSIMD(Four_Ones,t);

									bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps


									// figure out the sin component of the diff quaternion.

									// since sin^2(t) + cos^2(t) = 1...

									sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)

									fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega );  // 1/sin(t)

									sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)


									// use the arctangent technique to work out omega from  tan^-1(sin/cos)

									fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);


									// alpha = sin(omega * (1-T))/sin(omega)

									// beta  = sin(omega * T)/sin(omega)

									fltx4 alpha = MulSIMD(omega, oneMinusT);  // w(1-T)

									fltx4 beta  = MulSIMD(omega, t);		  // w(T)

									alpha = SinSIMD(alpha);  // sin(w(1-T))

									beta  = SinSIMD(beta);   // sin(wT)

									alpha = MulSIMD(alpha, invSinOmega);

									beta  = MulSIMD(beta, invSinOmega);


									// mask out singularities (where omega = 1)

									alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );

									beta  = MaskedAssign( bCosOmegaLessThanOne, beta , t );


									ret.x = MulSIMD(x, alpha);

									ret.y = MulSIMD(y, alpha);

									ret.z = MulSIMD(z, alpha);

									ret.w = MulSIMD(w, alpha);


									ret.x = MaddSIMD(to.x, beta, ret.x);

									ret.y = MaddSIMD(to.y, beta, ret.y);

									ret.z = MaddSIMD(to.z, beta, ret.z);

									ret.w = MaddSIMD(to.w, beta, ret.w);


									return ret;

								}


								/***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function:

								inline void FourVectors::RotateBy( const FourQuaternions &quats )

								{

									quats.RotateFourVectors( this );

								}

								*/


								#endif // SSEQUATMATH_H