//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose:
//
// $NoKeywords: $
//
//=============================================================================//

#ifndef COMPRESSED_VECTOR_H
#define COMPRESSED_VECTOR_H

#ifdef _WIN32
#pragma once
#endif

#include <math.h>
#include <float.h>

// For vec_t, put this somewhere else?
#include "basetypes.h"

// For rand(). We really need a library!
#include <stdlib.h>

#include "tier0/dbg.h"
#include "mathlib/vector.h"

#include "mathlib/mathlib.h"
#include "mathlib/ssemath.h"
#ifdef _PS3
#if defined(__SPU__)
#include <spu_intrinsics.h>
#include <vmx2spu.h>
#endif
#include <vectormath/cpp/vectormath_aos.h>
#endif

#if defined( _X360 )
#pragma bitfield_order( push, lsb_to_msb )
#elif defined( _PS3 )
#pragma ms_struct on
#pragma reverse_bitfields on
#endif

#ifdef OSX
#pragma GCC diagnostic ignored "-Wtautological-compare"
#endif

class Quaternion48;


FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec );


//=========================================================
// fit a 3D vector into 32 bits
//=========================================================

class Vector32
{
public:
	// Construction/destruction:
	Vector32(void);
	Vector32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	Vector32& operator=(const Vector &vOther);
	operator Vector ();

private:
	unsigned short x:10;
	unsigned short y:10;
	unsigned short z:10;
	unsigned short exp:2;
};

inline Vector32& Vector32::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	static float expScale[4] = { 4.0f, 16.0f, 32.0f, 64.0f };

	float fmax = MAX( fabs( vOther.x ), fabs( vOther.y ) );
	fmax = fpmax( fmax, fabs( vOther.z ) );

	// pick the smallest scale bucket that still contains the largest component
	for (exp = 0; exp < 3; exp++)
	{
		if (fmax < expScale[exp])
			break;
	}
	Assert( fmax < expScale[exp] );

	float fexp = 512.0f / expScale[exp];

	x = clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
	y = clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
	z = clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
	return *this;
}


inline Vector32::operator Vector ()
{
	Vector tmp;

	static float expScale[4] = { 4.0f, 16.0f, 32.0f, 64.0f };

	float fexp = expScale[exp] / 512.0f;

	tmp.x = (((int)x) - 512) * fexp;
	tmp.y = (((int)y) - 512) * fexp;
	tmp.z = (((int)z) - 512) * fexp;
	return tmp;
}
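
// Usage sketch (illustrative only, not part of the original header): Vector32 stores
// each component as a 10-bit integer plus a shared 2-bit scale, so a round trip
// quantizes with a step of expScale[exp]/512 (e.g. 1/128 when all components fit in +/-4).
#if 0
	Vector   v( 1.5f, -2.25f, 3.0f );
	Vector32 packed;
	packed = v;                  // quantize into 32 bits (exp = 0 here, since max |component| < 4)
	Vector restored = packed;    // dequantize; each component within ~0.0078 of the input
#endif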


//=========================================================
// Fit a unit vector into 32 bits
//=========================================================

class Normal32
{
public:
	// Construction/destruction:
	Normal32(void);
	Normal32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	Normal32& operator=(const Vector &vOther);
	operator Vector ();

private:
	unsigned short x:15;
	unsigned short y:15;
	unsigned short zneg:1;
};


inline Normal32& Normal32::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	x = clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 );
	y = clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 );
	zneg = (vOther.z < 0);
	//x = vOther.x;
	//y = vOther.y;
	//z = vOther.z;
	return *this;
}


inline Normal32::operator Vector ()
{
	Vector tmp;

	tmp.x = ((int)x - 16384) * (1 / 16384.0);
	tmp.y = ((int)y - 16384) * (1 / 16384.0);
	// z's magnitude is implied by unit length; only its sign is stored
	tmp.z = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y );
	if (zneg)
		tmp.z = -tmp.z;
	return tmp;
}
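
// Worked example (illustrative): the encoding assumes a unit-length input, since z is
// reconstructed from x and y. Packing (0.6, 0, 0.8) stores x = (int)(0.6*16384) + 16384
// = 26214, y = 16384, zneg = 0; decoding yields z = sqrt(1 - 0.6^2 - 0^2), roughly 0.8.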


//=========================================================
// 64 bit Quaternion
//=========================================================

class Quaternion64
{
public:
	// Construction/destruction:
	Quaternion64(void);
	Quaternion64(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion64 &vOther);
	Quaternion64& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assuming 16-byte alignment

private:
	Quaternion64( uint64 xx, uint64 yy, uint64 zz, uint64 ww ) : x(xx), y(yy), z(zz), wneg(ww) {} // strictly for static construction
	uint64 x:21;
	uint64 y:21;
	uint64 z:21;
	uint64 wneg:1;
};


inline Quaternion64::operator Quaternion () const
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = LoadUnalignedSIMD();
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;
#else
	Quaternion tmp;

	// shift to [-1048576, 1048575], then scale down slightly so that -1.0 < x < 1.0
	tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
	tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
	tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);

	// w's magnitude is implied by unit length; only its sign is stored
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;
#endif
}

inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	x = clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 );
	y = clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 );
	z = clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 );
	wneg = (vOther.w < 0);
	return *this;
}
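
// Worked example (illustrative): packing x = 0.25 stores (int)(0.25 * 1048576) + 1048576
// = 1310720 in 21 bits; unpacking gives (1310720 - 1048576) / 1048576.5 = 0.2499999,
// i.e. about 20 bits of fractional precision per component in 64 bits total.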

inline fltx4 Quaternion64::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

#if 1

	const static u32x4 xmask = { 0x00000000, 0x001fffff, 0, 0 }; // bottom 21 bits ( 0 .. 20 ) true
	const static u32x4 ymask = { 0x000003ff, 0xffe00000, 0, 0 }; // bits 21 .. 41 true
	const static u32x4 zmask = { 0x7ffffC00, 0x00000000, 0, 0 }; // bits 42 .. 62 true
	const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; // only bit 63 is true


	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) );
	// fish out x, y, and z and put them into the first words of their respective vec registers.
	// the end type for these registers must be signed for the following subtract, BUT!
	// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
	// the code as present assumes that the fused multiply-add operation has an intermediate
	// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
	// int op because of course 21 bits is right at the limit of floating point precision.
	i32x4 ix = (i32x4) (ShiftLeftByBits<32>(vec_and( qbits, xmask ))); // shift x left by one word so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iy = (i32x4) (ShiftLeftByBits<11>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 11 bits so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iz = (i32x4) (ShiftRightByBits<10>(vec_and( qbits, zmask ))); // shift z right by 10 bits so its 21 bits of precision are sitting at the low end of the first word

	/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
	i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
	i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
	*/
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to combine them into one register above
	// and convert all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2], which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;


#else
	// original version

	/*
	union Qmask {
		struct qq {
			Quaternion64 mask;
			uint64 padding;
		} asQ ;
		u32x4 asVec;

		Qmask( const Quaternion64 &m ) : mask(m) {}
	};
	*/
	const static u32x4 xmask = { 0xfffff800, 0x00000000, 0, 0 }; // top 21 bits ( 0 .. 20 ) true
	const static u32x4 ymask = { 0x000007ff, 0xffc00000, 0, 0 }; // bits 21 .. 41 true
	const static u32x4 zmask = { 0x00000000, 0x003ffffe, 0, 0 }; // bits 42 .. 62 true
	const static u32x4 wmask = { 0x00000000, 0x00000001, 0, 0 }; // only bit 63 is true

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) );
	// fish out x, y, and z and put them into the first words of their respective vec registers.
	// the end type for these registers must be signed for the following subtract, BUT!
	// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
	// the code as present assumes that the fused multiply-add operation has an intermediate
	// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
	// int op because of course 21 bits is right at the limit of floating point precision.
	i32x4 ix = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, xmask ))); // shift x right by eleven bits so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iy = (i32x4) (ShiftLeftByBits<10>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iz = (i32x4) (ShiftLeftByBits<31>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word
	/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
	i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
	i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
	*/
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to combine them into one register above
	// and convert all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2], which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z (argument order fixed to match the other unpack paths in this file)
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#endif

#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
	tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
	tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;
#else // naive implementation (which ends up being faster than the explicit C implementation above)
	const QuaternionAligned q(Quaternion(*this));
	return LoadAlignedSIMD( &q );
#endif
}

//=========================================================
// 48 bit Quaternion
//=========================================================

class Quaternion48
{
public:
	// Construction/destruction:
	Quaternion48(void);
	Quaternion48(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion48& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assuming 16-byte alignment

//private:
	unsigned short x:16;
	unsigned short y:16;
	unsigned short z:15;
	unsigned short wneg:1;
};


inline Quaternion48::operator Quaternion () const
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = UnpackQuaternion48SIMD( this );
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;
#else
	Quaternion tmp;

	tmp.x = ((int)x - 32768) * (1 / 32768.5);
	tmp.y = ((int)y - 32768) * (1 / 32768.5);
	tmp.z = ((int)z - 16384) * (1 / 16384.5);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;
#endif
}


inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	x = clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 );
	y = clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 );
	z = clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 );
	wneg = (vOther.w < 0);
	return *this;
}
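
// Worked note (illustrative): the extra 0.5 in the divisor keeps decoded components
// strictly inside (-1, 1) -- e.g. x = 0 decodes to (0 - 32768) / 32768.5 = -0.99998474 --
// which biases x*x + y*y + z*z slightly below 1 and helps keep the sqrt() argument in
// the w reconstruction non-negative.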

inline fltx4 Quaternion48::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

	const static u32x4 xmask = { 0x00000000, 0xffff0000, 0, 0 };
	const static u32x4 ymask = { 0x0000ffff, 0x00000000, 0, 0 };
	const static u32x4 zmask = { 0x7fff0000, 0x00000000, 0, 0 };
	const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 };

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) );
	// fish out x, y, and z and put them into the first words of their respective vec registers
	i32x4 ix = (i32x4) (ShiftLeftByBits<16>(vec_and( qbits, xmask )));
	i32x4 iy = (i32x4) ((vec_and( qbits, ymask )));
	i32x4 iz = (i32x4) (ShiftRightByBits<16>(vec_and( qbits, zmask )));

	// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to combine them into one register above
	// and convert all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2], which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 15 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 15 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 14 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 15 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 15 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 14 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;


#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 32768) * (1 / 32768.5);
	tmp.y = ((int)y - 32768) * (1 / 32768.5);
	tmp.z = ((int)z - 16384) * (1 / 16384.5);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;
#else // naive implementation (which ends up being faster than the explicit C implementation above)
	const QuaternionAligned q(Quaternion(*this));
	return LoadAlignedSIMD( &q );
#endif
}


//=========================================================
// 48 bit sorted Quaternion
//=========================================================


class Quaternion48S
{
public:
	// Construction/destruction:
	Quaternion48S(void);
	Quaternion48S(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion48S& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	operator fltx4 () const RESTRICT;
//private:
	// shift the quaternion so that the largest value is recreated by the sqrt()
	// abcd maps modulo into quaternion xyzw starting at "offset"
	// "offset" is split into two 1-bit fields so that the data packs into 6 bytes (3 shorts)
	unsigned short a:15;		// first of the 3 consecutive smallest quaternion elements
	unsigned short offsetH:1;	// high bit of "offset"
	unsigned short b:15;
	unsigned short offsetL:1;	// low bit of "offset"
	unsigned short c:15;
	unsigned short dneg:1;		// sign of the largest quaternion element
};

#define SCALE48S 23168.0f		// needs to fit 2*sqrt(0.5) into 15 bits.
#define SHIFT48S 16384			// half of 2^15 bits.
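
// Worked check (illustrative): the three stored components are the smallest of the four,
// so each has magnitude <= sqrt(0.5); sqrt(0.5)*23168 + 16384 = 32766, which fits in 15 bits.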

inline Quaternion48S::operator Quaternion () const
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = *this;
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;
#else
	Quaternion tmp;

	COMPILE_TIME_ASSERT( sizeof( Quaternion48S ) == 6 );

	float *ptmp = &tmp.x;
	int ia = offsetL + offsetH * 2;
	int ib = ( ia + 1 ) % 4;
	int ic = ( ia + 2 ) % 4;
	int id = ( ia + 3 ) % 4;
	ptmp[ia] = ( (int)a - SHIFT48S ) * ( 1.0f / SCALE48S );
	ptmp[ib] = ( (int)b - SHIFT48S ) * ( 1.0f / SCALE48S );
	ptmp[ic] = ( (int)c - SHIFT48S ) * ( 1.0f / SCALE48S );
	ptmp[id] = sqrt( 1.0f - ptmp[ia] * ptmp[ia] - ptmp[ib] * ptmp[ib] - ptmp[ic] * ptmp[ic] );
	if (dneg)
		ptmp[id] = -ptmp[id];

	return tmp;
#endif
}

inline Quaternion48S& Quaternion48S::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	const float *ptmp = &vOther.x;

	// find the largest field; make sure that one is recreated by the sqrt to minimize error
	int i = 0;
	if ( fabs( ptmp[i] ) < fabs( ptmp[1] ) )
	{
		i = 1;
	}
	if ( fabs( ptmp[i] ) < fabs( ptmp[2] ) )
	{
		i = 2;
	}
	if ( fabs( ptmp[i] ) < fabs( ptmp[3] ) )
	{
		i = 3;
	}

	int offset = ( i + 1 ) % 4; // make "a" so that "d" is the largest element
	offsetL = offset & 1;
	offsetH = offset > 1;
	a = clamp( (int)(ptmp[ offset ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	b = clamp( (int)(ptmp[ ( offset + 1 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	c = clamp( (int)(ptmp[ ( offset + 2 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	dneg = ( ptmp[ ( offset + 3 ) % 4 ] < 0.0f );

	return *this;
}
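
// Worked example (illustrative): this is a "smallest three" encoding. For the quaternion
// (x,y,z,w) = (0.1, 0.8, 0.3, 0.5), y is the largest component, so offset = 2 and
// (a,b,c) store (z, w, x) = (0.3, 0.5, 0.1); decoding rebuilds y from sqrt(1 - a^2 - b^2 - c^2).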


// decode onto a SIMD register
inline Quaternion48S::operator fltx4 () const RESTRICT
{
	AssertMsg1( (((uintp) this) & 1) == 0, "Quaternion48S is unaligned at %p\n", this );
#ifdef PLATFORM_PPC // this algorithm depends heavily on the Altivec permute op, for which there is no analogue in SSE. This function should not be used on PC.
	// define some vector constants. the shift-scale will be done as a fused multiply-add,
	// with the scale already distributed onto the shift (the part subtracted)
	const static fltx4 vrSCALE48S = { (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S) };
	const static fltx4 vrSHIFT48S = { ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S };

	// start by hoisting the q48 onto a SIMD word.
	u32x4 source = (u32x4) LoadUnalignedSIMD( this );
	const u32x4 ZERO = (u32x4) LoadZeroSIMD();
	// also hoist the offset into an int word. Hopefully this executes in parallel with the vector ops thanks to SUPERSCALAR!
	const unsigned int offset = offsetL | ( offsetH << 1 );
	const bi32x4 vDMask = (bi32x4) LoadAlignedSIMD( g_SIMD_ComponentMask[(offset+3)%4] ); // lets vsel poke D into the right word

#if 0	// This code can be used to deal with a situation where LoadUnalignedSIMD() fails to properly load
	// vectors starting on halfword boundaries (rather than 32-bit aligned). Because this is a 48-bit
	// structure, sometimes it'll only be 16-bit aligned. I expected that lvlx would always load from
	// a word boundary, requiring me to shift misaligned vectors over by 16 bits, but evidently,
	// lvlx actually works even on halfword boundaries. Who knew!
	// Anyway, this code is still here in case the problem crops up, as a hint to both cause and solution.
	if ( ((unsigned int) this) & 2 )
	{
		source = ShiftLeftByBits<16>(source);
	}
#endif

	// mask out the offset and dneg bits. Because of the packing #pragmas, the one-bit fields are actually at the MSB
	// of the halfwords, not the LSB as you might expect.
	ALIGN16 const static uint32 vMaskTopBits[4] = { 0x80008000, 0x80000000, 0, 0 }; // just the MSB of each of the first three halfwords
	u32x4 abc = AndNotSIMD( (u32x4) LoadAlignedSIMD(vMaskTopBits), source ); // now this is just the A, B, C halfwords.
	// Next, unpack abc as unsigned numbers. We can do this with a permute op. In fact, we can exploit
	// the integer pipe and load the offset while we're loading the SIMD numbers, then use the integer offset to select
	// the permute, which will therefore also perform the rotate that maps abc to their rightful destinations.
	// the masks below are for the vperm instruction, which is a byte-by-byte mapping from source to destination.
	// it's assumed that the FIRST parameter to vperm will be ZERO, and the second the data. (that makes the masks a little clearer)
	// in the simplest case -- imagine each letter below represents one byte; the source vector looks like
	// AABB CCxx xxxx xxxx. We're going to permute it onto the work register like
	// 00AA 00BB 00CC 0000
	ALIGN16 const static uint32 vPermutations[4][4] = {
		// offset = 0 means a->x, b->y, c->z, d->w
		{ 0x00001011, 0x00001213, 0x00001415, 0x00000000 },
		// offset = 1 means a->y, b->z, c->w, d->x
		{ 0x00000000, 0x00001011, 0x00001213, 0x00001415 },
		// offset = 2 means a->z, b->w, c->x, d->y
		{ 0x00001415, 0x00000000, 0x00001011, 0x00001213 },
		// offset = 3 means a->w, b->x, c->y, d->z
		{ 0x00001213, 0x00001415, 0x00000000, 0x00001011 }
	};
	// compute two permutations on the input data: one where the zero-word is always in the w component,
	// which lets us do a 3-way rather than 4-way dot product; and another where the zero-word corresponds to
	// wherever D is supposed to go.
	// Even though this seems redundant, the duplicated work ends up fitting into the pipeline bubbles,
	// and the savings between a 4-way and 3-way dot seem to be about 3ns.
	u32x4 abcfordot = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[0] ) );
	abc = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[offset] ) );

	// turn each of the ints into floats. Because we masked out the one-bit field at the top,
	// we can think of this as a conversion from fixed-point where there's no fractional bit.
	// This is done in line with the shift-scale operation, which is itself fused.
	// we do this twice: once for the vector with the guaranteed zero w-word, and
	// once for the vector rotated by the offset.
	fltx4 vfDest = AndNotSIMD( vDMask, MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abc, 0 ), vrSCALE48S, vrSHIFT48S ) );
	fltx4 vfDestForDot = MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abcfordot, 0 ), vrSCALE48S, vrSHIFT48S );
	// compute magnitude of the vector we know to have a 0 in the w word.
	const fltx4 vDot = Dot3SIMD( vfDestForDot, vfDestForDot );
	// recover the "D" word
	const fltx4 vD = SqrtSIMD( SubSIMD( LoadOneSIMD(), vDot ) );
	// mask D into the converted-and-offset vector, then return.
	return MaskedAssign( vDMask, dneg ? NegSIMD(vD) : vD, vfDest );
#else
	AssertMsg( false, "Quaternion48S::operator fltx4 is slow on this platform and should not be used.\n" );
	QuaternionAligned q( (Quaternion) *this );
	return LoadAlignedSIMD( &q );
#endif
}


//=========================================================
// 32 bit Quaternion
//=========================================================

class Quaternion32
{
public:
	// Construction/destruction:
	Quaternion32(void);
	Quaternion32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion32& operator=(const Quaternion &vOther);
	operator Quaternion ();
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assuming 16-byte alignment

private:
	unsigned int x:11;
	unsigned int y:10;
	unsigned int z:10;
	unsigned int wneg:1;
};


inline Quaternion32::operator Quaternion ()
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = LoadUnalignedSIMD();
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;
#else
	Quaternion tmp;

	tmp.x = ((int)x - 1024) * (1 / 1024.0);
	tmp.y = ((int)y - 512) * (1 / 512.0);
	tmp.z = ((int)z - 512) * (1 / 512.0);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;
#endif
}

inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	x = clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 );
	y = clamp( (int)(vOther.y * 512) + 512, 0, 1023 );
	z = clamp( (int)(vOther.z * 512) + 512, 0, 1023 );
	wneg = (vOther.w < 0);
	return *this;
}
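
// Bit-budget note (illustrative): x:11 + y:10 + z:10 + wneg:1 fills exactly 32 bits.
// The quantization step is 2/2048 for x and 2/1024 (about 0.002) for y and z, versus
// 2/65536 for Quaternion48, so precision is traded heavily for size.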


inline fltx4 Quaternion32::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

	const static u32x4 xmask = { 0x000007ff, 0, 0, 0 };
	const static u32x4 ymask = { 0x001ff800, 0, 0, 0 };
	const static u32x4 zmask = { 0x7fe00000, 0, 0, 0 };
	const static u32x4 wmask = { 0x80000000, 0, 0, 0 };

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) );
	// fish out x, y, and z and put them into the first words of their respective vec registers
	i32x4 ix = (i32x4) ((vec_and( qbits, xmask )));
	i32x4 iy = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, ymask )));
	i32x4 iz = (i32x4) (ShiftRightByBits<21>(vec_and( qbits, zmask )));

	// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to combine them into one register above
	// and convert all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2], which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 10 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 9 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 9 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 10 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 9 ), ONE);	// 9 fractional bits to match y's 10-bit field (mirrors the SPU path above)
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 9 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#else

	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 1024) * (1 / 1024.0);
	tmp.y = ((int)y - 512) * (1 / 512.0);
	tmp.z = ((int)z - 512) * (1 / 512.0);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;

#endif
}


//=========================================================
// 16 bit float
//=========================================================


const int float32bias = 127;
const int float16bias = 15;

const float maxfloat16bits = 65504.0f;

class float16
{
public:
	// float16() {};
	//float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
	float16& operator=(const unsigned short &other) { m_storage.rawWord = other; return *this; }

	void Init() { m_storage.rawWord = 0; }
	// float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; }
	// float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
	// operator unsigned short () { return m_storage.rawWord; }
	// operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
	unsigned short GetBits() const
	{
		return m_storage.rawWord;
	}
	float GetFloat() const
	{
		return Convert16bitFloatTo32bits( m_storage.rawWord );
	}
	void SetFloat( float in )
	{
		m_storage.rawWord = ConvertFloatTo16bits( in );
	}

	bool IsInfinity() const
	{
		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0;
	}
	bool IsNaN() const
	{
		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0;
	}

	bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; }
	bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; }

	// bool operator< (const float other) const { return GetFloat() < other; }
	// bool operator> (const float other) const { return GetFloat() > other; }

	template< bool BRANCHLESS > // allows you to force a branchy/branchless implementation regardless of the current platform
	static unsigned short ConvertFloatTo16bitsNonDefault( float input );
	static float Convert16bitFloatTo32bits( unsigned short input );

	// a special case useful for the pixel writer: take four input float values, which are already in memory (not on registers),
	// convert them all at once and write them sequentially through the output pointer.
	static void ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
		const float *a, const float *b, const float *c, const float *d );

	// unfortunately, function templates can't have default template parameters in 2010-era C++
	inline static unsigned short ConvertFloatTo16bits( float input )
	{	// default to branchless on ppc and branchy on x86
#ifdef PLATFORM_PPC
		return ConvertFloatTo16bitsNonDefault<true>(input);
#else
		return ConvertFloatTo16bitsNonDefault<false>(input);
#endif
	}

protected:
	union float32bits
	{
		float rawFloat;
		uint32 rawAsInt;
		struct
		{
			unsigned int mantissa : 23;
			unsigned int biased_exponent : 8;
			unsigned int sign : 1;
		} bits;
	};

	union float16bits
	{
		unsigned short rawWord;
		struct
		{
			unsigned short mantissa : 10;
			unsigned short biased_exponent : 5;
			unsigned short sign : 1;
		} bits;
	};

	static bool IsNaN( float16bits in )
	{
		return in.bits.biased_exponent == 31 && in.bits.mantissa != 0;
	}
	static bool IsInfinity( float16bits in )
	{
		return in.bits.biased_exponent == 31 && in.bits.mantissa == 0;
	}

	// 0x0001 - 0x03ff
	float16bits m_storage;
};

class float16_with_assign : public float16
{
public:
	float16_with_assign() {}
	float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }

	float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; }
	float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
	// operator unsigned short () const { return m_storage.rawWord; }
	operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
};
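
// Usage sketch (illustrative): float16 follows the IEEE half layout (1 sign, 5 exponent,
// 10 mantissa bits, bias 15), so 1.0f encodes as biased_exponent = 15, mantissa = 0,
// i.e. raw bits 0x3C00.
#if 0
	float16 h;
	h.SetFloat( 1.0f );
	unsigned short bits = h.GetBits();	// 0x3C00
	float back = h.GetFloat();			// 1.0f
#endif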

//=========================================================
// Fit a 3D vector in 48 bits
//=========================================================

class Vector48
{
public:
	// Construction/destruction:
	Vector48(void) {}
	Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); }

	// assignment
	Vector48& operator=(const Vector &vOther);
	operator Vector ();

	const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); }

	float16 x;
	float16 y;
	float16 z;
};

// The uses of isel below are malformed because the first expression is unsigned and thus always >= 0,
// so this whole expression maps to a simple assignment. This was found through a noisy clang
// warning. I am preprocessing this out until it is needed.
#if 0
inline void float16::ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
	const float *a, const float *b, const float *c, const float *d )
{
	COMPILE_TIME_ASSERT( sizeof(float) == 4 );
	// being meant for use on the PPC, this is tuned for that.
	// it is mostly branchless, except for the large outer for loop,
	// since there's enough instructions inside that unrolling is
	// a bad idea. This function is four-at-once to simplify SIMDifying in the
	// future should a convenient SIMD way to decimate emerge.
	// Also, because this is only used for the special case of converting
	// float arrays into float16 GPU textures, this turns denorms into zeroes
	// and infinities into MAXFLTs, since the shader can't deal with nonfinite
	// numbers anyway.

	// alias the input floats onto a union giving their mantissa etc
	const float32bits * const inFloat[4] = {
		reinterpret_cast<const float32bits *>(a),
		reinterpret_cast<const float32bits *>(b),
		reinterpret_cast<const float32bits *>(c),
		reinterpret_cast<const float32bits *>(d) };

	const static unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f

	const static unsigned int SIGNBIT = 0x80000000;


	for ( int i = 0 ; i < 4 ; ++i ) // performs better not unrolled (less stack spilling)
	{
		unsigned int onGPR = inFloat[i]->rawAsInt;

		// make a mask for each word; will be all 1's if the float is
		// negative, all 0s if it is positive. Can do this just by
		// using arithmetic shift to smear out the sign bit.
		int isNegative = ((int) onGPR) >> 31;

		// clamp to be within -maxfloat16bits, maxfloat16bits
		// can't just use isel because IEEE754 floats are sign-magnitude, not two's comp. However,
		// positive IEEE754s can be compared as if they were ints. So, we need to do a little extra
		// work to test the negative case efficiently.
		// clamp to -maxfloat16
#error See above for explanation of why this and other uses of isel in this file are broken.
		int clampedNeg = isel( ((int)(onGPR & ~SIGNBIT)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
			maxfloat16bitsAsInt | SIGNBIT, // -65504.0f
			onGPR );
		// clamp to +maxfloat16
		int clampedPos = isel( ((int)(onGPR)) - maxfloat16bitsAsInt, // in >= maxfloatbits
			maxfloat16bitsAsInt, // 65504.0f
			onGPR );

		// take advantage of PPC's andc operator to effectively do a masked-move
		onGPR = ( clampedNeg & isNegative ) | ( clampedPos & ~isNegative );


		// fish out the input exponent and mantissa fields directly (using the union induces an LHS)
		int inExponent = (onGPR & 0x7f800000) >> 23;
		unsigned int inMantissa = (onGPR & 0x007FFFFF);

		int exponent = inExponent - 127 + 15; // rebias the exponent
		unsigned int mantissa = isel( exponent, inMantissa >> 13, (unsigned) 0 ); // squash the mantissa to zero if the number is too small to represent (no denorms)

		float16bits output;
		// saturate the mantissa if rebiased exponent >= 31 (too big to store)
		output.bits.mantissa = isel( exponent - 31, (unsigned) 0x3ff, mantissa );
		// clamp the exponent to 0..30
		output.bits.biased_exponent = isel( exponent, isel( exponent - 31, 30, exponent ), 0 );
		output.bits.sign = isNegative; // this doesn't lhs, but instead issues the insrdi op to a word on GPR
		pOut[i].m_storage.rawWord = output.rawWord;
	}
}
#endif

#ifdef _X360
#define __cntlzw _CountLeadingZeros
#endif

template< bool BRANCHLESS >
inline unsigned short float16::ConvertFloatTo16bitsNonDefault( float input )
{
	float16bits output;
	float32bits inFloat;
	//if ( !BRANCHLESS ) // x86 code
	{
		if ( input > maxfloat16bits )
			input = maxfloat16bits;
		else if ( input < -maxfloat16bits )
			input = -maxfloat16bits;

		inFloat.rawFloat = input;
	}
	/*
	// The use of isel is incorrect because the first expression is unsigned and therefore always passes
	// the test.
	else // PPC code
	{
		// force the float onto the stack and then a GPR so we eat the LHS only once.
		// you can't just write to one union member and then read back another;
		// the compiler is inconsistent about supporting that kind of type-punning.
		// (ie, it will work in one file, but not another.)
		memcpy(&inFloat.rawFloat, &input, sizeof(inFloat.rawFloat));
		// inFloat.rawFloat = input;
		// clamp using the GPR
		{
			const unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
			// clamp to be <= maxfloat16bits
			uint32 &rawint = inFloat.rawAsInt; // <--- lhs
			if ( rawint & 0x80000000 ) // negative
			{
				// because floats are sign-magnitude, not two's comp, need to
				// flip the int positive briefly to do the isel comparison
#error See above for explanation of why this and other uses of isel in this file are broken.
				rawint = isel( ((int)(rawint & ~0x80000000)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
					maxfloat16bitsAsInt | 0x80000000, // -65504.0f
					rawint );
			}
			else // positive
			{
				rawint = isel( ((int)(rawint)) - maxfloat16bitsAsInt, // in >= maxfloatbits
					maxfloat16bitsAsInt, // 65504.0f
					rawint );
			}
		}
	}
	*/
	output.bits.sign = inFloat.bits.sign;

	if ( (inFloat.bits.biased_exponent==0) )
	{
		// zero and denorm both map to zero
		output.bits.mantissa = 0;
		output.bits.biased_exponent = 0;
	}
	else if ( inFloat.bits.biased_exponent==0xff )
	{
		if ( !BRANCHLESS )
		{
			if ( (inFloat.bits.mantissa==0) )
			{
				/*
				// infinity
				output.bits.mantissa = 0;
				output.bits.biased_exponent = 31;
				*/

				// infinity maps to maxfloat
				output.bits.mantissa = 0x3ff;
				output.bits.biased_exponent = 0x1e;
			}
			else if ( (inFloat.bits.mantissa!=0) )
			{
				/*
				// NaN
				output.bits.mantissa = 1;
				output.bits.biased_exponent = 31;
				*/

				// NaN maps to zero
				output.bits.mantissa = 0;
				output.bits.biased_exponent = 0;
			}
		}
		else // branchless, only meant for PPC really because it needs the cntlzw op.
		{
			// else if ( inFloat.bits.biased_exponent==0xff ) // either infinity (biased_exponent is 0xff) or NaN.
			{
#ifdef PLATFORM_PPC
#if defined(__SPU__)
				int mantissamask = __builtin_clz( inFloat.bits.mantissa ) - 32; // this is 0 if the input mantissa is zero, and negative otherwise
#else
				int mantissamask = __cntlzw( inFloat.bits.mantissa ) - 32; // this is 0 if the input mantissa is zero, and negative otherwise
#endif
#else
				int mantissamask = inFloat.bits.mantissa ? -1 : 0;
#endif
				output.bits.mantissa = isel( mantissamask, 0x3ff, 0 ); // infinity maps to maxfloat, NaN to zero
				output.bits.biased_exponent = isel( mantissamask, 0x1e, 0 );
				output.bits.sign = inFloat.bits.sign;
			}
		}
	}
	else
	{
		// regular number
		int new_exp = inFloat.bits.biased_exponent-float32bias;
		// it's actually better to branch in these cases on PPC,
		// because the variable bit shift is such a massive penalty
		// that it's worth a branch penalty to avoid it.
		if (new_exp<-24)
		{
			// this maps to 0
			output.bits.mantissa = 0;
			output.bits.biased_exponent = 0;
		}

		if (new_exp<-14)
		{
			// this maps to a denorm
			output.bits.biased_exponent = 0;
			unsigned int exp_val = ( unsigned int )( -14 - new_exp );
			if( exp_val > 0 && exp_val < 11 )
			{
				output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) );
			}
		}
		else if (new_exp>15)
		{
#if 0
			// map this value to infinity
			output.bits.mantissa = 0;
			output.bits.biased_exponent = 31;
#else
			// too big... maps to maxfloat
			output.bits.mantissa = 0x3ff;
			output.bits.biased_exponent = 0x1e;
#endif
		}
		else
		{
			output.bits.biased_exponent = new_exp+15;
			output.bits.mantissa = (inFloat.bits.mantissa >> 13);
		}

	}
	return output.rawWord;
}
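
// Worked example (illustrative): converting 2.5f. The float32 fields are sign = 0,
// biased_exponent = 128, mantissa = 0x200000; new_exp = 128 - 127 = 1, so the half
// gets biased_exponent = 1 + 15 = 16 and mantissa = 0x200000 >> 13 = 0x100,
// giving raw bits 0x4100.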

inline float float16::Convert16bitFloatTo32bits( unsigned short input )
{
	float32bits output;
	const float16bits &inFloat = *((float16bits *)&input);

	if( IsInfinity( inFloat ) )
	{
		return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f );
	}
	if( IsNaN( inFloat ) )
	{
		return 0.0;
	}
	if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 )
	{
		// denorm
		const float half_denorm = (1.0f/16384.0f); // 2^-14
		float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f;
		float sgn = (inFloat.bits.sign)? -1.0f : 1.0f;
		output.rawFloat = sgn*mantissa*half_denorm;
	}
	else
	{
		// regular number
		unsigned mantissa = inFloat.bits.mantissa;
		unsigned biased_exponent = inFloat.bits.biased_exponent;
		unsigned sign = ((unsigned)inFloat.bits.sign) << 31;
		biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23;
		mantissa <<= (23-10);

		*((unsigned *)&output) = ( mantissa | biased_exponent | sign );
	}

	return output.rawFloat;
}
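
// Worked example (illustrative): the smallest half denorm, raw bits 0x0001, decodes as
// (1/1024) * 2^-14 = 2^-24, roughly 5.96e-8; the largest, 0x03ff, decodes as
// (1023/1024) * 2^-14, just under the smallest normal value 2^-14.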



inline Vector48& Vector48::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	x.SetFloat( vOther.x );
	y.SetFloat( vOther.y );
	z.SetFloat( vOther.z );
	return *this;
}


inline Vector48::operator Vector ()
{
	Vector tmp;

	tmp.x = x.GetFloat();
	tmp.y = y.GetFloat();
	tmp.z = z.GetFloat();

	return tmp;
}
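
// Usage sketch (illustrative): Vector48 is just three float16s, so it inherits half-float
// behavior: roughly 3 decimal digits of precision, and magnitudes clamped to +/-65504.
#if 0
	Vector48 packed;
	packed = Vector( 100.25f, -0.125f, 70000.0f );
	Vector restored = packed;	// z comes back as 65504.0f (clamped); x and y round-trip exactly
#endif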

//=========================================================
// Fit a 2D vector in 32 bits
//=========================================================

class Vector2d32
{
public:
	// Construction/destruction:
	Vector2d32(void) {}
	Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); }

	// assignment
	Vector2d32& operator=(const Vector &vOther);
	Vector2d32& operator=(const Vector2D &vOther);

	operator Vector2D ();

	void Init( vec_t ix = 0.f, vec_t iy = 0.f);

	float16_with_assign x;
	float16_with_assign y;
};

inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther)
{
	x.SetFloat( vOther.x );
	y.SetFloat( vOther.y );
	return *this;
}

inline Vector2d32::operator Vector2D ()
{
	Vector2D tmp;

	tmp.x = x.GetFloat();
	tmp.y = y.GetFloat();

	return tmp;
}

inline void Vector2d32::Init( vec_t ix, vec_t iy )
{
	x.SetFloat(ix);
	y.SetFloat(iy);
}




//=========================================================
// FAST SIMD BATCH OPERATIONS
//=========================================================

#ifdef _X360
//// Compressed vector formats: unpack Vector48 and Quaternion48 onto SIMD registers.
// Only available on 360 for now because SSE1 lacks the necessary operations. SSE2 could
// do it but we can't count on that yet.
// If you have many v48's or q48's to stream, please note the functions designed to
// work on them many at a time.

extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];

// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
	// load the three 16-bit floats into the first 48 bits of ret:
	fltx4 ret = XMLoadVector4((const void *)&pVec->x);
	// shuffle the top 64 bits of ret down to the least significant (the z,w) -- 16 of those bits are garbage.
	ret = __vrlimi( ret, ret, 2 | 1, 2 ); // rotate left by 2 words and insert into z,w components
	// now unpack the 16-bit floats into 32-bit floats. This is a hardware op, woohoo!
	ret = __vupkd3d( ret, VPACK_FLOAT16_4 );

	return ret;
}

// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
// FIXME!!! If we need a version of this that runs on 360, there is a work-in-progress version that hasn't been debugged lower in the file.
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
	// A quaternion48 stores the x and y components as 0..65535, which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
	// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
	// w is inferred from 1 - the dot product of the other three components. the top bit of what would otherwise be the 16-bit z is
	// w's sign bit.
	fltx4 q16s = XMLoadVector3((const void *)pVec);
	fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z.
	fltx4 permute = __lvx(&g_SIMD_Quat48_Unpack_Permute0, 0); // load the permute word that shuffles x,y,z into their own words
	bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.

	q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
	q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)

	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);

	/*
	fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.

	// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
	*/
	fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );

	// now, work out what w must be.
	fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
	dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );

	fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
	ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
	if (wneg)
	{
		ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
	}
	else
	{
		ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
	}
	return ret;
}
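
// Note (illustrative, assuming the usual pack-into-the-mantissa trick behind the magic
// constants): placing a 16-bit integer n in the low mantissa bits of 3.0f produces
// exactly 3.0 + n * 2^-22, so one fused multiply-add can remove the 3.0 bias, rescale,
// and recenter onto [-1,1) without any explicit int-to-float conversion instruction.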

// Many-at-a-time unpackers.


/// Unpack eight consecutive Vector48's in memory onto eight SIMD registers.
/// The Vector48 pointer must be 16-byte aligned. Eight Vector48s add up
/// to 48 bytes long. You should maybe think about prefetching.
FORCEINLINE void UnpackEightVector48SIMD( fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4,
	fltx4 &out5, fltx4 &out6, fltx4 &out7, fltx4 &out8,
	Vector48 * RESTRICT pVecs )
{
	AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightVector48SIMD is not 16-byte aligned." );

	// first load the data onto three packed SIMD vectors, which contain eight Vector48s between them.
	// I've named them very explicitly so you can follow the movement of the input data.
	fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
	x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 );  // load reinterpret_cast<fltx4 *>(pVecs) + 0
	z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reinterpret_cast<fltx4 *>(pVecs) + 1
	y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reinterpret_cast<fltx4 *>(pVecs) + 2

	// Now, start unpacking. The __vupkd3d operation can turn 16-bit floats into 32-bit floats in a single op!
	// It converts the contents of the z and w words of the input fltx4, so we need to process a word to do
	// one half, then rotate it to do the other half.
	fltx4 y1z1x2y2 = __vupkd3d( x0y0z0x1y1z1x2y2, VPACK_FLOAT16_4 );
	x0y0z0x1y1z1x2y2 = __vrlimi( x0y0z0x1y1z1x2y2, x0y0z0x1y1z1x2y2, 0xf, 2 ); // actually y1z1x2y2x0y0z0x1 now. For perf it's important that the first param to vrlimi also be the assignee.
	fltx4 x4y4z4x5 = __vupkd3d( z2x3y3z3x4y4z4x5, VPACK_FLOAT16_4 );
	z2x3y3z3x4y4z4x5 = __vrlimi( z2x3y3z3x4y4z4x5, z2x3y3z3x4y4z4x5, 0xf, 2 );
	fltx4 z6x7y7z7 = __vupkd3d( y5z5x6y6z6x7y7z7, VPACK_FLOAT16_4 );
	y5z5x6y6z6x7y7z7 = __vrlimi( y5z5x6y6z6x7y7z7, y5z5x6y6z6x7y7z7, 0xf, 2 );
	fltx4 x0y0z0x1 = __vupkd3d( x0y0z0x1y1z1x2y2, VPACK_FLOAT16_4 );
	fltx4 z2x3y3z3 = __vupkd3d( z2x3y3z3x4y4z4x5, VPACK_FLOAT16_4 );
	fltx4 y5z5x6y6 = __vupkd3d( y5z5x6y6z6x7y7z7, VPACK_FLOAT16_4 );

	// permute to populate the out-registers with part of their vectors:
	out1 = x0y0z0x1; // DONE
	out2 = __vpermwi( y1z1x2y2, VPERMWI_CONST(0, 0, 1, 0) ); // __y1z1__
	out3 = __vpermwi( y1z1x2y2, VPERMWI_CONST(2, 3, 0, 0) ); // x2y2____
	out4 = __vpermwi( z2x3y3z3, VPERMWI_CONST(1, 2, 3, 0) ); // x3y3z3__ // DONE
	out5 = x4y4z4x5; // DONE
	out6 = __vpermwi( y5z5x6y6, VPERMWI_CONST(0, 0, 1, 0) ); // __y5z5__
	out7 = __vpermwi( y5z5x6y6, VPERMWI_CONST(2, 3, 0, 0) ); // x6y6____
	out8 = __vpermwi( z6x7y7z7, VPERMWI_CONST(1, 2, 3, 0) ); // x7y7z7__ // DONE

	// there are four more to finish, which we do with a masked insert
	out2 = __vrlimi( out2, x0y0z0x1, 8, 3 ); // x1y1z1__
	out3 = __vrlimi( out3, z2x3y3z3, 2, 2 ); // x2y2z2__
	out6 = __vrlimi( out6, x4y4z4x5, 8, 3 ); // x5y5z5__
	out7 = __vrlimi( out7, z6x7y7z7, 2, 2 ); // x6y6z6__

	// and we're done!
}
|
|
|
|
|
|
|
|
/// Unpack eight consecutive Quaternion48's in memory onto eight SIMD registers.
|
|
/// The Quaternion48 pointer must be 16-byte aligned. Eight Quaternion48s add up
|
|
/// to 48 bytes long. You should maybe think about prefetching.
|
|
//
|
|
// This could be improved with verticalization, so that the W sqrts happen
|
|
// on two rather than eight vectors, and then transposing. This would make
|
|
// the initial permuatation even more complicated.
|
|
FORCEINLINE void UnpackEightQuaternion48SIMD( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3,
|
|
fltx4 &out4, fltx4 &out5, fltx4 &out6, fltx4 &out7,
|
|
Quaternion48 * RESTRICT pVecs )
|
|
{
|
|
AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightQuaternion48SIMD is not 16-byte aligned." );
|
|
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
|
|
const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
|
|
const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
|
|
const fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z left by one bit.
|
|
|
|
// first load the data onto three packed SIMD vectors, which contain eight Quaternion48s between them.
|
|
// I've named them very explicitly so you can follow the movement of the input data.
|
|
fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
|
|
x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 0
|
|
z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 1
|
|
y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 2
|
|
|
|
	// shove each quat onto its own fltx4, by using the permute operation
	// each halfword argument goes into the bottom 16 bits of the floating
	// point rep of 3.0f, then we use a magic constant to scale them.
	out0 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute0) ); // __x0__y0__z0____
	out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
	// postpone 2 since it straddles two words, we'll get back to it
	out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2
	out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5
	// 5 straddles two words
	out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
	out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
	// now get back to the straddlers, which we make by blending together a prior output and the other source word
	out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____
	out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute5) ); // __x5__y5__z5____
	// the top bit of the z component in each word isn't part of the number; it's
	// a flag indicating whether the eventual w component should be negative.
	// so, we need to move the 0x00008000 bit of the z word onto the top bit
	// of the w word, which is a rotation two bytes right, or 14 bytes left.
	fltx4 wneg[8];

	// grab the sign flag into wneg, then juggle each z halfword left one bit
	// (tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)).
	wneg[0] = __vsldoi( out0, out0, 14 );
	out0 = __vslh( out0, shift );
	wneg[1] = __vsldoi( out1, out1, 14 );
	out1 = __vslh( out1, shift );
	wneg[2] = __vsldoi( out2, out2, 14 );
	out2 = __vslh( out2, shift );
	wneg[3] = __vsldoi( out3, out3, 14 );
	out3 = __vslh( out3, shift );
	wneg[4] = __vsldoi( out4, out4, 14 );
	out4 = __vslh( out4, shift );
	wneg[5] = __vsldoi( out5, out5, 14 );
	out5 = __vslh( out5, shift );
	wneg[6] = __vsldoi( out6, out6, 14 );
	out6 = __vslh( out6, shift );
	wneg[7] = __vsldoi( out7, out7, 14 );
	out7 = __vslh( out7, shift );

	// create a mask that is just the sign bit of the w word.
	fltx4 vAllOneBits = __vspltisw(-1);						// 0xFFFFFFFF in every word
	fltx4 signMask = __vslw( vAllOneBits, vAllOneBits );	// shift each word left by 31: all the sign bits
	signMask = __vrlimi( signMask, Four_Zeros, 14, 0 );		// zero out x,y,z words
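
	// (Worked through, for reference: __vspltisw(-1) puts 0xFFFFFFFF in every
	// word; __vslw then shifts each word left by its own bottom five bits,
	// i.e. 31, leaving exactly 0x80000000 per word; the __vrlimi writes zeros
	// over the x, y, z words (mask 14 = binary 1110), so the final mask is
	// { 0, 0, 0, 0x80000000 }.)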

	// this macro defines the operations that will be performed on each of the eight words:
	// * scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	// * take the xyz dot product to get 1 - w^2
	// * subtract from one to get w^2
	// * square root to get w
	// * OR in the wneg sign mask to set w's sign.
	// though the macro makes it look like these are being done in serial,
	// in fact the compiler will reorder them to minimize stalls.
	fltx4 ONE = Four_Ones;
	fltx4 dotxyz[8];
	fltx4 ww[8];
	// the steps, spelled out for quat 0:
	// out0 = __vmaddfp( out0, vUpkMul, vUpkAdd );
	// dotxyz[0] = Dot3SIMD( out0, out0 );
	// clamp dotxyz if it's more than 1.0
	// all components are 1 - dotxyz
	// clear all but w's sign bit in wneg
	// all components are sqrt(1-dotxyz)
	// toggle w's sign where necessary
	// insert one element from the ww vector into the w component of ret
#define COMPUTE( target, number ) \
	target ## number = __vmaddfp( target ## number, vUpkMul, vUpkAdd ); \
	dotxyz[number] = Dot3SIMD( target ## number, target ## number ); \
	dotxyz[number] = __vminfp( dotxyz[number], ONE ); \
	ww[number] = SubSIMD( ONE, dotxyz[number] ); \
	wneg[number] = AndSIMD( wneg[number], signMask ); \
	ww[number] = SqrtSIMD( ww[number] ); \
	ww[number] = OrSIMD( ww[number], wneg[number] ); \
	target ## number = __vrlimi( target ## number, ww[number], 1, 0 );
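
	// (Note on the macro's final __vrlimi: mask 1 = binary 0001 selects only
	// the w word, so ww[number]'s w lane is inserted into target##number
	// while x, y and z are left alone.)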

	COMPUTE(out, 0);
	COMPUTE(out, 1);
	COMPUTE(out, 2);
	COMPUTE(out, 3);
	COMPUTE(out, 4);
	COMPUTE(out, 5);
	COMPUTE(out, 6);
	COMPUTE(out, 7);

#undef COMPUTE
}
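
// Example usage (illustrative sketch; "anim" is a hypothetical buffer,
// which must be 16-byte aligned in practice):
//
//   Quaternion48 anim[8];	// 48 bytes of packed rotations
//   fltx4 q0, q1, q2, q3, q4, q5, q6, q7;
//   UnpackEightQuaternion48SIMD( q0, q1, q2, q3, q4, q5, q6, q7, anim );
//   // each qN now holds one unpacked quaternion as { x, y, z, w }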

#elif defined(_PS3)

// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
	// PS3 libs just give us this
	Vectormath::Aos::Vector3 ret;
	Vectormath::Aos::loadHalfFloats( ret, reinterpret_cast<const uint16_t *>(&pVec->x) );
	return ret.get128();
}
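
// Example usage (illustrative sketch; "packed" is hypothetical):
//   Vector48 packed = ...;
//   fltx4 pos = UnpackVector48SIMD( &packed );	// x,y,z valid; w is garbage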

extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[];	//< Shuffles the z component of the quat48 left by one bit.
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];

// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
	// A Quaternion48 stores the x and y components as 0..65535, which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
	// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
	// w is inferred from 1 minus the dot product of the other three components; the top bit of
	// what would otherwise be the 16-bit z is w's sign bit.
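
	// (Scalar reference for the decode described above -- illustrative only:
	//   float x = ( (int)pVec->x - 32768 ) / 32768.5f;
	//   float y = ( (int)pVec->y - 32768 ) / 32768.5f;
	//   float z = ( (int)pVec->z - 16384 ) / 16384.5f;
	//   float w = sqrtf( fpmax( 0.0f, 1.0f - x*x - y*y - z*z ) );
	//   if ( pVec->wneg ) w = -w; )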
	fltx4 q16s = LoadUnaligned3SIMD( (const void *)pVec );
#if defined(__SPU__)
	vec_ushort8 shift = vec_ld( 0, (short unsigned int *)g_SIMD_Quat48_Unpack_Shift );	// load the aligned shift mask that we use to shuffle z.
	vec_uchar16 permute = vec_ld( 0, (unsigned char *)g_SIMD_Quat48_Unpack_Permute0 );	// load the permute word that shuffles x,y,z into their own words
#else
	vec_ushort8 shift = vec_ld( 0, g_SIMD_Quat48_Unpack_Shift );	// load the aligned shift mask that we use to shuffle z.
	vec_uchar16 permute = vec_ld( 0, g_SIMD_Quat48_Unpack_Permute0 );	// load the permute word that shuffles x,y,z into their own words
#endif
	bool wneg = pVec->wneg;	// loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.

	q16s = vec_perm( q16s, Four_Threes, permute );	// permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
#if defined(__SPU__)
	q16s = (fltx4) vec_sl( (vec_ushort8) q16s, shift );	// shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
#else
	q16s = (fltx4) vec_vslh( (vec_ushort8) q16s, shift );	// shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
#endif

	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD( g_SIMD_Quat48_Unpack_Magic_Constants );	// { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD( g_SIMD_Quat48_Unpack_Magic_Constants );

	fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );

	// now, work out what w must be.
	fltx4 dotxyz = Dot3SIMD( ret, ret );	// all components are dot product of ret w/ self.
	dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );

	fltx4 ww = SubSIMD( Four_Ones, dotxyz );	// all components are 1 - dotxyz
	ww = SqrtSIMD( ww );	// all components are sqrt(1-dotxyz)

	// insert one element from the ww vector into the w component of ret
	ret = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), wneg ? NegSIMD( ww ) : ww, ret );

	return ret;
}
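
// Example usage (illustrative sketch; "packed" is hypothetical):
//   Quaternion48 packed = ...;
//   fltx4 q = UnpackQuaternion48SIMD( &packed );	// lanes are { x, y, z, w }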

#endif


#if defined( _X360 )
#pragma bitfield_order( pop )
#elif defined( _PS3 )
#pragma ms_struct off
#pragma reverse_bitfields off
#endif

#endif // COMPRESSED_VECTOR_H