//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============// // // Purpose: // // $NoKeywords: $ // //=============================================================================// #ifndef COMPRESSED_VECTOR_H #define COMPRESSED_VECTOR_H #ifdef _WIN32 #pragma once #endif #include #include // For vec_t, put this somewhere else? #include "basetypes.h" // For rand(). We really need a library! #include #include "tier0/dbg.h" #include "mathlib/vector.h" #include "mathlib/mathlib.h" #include "mathlib/ssemath.h" #ifdef _PS3 #if defined(__SPU__) #include #include #endif #include #endif #if defined( _X360 ) #pragma bitfield_order( push, lsb_to_msb ) #elif defined( _PS3 ) #pragma ms_struct on #pragma reverse_bitfields on #endif #ifdef OSX #pragma GCC diagnostic ignored "-Wtautological-compare" #endif class Quaternion48; FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec ); //========================================================= // fit a 3D vector into 32 bits //========================================================= class Vector32 { public: // Construction/destruction: Vector32(void); Vector32(vec_t X, vec_t Y, vec_t Z); // assignment Vector32& operator=(const Vector &vOther); operator Vector (); private: unsigned short x:10; unsigned short y:10; unsigned short z:10; unsigned short exp:2; }; inline Vector32& Vector32::operator=(const Vector &vOther) { CHECK_VALID(vOther); static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f }; float fmax = MAX( fabs( vOther.x ), fabs( vOther.y ) ); fmax = fpmax( fmax, fabs( vOther.z ) ); for (exp = 0; exp < 3; exp++) { if (fmax < expScale[exp]) break; } Assert( fmax < expScale[exp] ); float fexp = 512.0f / expScale[exp]; x = clamp( (int)(vOther.x * fexp) + 512, 0, 1023 ); y = clamp( (int)(vOther.y * fexp) + 512, 0, 1023 ); z = clamp( (int)(vOther.z * fexp) + 512, 0, 1023 ); return *this; } inline Vector32::operator Vector () { Vector tmp; static float expScale[4] = { 4.0f, 16.0f, 
32.f, 64.f }; float fexp = expScale[exp] / 512.0f; tmp.x = (((int)x) - 512) * fexp; tmp.y = (((int)y) - 512) * fexp; tmp.z = (((int)z) - 512) * fexp; return tmp; } //========================================================= // Fit a unit vector into 32 bits //========================================================= class Normal32 { public: // Construction/destruction: Normal32(void); Normal32(vec_t X, vec_t Y, vec_t Z); // assignment Normal32& operator=(const Vector &vOther); operator Vector (); private: unsigned short x:15; unsigned short y:15; unsigned short zneg:1; }; inline Normal32& Normal32::operator=(const Vector &vOther) { CHECK_VALID(vOther); x = clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 ); y = clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 ); zneg = (vOther.z < 0); //x = vOther.x; //y = vOther.y; //z = vOther.z; return *this; } inline Normal32::operator Vector () { Vector tmp; tmp.x = ((int)x - 16384) * (1 / 16384.0); tmp.y = ((int)y - 16384) * (1 / 16384.0); tmp.z = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y ); if (zneg) tmp.z = -tmp.z; return tmp; } //========================================================= // 64 bit Quaternion //========================================================= class Quaternion64 { public: // Construction/destruction: Quaternion64(void); Quaternion64(vec_t X, vec_t Y, vec_t Z); // assignment // Quaternion& operator=(const Quaternion64 &vOther); Quaternion64& operator=(const Quaternion &vOther); operator Quaternion () const; inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary private: Quaternion64( uint64 xx, uint64 yy, uint64 zz, uint64 ww ) : x(xx), y(yy), z(zz), wneg(ww) {}; // stricly for static construction uint64 x:21; uint64 y:21; uint64 z:21; uint64 wneg:1; }; inline Quaternion64::operator Quaternion () const { #if defined(__SPU__) fltx4 tmpV; QuaternionAligned tmpQ; tmpV = LoadUnalignedSIMD(); StoreAlignedSIMD( (float *)&tmpQ, tmpV ); return 
tmpQ; #else Quaternion tmp; // shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0 tmp.x = ((int)x - 1048576) * (1 / 1048576.5f); tmp.y = ((int)y - 1048576) * (1 / 1048576.5f); tmp.z = ((int)z - 1048576) * (1 / 1048576.5f); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; return tmp; #endif } inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther) { CHECK_VALID(vOther); x = clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 ); y = clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 ); z = clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 ); wneg = (vOther.w < 0); return *this; } inline fltx4 Quaternion64::LoadUnalignedSIMD() const { #ifdef _PS3 // assume little endian packing #if 1 const static u32x4 xmask = { 0x00000000, 0x001fffff, 0, 0 }; // bottom 21 bits ( 0 .. 20 ) true const static u32x4 ymask = { 0x000003ff, 0xffe00000, 0, 0 }; // bits 21 .. 41 true const static u32x4 zmask = { 0x7ffffC00, 0x00000000, 0, 0 }; // bits 42 .. 62 true const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; // only bit 63 is true const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ; // fish x, y, and z and put them into the the first words of their respective vec registers // the end type for these registers must be signed for the following subtract, BUT! // the shift has to happen as an UNSIGNED type so that it doesn't sign-extend. // the code as present assumes that the fused multiply-add operation has an intermediate // precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an // int op because of course 21 bits is right at the limit of floating point precision. 
i32x4 ix = (i32x4) (ShiftLeftByBits<32>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word i32x4 iy = (i32x4) (ShiftLeftByBits<11>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word i32x4 iz = (i32x4) (ShiftRightByBits<10>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word /* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product) i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word */ i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word. // convert each of the vectors from int to float. (because of the way the pipeline is organized, // it's as fast to do this as it would have been to do by combining them into one register above // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float- // by-float conversion above, but the better stability of the vector quaternion ops makes that okay. 
const fltx4 ONE = LoadOneSIMD(); #if defined(__SPU__) fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE); fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE); fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE); #else fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE); fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE); fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE); #endif // compute the dot product fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here fltx4 result = Compress4SIMD( fx, fy, fz, fw ); // and for the coup de grace, set the sign bit of fw appropriately result = OrSIMD( result, (fltx4)wsignbit ); return result; #else // original version /* union Qmask { struct qq { Quaternion64 mask; uint64 padding; } asQ ; u32x4 asVec; Qmask( const Quaternion64 &m ) : mask(m) {} }; */ const static u32x4 xmask = { 0xfffff800, 0x00000000, 0, 0 }; // top 21 bits ( 0 .. 20 ) true const static u32x4 ymask = { 0x000007ff, 0xffc00000, 0, 0 }; // bits 21 .. 41 true const static u32x4 zmask = { 0x00000000, 0x003ffffe, 0, 0 }; // bits 42 .. 62 true const static u32x4 wmask = { 0x00000000, 0x00000001, 0, 0 }; // only bit 63 is true const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ; // fish x, y, and z and put them into the the first words of their respective vec registers // the end type for these registers must be signed for the following subtract, BUT! // the shift has to happen as an UNSIGNED type so that it doesn't sign-extend. // the code as present assumes that the fused multiply-add operation has an intermediate // precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an // int op because of course 21 bits is right at the limit of floating point precision. 
i32x4 ix = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word i32x4 iy = (i32x4) (ShiftLeftByBits<10>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word i32x4 iz = (i32x4) (ShiftLeftByBits<31>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word /* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product) i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word */ i32x4 wsignbit = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word. // convert each of the vectors from int to float. (because of the way the pipeline is organized, // it's as fast to do this as it would have been to do by combining them into one register above // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float- // by-float conversion above, but the better stability of the vector quaternion ops makes that okay. 
const fltx4 ONE = LoadOneSIMD(); #if defined(__SPU__) fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE); fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE); fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE); #else fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE); fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE); fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE); #endif // compute the dot product fltx4 fw = MsubSIMD( ONE, fz, fz ); // 1 - z*z fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here fltx4 result = Compress4SIMD( fx, fy, fz, fw ); // and for the coup de grace, set the sign bit of fw appropriately result = OrSIMD( result, (fltx4)wsignbit ); return result; #endif #elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once) struct { float x; float y; float z; float w; } tmp; tmp.x = ((int)x - 1048576) * (1 / 1048576.5f); tmp.y = ((int)y - 1048576) * (1 / 1048576.5f); tmp.z = ((int)z - 1048576) * (1 / 1048576.5f); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w }; return ret; #else // naive implementation (which ends up being faster than the explicit c imp above) const QuaternionAligned q(Quaternion(*this)) ; return LoadAlignedSIMD( &q ); #endif } //========================================================= // 48 bit Quaternion //========================================================= class Quaternion48 { public: // Construction/destruction: Quaternion48(void); Quaternion48(vec_t X, vec_t Y, vec_t Z); // assignment // Quaternion& operator=(const Quaternion48 &vOther); Quaternion48& operator=(const Quaternion &vOther); operator Quaternion () const; inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary //private: unsigned short 
x:16; unsigned short y:16; unsigned short z:15; unsigned short wneg:1; }; inline Quaternion48::operator Quaternion () const { #if defined(__SPU__) fltx4 tmpV; QuaternionAligned tmpQ; tmpV = LoadUnalignedSIMD(); StoreAlignedSIMD( (float *)&tmpQ, tmpV ); tmpV = UnpackQuaternion48SIMD( this ); StoreAlignedSIMD( (float *)&tmpQ, tmpV ); return tmpQ; #else Quaternion tmp; tmp.x = ((int)x - 32768) * (1 / 32768.5); tmp.y = ((int)y - 32768) * (1 / 32768.5); tmp.z = ((int)z - 16384) * (1 / 16384.5); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; return tmp; #endif } inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther) { CHECK_VALID(vOther); x = clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 ); y = clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 ); z = clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 ); wneg = (vOther.w < 0); return *this; } inline fltx4 Quaternion48::LoadUnalignedSIMD() const { #ifdef _PS3 // assume little endian packing const static u32x4 xmask = { 0x00000000, 0xffff0000, 0, 0 }; const static u32x4 ymask = { 0x0000ffff, 0x00000000, 0, 0 }; const static u32x4 zmask = { 0x7fff0000, 0x00000000, 0, 0 }; const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ; // fish x, y, and z and put them into the the first words of their respective vec registers i32x4 ix = (i32x4) (ShiftLeftByBits<16>(vec_and( qbits, xmask ))); i32x4 iy = (i32x4) ((vec_and( qbits, ymask ))); i32x4 iz = (i32x4) (ShiftRightByBits<16>(vec_and( qbits, zmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word. i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // convert each of the vectors from int to float. (because of the way the pipeline is organized, // it's as fast to do this as it would have been to do by combining them into one register above // and converting all at once.) 
Also, we can do the fixed point conversion in the vcfsx op. It'll // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float- // by-float conversion above, but the better stability of the vector quaternion ops makes that okay. const fltx4 ONE = LoadOneSIMD(); #if defined(__SPU__) fltx4 fx = SubSIMD( vec_ctf( ix, 15 ), ONE); fltx4 fy = SubSIMD( vec_ctf( iy, 15 ), ONE); fltx4 fz = SubSIMD( vec_ctf( iz, 14 ), ONE); #else fltx4 fx = SubSIMD( vec_vcfsx( ix, 15 ), ONE); fltx4 fy = SubSIMD( vec_vcfsx( iy, 15 ), ONE); fltx4 fz = SubSIMD( vec_vcfsx( iz, 14 ), ONE); #endif // compute the dot product fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here fltx4 result = Compress4SIMD( fx, fy, fz, fw ); // and for the coup de grace, set the sign bit of fw appropriately result = OrSIMD( result, (fltx4)wsignbit ); return result; #elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once) struct { float x; float y; float z; float w; } tmp; tmp.x = ((int)x - 32768) * (1 / 32768.5); tmp.y = ((int)y - 32768) * (1 / 32768.5); tmp.z = ((int)z - 16384) * (1 / 16384.5); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w }; return ret; #else // naive implementation (which ends up being faster than the explicit c imp above) const QuaternionAligned q(Quaternion(*this)) ; return LoadAlignedSIMD( &q ); #endif } //========================================================= // 48 bit sorted Quaternion //========================================================= class Quaternion48S { public: // Construction/destruction: Quaternion48S(void); Quaternion48S(vec_t X, vec_t Y, vec_t Z); // assignment // Quaternion& 
operator=(const Quaternion48 &vOther); Quaternion48S& operator=(const Quaternion &vOther); operator Quaternion () const; operator fltx4 () const RESTRICT ; //private: // shift the quaternion so that the largest value is recreated by the sqrt() // abcd maps modulo into quaternion xyzw starting at "offset" // "offset" is split into two 1 bit fields so that the data packs into 6 bytes (3 shorts) unsigned short a:15; // first of the 3 consecutive smallest quaternion elements unsigned short offsetH:1; // high bit of "offset" unsigned short b:15; unsigned short offsetL:1; // low bit of "offset" unsigned short c:15; unsigned short dneg:1; // sign of the largest quaternion element }; #define SCALE48S 23168.0f // needs to fit 2*sqrt(0.5) into 15 bits. #define SHIFT48S 16384 // half of 2^15 bits. inline Quaternion48S::operator Quaternion () const { #if defined(__SPU__) fltx4 tmpV; QuaternionAligned tmpQ; tmpV = *this; StoreAlignedSIMD( (float *)&tmpQ, tmpV ); return tmpQ; #else Quaternion tmp; COMPILE_TIME_ASSERT( sizeof( Quaternion48S ) == 6 ); float *ptmp = &tmp.x; int ia = offsetL + offsetH * 2; int ib = ( ia + 1 ) % 4; int ic = ( ia + 2 ) % 4; int id = ( ia + 3 ) % 4; ptmp[ia] = ( (int)a - SHIFT48S ) * ( 1.0f / SCALE48S ); ptmp[ib] = ( (int)b - SHIFT48S ) * ( 1.0f / SCALE48S ); ptmp[ic] = ( (int)c - SHIFT48S ) * ( 1.0f / SCALE48S ); ptmp[id] = sqrt( 1.0f - ptmp[ia] * ptmp[ia] - ptmp[ib] * ptmp[ib] - ptmp[ic] * ptmp[ic] ); if (dneg) ptmp[id] = -ptmp[id]; return tmp; #endif } inline Quaternion48S& Quaternion48S::operator=(const Quaternion &vOther) { CHECK_VALID(vOther); const float *ptmp = &vOther.x; // find largest field, make sure that one is recreated by the sqrt to minimize error int i = 0; if ( fabs( ptmp[i] ) < fabs( ptmp[1] ) ) { i = 1; } if ( fabs( ptmp[i] ) < fabs( ptmp[2] ) ) { i = 2; } if ( fabs( ptmp[i] ) < fabs( ptmp[3] ) ) { i = 3; } int offset = ( i + 1 ) % 4; // make "a" so that "d" is the largest element offsetL = offset & 1; offsetH = offset > 1; a = 
clamp( (int)(ptmp[ offset ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) ); b = clamp( (int)(ptmp[ ( offset + 1 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) ); c = clamp( (int)(ptmp[ ( offset + 2 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) ); dneg = ( ptmp[ ( offset + 3 ) % 4 ] < 0.0f ); return *this; } // decode onto a SIMD register inline Quaternion48S::operator fltx4 () const RESTRICT { AssertMsg1( (((uintp) this) & 1) == 0, "Quaternion48S is unaligned at %p\n", this ); #ifdef PLATFORM_PPC // this algorithm depends heavily on the Altivec permute op, for which there is no analogue in SSE. This function should not be used on PC. // define some vector constants. the shift-scale will be done as a fused multiply-add, // with the scale already distributed onto the shift (the part subtracted) const static fltx4 vrSCALE48S = { (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S) }; const static fltx4 vrSHIFT48S = { ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S }; // start by hoisting the q48 onto a SIMD word. u32x4 source = (u32x4) LoadUnalignedSIMD( this ); const u32x4 ZERO = (u32x4) LoadZeroSIMD(); // also hoist the offset into an int word. Hopefully this executes in parallel with the vector ops thanks to SUPERSCALAR! const unsigned int offset = offsetL | ( offsetH << 1 ); const bi32x4 vDMask = (bi32x4) LoadAlignedSIMD( g_SIMD_ComponentMask[(offset+3)%4] ); // lets vsel poke D into the right word #if 0 // This code can be used to deal with a situation where LoadUnalignedSIMD() fails to properly load // vectors starting on halfword boundaries (rather than 32-bit aligned). Because this is a 48-bit // structure, sometimes it'll only be 16-bit aligned. I expected that lvlx would always load from // a word boundary, requiring me to shift misaligned vectors over by 16 bits, but evidently, // lvlx actually works even on halfword boundaries. Who knew! 
// Anyway, this code is still here in case the problem crops up, as a hint to both cause and solution. if ( ((unsigned int) this) & 2 ) { source = ShiftLeftByBits<16>(source); } #endif // mask out the offset and dneg bits. Because of the packing #pragmas, the one-bit fields are actually at the MSB // of the halfwords, not the LSB as you might expect. ALIGN16 const static uint32 vMaskTopBits[4] = { 0x80008000, 0x80000000, 0, 0 }; // just the LSB of each the first three halfwords u32x4 abc = AndNotSIMD( (u32x4) LoadAlignedSIMD(vMaskTopBits), source ); // now this is just the A, B, C halfwords. // Next, unpack abc as unsigned numbers. We can do this with a permute op. In fact, we can exploit // the integer pipe and load the offset while we're loading the SIMD numbers, then use the integer offset to select // the permute, which will therefore also perform the rotate that maps abc to their rightful destinations. // the masks below are for the vperm instruction, which is a byte-by-byte mapping from source to destination. // it's assumed that the FIRST parameter to vperm will be ZERO, and the second the data. (that makes the masks a little clearer) // in the simplest case -- imagine each letter below represents one byte; the source vector looks like // AABB CCxx xxxx xxxx. We're going to permute it onto the work register like // 00AA 00BB 00CC 0000 ALIGN16 const static uint32 vPermutations[4][4] = { // offset = 0 means a->x, b->y, c->z, d->w { 0x00001011, 0x00001213, 0x00001415, 0x00000000 }, // offset = 1 means a->y, b->z, c->w, d->a { 0x00000000, 0x00001011, 0x00001213, 0x00001415 }, { 0x00001415, 0x00000000, 0x00001011, 0x00001213 }, { 0x00001213, 0x00001415, 0x00000000, 0x00001011 } }; // compute two permutations on the input data: one where the zero-word is always in the w component, // which lets us do a 3-way rather than 4-way dot product; and another where the zero-word corresponds to // wherever D is supposed to go. 
// Even though this seems redundant, the duplicated work ends up fitting into the pipeline bubbles, // and the savings between a 4-way and 3-way dot seem to be about 3ns. u32x4 abcfordot = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[0] ) ); abc = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[offset] ) ); // turn each of the ints into floats. Because we masked out the one-bit field at the top, // We can think of this as a conversion from fixed-point where there's no fractional bit. // This is done in line with the shift-scale operation, which is itself fused. // we do this twice: once for the vector with the guaranteed zero w-word, and // once for the vector rotated by the offset. fltx4 vfDest = AndNotSIMD( vDMask, MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abc, 0 ), vrSCALE48S, vrSHIFT48S ) ); fltx4 vfDestForDot = MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abcfordot, 0 ), vrSCALE48S, vrSHIFT48S ) ; // compute magnitude of the vector we know to have a 0 in the w word. const fltx4 vDot = Dot3SIMD( vfDestForDot, vfDestForDot ); // recover the "D" word const fltx4 vD = SqrtSIMD( SubSIMD( LoadOneSIMD(), vDot ) ); // mask D into the converted-and-offset vector, then return. return MaskedAssign( vDMask, dneg ? 
NegSIMD(vD) : vD, vfDest ); #else AssertMsg( false, "Quaternion48S::operator fltx4 is slow on this platform and should not be used.\n" ); QuaternionAligned q( (Quaternion) *this ); return LoadAlignedSIMD( &q ); #endif } //========================================================= // 32 bit Quaternion //========================================================= class Quaternion32 { public: // Construction/destruction: Quaternion32(void); Quaternion32(vec_t X, vec_t Y, vec_t Z); // assignment // Quaternion& operator=(const Quaternion48 &vOther); Quaternion32& operator=(const Quaternion &vOther); operator Quaternion (); inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary private: unsigned int x:11; unsigned int y:10; unsigned int z:10; unsigned int wneg:1; }; inline Quaternion32::operator Quaternion () { #if defined(__SPU__) fltx4 tmpV; QuaternionAligned tmpQ; tmpV = LoadUnalignedSIMD(); StoreAlignedSIMD( (float *)&tmpQ, tmpV ); return tmpQ; #else Quaternion tmp; tmp.x = ((int)x - 1024) * (1 / 1024.0); tmp.y = ((int)y - 512) * (1 / 512.0); tmp.z = ((int)z - 512) * (1 / 512.0); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; return tmp; #endif } inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther) { CHECK_VALID(vOther); x = clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 ); y = clamp( (int)(vOther.y * 512) + 512, 0, 1023 ); z = clamp( (int)(vOther.z * 512) + 512, 0, 1023 ); wneg = (vOther.w < 0); return *this; } inline fltx4 Quaternion32::LoadUnalignedSIMD() const { #ifdef _PS3 // assume little endian packing const static u32x4 xmask = { 0x000007ff, 0, 0, 0 }; const static u32x4 ymask = { 0x001ff800, 0, 0, 0 }; const static u32x4 zmask = { 0x7fe00000, 0, 0, 0 }; const static u32x4 wmask = { 0x80000000, 0, 0, 0 }; const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ; // fish x, y, and z and put them into the the first words of their 
respective vec registers i32x4 ix = (i32x4) ((vec_and( qbits, xmask ))); i32x4 iy = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, ymask ))); i32x4 iz = (i32x4) (ShiftRightByBits<21>(vec_and( qbits, zmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word. i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // convert each of the vectors from int to float. (because of the way the pipeline is organized, // it's as fast to do this as it would have been to do by combining them into one register above // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float- // by-float conversion above, but the better stability of the vector quaternion ops makes that okay. const fltx4 ONE = LoadOneSIMD(); #if defined(__SPU__) fltx4 fx = SubSIMD( vec_ctf( ix, 10 ), ONE); fltx4 fy = SubSIMD( vec_ctf( iy, 9 ), ONE); fltx4 fz = SubSIMD( vec_ctf( iz, 9 ), ONE); #else fltx4 fx = SubSIMD( vec_vcfsx( ix, 10 ), ONE); fltx4 fy = SubSIMD( vec_vcfsx( iy, 10 ), ONE); fltx4 fz = SubSIMD( vec_vcfsx( iz, 9 ), ONE); #endif // compute the dot product fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here fltx4 result = Compress4SIMD( fx, fy, fz, fw ); // and for the coup de grace, set the sign bit of fw appropriately result = OrSIMD( result, (fltx4)wsignbit ); return result; #else struct { float x; float y; float z; float w; } tmp; tmp.x = ((int)x - 1024) * (1 / 1024.0); tmp.y = ((int)y - 512) * (1 / 512.0); tmp.z = ((int)z - 512) * (1 / 512.0); tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z ); if (wneg) tmp.w = -tmp.w; fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w }; return ret; #endif } 
//========================================================= // 16 bit float //========================================================= const int float32bias = 127; const int float16bias = 15; const float maxfloat16bits = 65504.0f; class float16 { public: // float16() {}; //float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); } float16& operator=(const unsigned short &other) { m_storage.rawWord = other; return *this; }; void Init() { m_storage.rawWord = 0; } // float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; } // float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; } // operator unsigned short () { return m_storage.rawWord; } // operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); } unsigned short GetBits() const { return m_storage.rawWord; } float GetFloat() const { return Convert16bitFloatTo32bits( m_storage.rawWord ); } void SetFloat( float in ) { m_storage.rawWord = ConvertFloatTo16bits( in ); } bool IsInfinity() const { return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0; } bool IsNaN() const { return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0; } bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; } bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; } // bool operator< (const float other) const { return GetFloat() < other; } // bool operator> (const float other) const { return GetFloat() > other; } template< bool BRANCHLESS > // allows you to force branchy/branchless implementation regardless of the current platform static unsigned short ConvertFloatTo16bitsNonDefault( float input ); static float Convert16bitFloatTo32bits( unsigned short input ); // a special case useful for the pixel writer: take four input float values, which are already in memory (not on registers), // convert 
them all at once and write them sequentially through the output pointer. static void ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut, const float *a, const float *b, const float *c, const float *d ); // unfortunately, function templates can't have default template parameters in 2010-era C++ inline static unsigned short ConvertFloatTo16bits( float input ) { // default to branchless on ppc and branchy on x86 #ifdef PLATFORM_PPC return ConvertFloatTo16bitsNonDefault(input); #else return ConvertFloatTo16bitsNonDefault(input); #endif } protected: union float32bits { float rawFloat; uint32 rawAsInt; struct { unsigned int mantissa : 23; unsigned int biased_exponent : 8; unsigned int sign : 1; } bits; }; union float16bits { unsigned short rawWord; struct { unsigned short mantissa : 10; unsigned short biased_exponent : 5; unsigned short sign : 1; } bits; }; static bool IsNaN( float16bits in ) { return in.bits.biased_exponent == 31 && in.bits.mantissa != 0; } static bool IsInfinity( float16bits in ) { return in.bits.biased_exponent == 31 && in.bits.mantissa == 0; } // 0x0001 - 0x03ff float16bits m_storage; }; class float16_with_assign : public float16 { public: float16_with_assign() {} float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); } float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; } float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; } // operator unsigned short () const { return m_storage.rawWord; } operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); } }; //========================================================= // Fit a 3D vector in 48 bits //========================================================= class Vector48 { public: // Construction/destruction: Vector48(void) {} Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); } // assignment 
Vector48& operator=(const Vector &vOther); operator Vector (); const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); } float16 x; float16 y; float16 z; }; // The uses of isel below are malformed because the first expression is unsigned and thus always >= 0, // so this whole expression maps to a simple assignment. This was found through a noisy clang // warning. I am preprocessing this out until it is needed. #if 0 inline void float16::ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut, const float *a, const float *b, const float *c, const float *d ) { COMPILE_TIME_ASSERT( sizeof(float) == 4 ); // being meant for use on the PPC, this is tuned for that. // it is mostly branchless, except for the large outer for loop, // since there's enough instructions inside that unrolling is // a bad idea. This fucntion is four-at-once to simplify SIMDifying in the // future should a convenient SIMD way to decimate emerge // Also, because this is only used for the special case of converting // float arrays into float16 GPU textures, this turns denorms into zeroes // and infinities into MAXFLTs, since the shader can't deal with nonfinite // numbers anyway. // alias the input floats onto a union giving their mantissa etc const float32bits * const inFloat[4] = { reinterpret_cast(a), reinterpret_cast(b), reinterpret_cast(c), reinterpret_cast(d) }; const static unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f const static unsigned int SIGNBIT = 0x80000000; for ( int i = 0 ; i < 4 ; ++i ) // performs better not unrolled (less stack spilling) { unsigned int onGPR = inFloat[i]->rawAsInt; // make a mask for each word; will be all 1's if the float is // negative, all 0s if it is positive. Can do this just by // using arithmetic shift to smear out the sign bit. int isNegative = ((int) onGPR) >> 31; // clamp to be within -maxfloat16bits, maxfloat16bits // can't just use isel because IEEE754 floats are sign-magnitude, not two's comp. 
However, // positive IEEE754s can be compared as if they were ints. So, we need to do a little extra // work to test the negative case efficiently. // clamp to -maxfloat16 #error See above for explanation of why this and other uses of isel in this file are broken. int clampedNeg = isel( ((int)(onGPR & ~SIGNBIT)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat maxfloat16bitsAsInt | SIGNBIT, // -65504.0f onGPR ); // clamp to +maxfloat16 int clampedPos = isel( ((int)(onGPR)) - maxfloat16bitsAsInt, // in >= maxfloatbits maxfloat16bitsAsInt , // -65504.0f onGPR ); // take advantage of PPC's andc operator to effectively do a masked-move onGPR = ( clampedNeg & isNegative ) | ( clampedPos & ~isNegative ); // fish out the input exponent and mantis fields directly (using the union induces an LHS) int inExponent = (onGPR & 0x7f800000) >> 23; unsigned int inMantissa = (onGPR & 0x007FFFFF); int exponent = inExponent - 127 + 15; // rebias the exponent unsigned int mantissa = isel( exponent, inMantissa >> 13, (unsigned) 0 ); // squash the mantissa to zero if the number is too small to represent (no denorms) float16bits output; // saturate the mantissa if rebiased exponent >= 31 (too big to store) output.bits.mantissa = isel( exponent - 31, (unsigned) 0x3ff, mantissa ); // clamp the exponent to 0..30 output.bits.biased_exponent = isel( exponent, isel( exponent - 31, 30, exponent ), 0 ); output.bits.sign = isNegative; // this doesn't lhs, but instead issues the insrdi op to a word on GPR pOut[i].m_storage.rawWord = output.rawWord; } } #endif #ifdef _X360 #define __cntlzw _CountLeadingZeros #endif template< bool BRANCHLESS > inline unsigned short float16::ConvertFloatTo16bitsNonDefault( float input ) { float16bits output; float32bits inFloat; //if ( !BRANCHLESS ) // x86 code { if ( input > maxfloat16bits ) input = maxfloat16bits; else if ( input < -maxfloat16bits ) input = -maxfloat16bits; inFloat.rawFloat = input; } /* // The use of isel is incorrect because the 
first expression is unsigned and therefore always passes // the test. else // PPC code { // force the float onto the stack and then a GPR so we eat the LHS only once. // you can't just write to one union member and then read back another; // the compiler is inconsistent about supporting that kind of type-punning. // (ie, it will work in one file, but not another.) memcpy(&inFloat.rawFloat, &input, sizeof(inFloat.rawFloat)); // inFloat.rawFloat = input; // clamp using the GPR { const unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f // clamp to be <= maxfloat16bits uint32 &rawint = inFloat.rawAsInt; // <--- lhs if ( rawint & 0x80000000 ) // negative { // because floats are sign-magnitude, not two's comp, need to // flip the int positive briefly to do the isel comparison #error See above for explanation of why this and other uses of isel in this file are broken. rawint = isel( ((int)(rawint & ~0x80000000)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat maxfloat16bitsAsInt | 0x80000000, // -65504.0f rawint ); } else // positive { rawint = isel( ((int)(rawint)) - maxfloat16bitsAsInt, // in >= maxfloatbits maxfloat16bitsAsInt , // -65504.0f rawint ); } } } */ output.bits.sign = inFloat.bits.sign; if ( (inFloat.bits.biased_exponent==0) ) { // zero and denorm both map to zero output.bits.mantissa = 0; output.bits.biased_exponent = 0; } else if ( inFloat.bits.biased_exponent==0xff ) { if ( !BRANCHLESS ) { if ( (inFloat.bits.mantissa==0) ) { /* // infinity output.bits.mantissa = 0; output.bits.biased_exponent = 31; */ // infinity maps to maxfloat output.bits.mantissa = 0x3ff; output.bits.biased_exponent = 0x1e; } else if ( (inFloat.bits.mantissa!=0) ) { /* // NaN output.bits.mantissa = 1; output.bits.biased_exponent = 31; */ // NaN maps to zero output.bits.mantissa = 0; output.bits.biased_exponent = 0; } } else // branchless, only meant for PPC really bc needing the cntlzw op. 
{ // else if ( inFloat.bits.biased_exponent==0xff ) // either infinity (biased_exponent is 0xff) or NaN. { #ifdef PLATFORM_PPC #if defined(__SPU__) int mantissamask = __builtin_clz( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise #else int mantissamask = __cntlzw( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise #endif #else int mantissamask = output.bits.mantissa ? -1 : 0; #endif output.bits.mantissa = isel( mantissamask, 0x3ff, 0 ); //infinity maps to maxfloat, NaN to zero output.bits.biased_exponent = isel( mantissamask, 0x1e, 0 ); output.bits.sign = inFloat.bits.sign; } } } else { // regular number int new_exp = inFloat.bits.biased_exponent-float32bias; // it's actually better to branch in these cases on PPC, // because the variable bit shift is such a massive penalty // that it's worth a branch penalty to avoid it. if (new_exp<-24) { // this maps to 0 output.bits.mantissa = 0; output.bits.biased_exponent = 0; } if (new_exp<-14) { // this maps to a denorm output.bits.biased_exponent = 0; unsigned int exp_val = ( unsigned int )( -14 - new_exp ); if( exp_val > 0 && exp_val < 11 ) { output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) ); } } else if (new_exp>15) { #if 0 // map this value to infinity output.bits.mantissa = 0; output.bits.biased_exponent = 31; #else // to big. . . maps to maxfloat output.bits.mantissa = 0x3ff; output.bits.biased_exponent = 0x1e; #endif } else { output.bits.biased_exponent = new_exp+15; output.bits.mantissa = (inFloat.bits.mantissa >> 13); } } return output.rawWord; } inline float float16::Convert16bitFloatTo32bits( unsigned short input ) { float32bits output; const float16bits &inFloat = *((float16bits *)&input); if( IsInfinity( inFloat ) ) { return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? 
-1.0f : 1.0f ); } if( IsNaN( inFloat ) ) { return 0.0; } if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 ) { // denorm const float half_denorm = (1.0f/16384.0f); // 2^-14 float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f; float sgn = (inFloat.bits.sign)? -1.0f :1.0f; output.rawFloat = sgn*mantissa*half_denorm; } else { // regular number unsigned mantissa = inFloat.bits.mantissa; unsigned biased_exponent = inFloat.bits.biased_exponent; unsigned sign = ((unsigned)inFloat.bits.sign) << 31; biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23; mantissa <<= (23-10); *((unsigned *)&output) = ( mantissa | biased_exponent | sign ); } return output.rawFloat; } inline Vector48& Vector48::operator=(const Vector &vOther) { CHECK_VALID(vOther); x.SetFloat( vOther.x ); y.SetFloat( vOther.y ); z.SetFloat( vOther.z ); return *this; } inline Vector48::operator Vector () { Vector tmp; tmp.x = x.GetFloat(); tmp.y = y.GetFloat(); tmp.z = z.GetFloat(); return tmp; } //========================================================= // Fit a 2D vector in 32 bits //========================================================= class Vector2d32 { public: // Construction/destruction: Vector2d32(void) {} Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); } // assignment Vector2d32& operator=(const Vector &vOther); Vector2d32& operator=(const Vector2D &vOther); operator Vector2D (); void Init( vec_t ix = 0.f, vec_t iy = 0.f); float16_with_assign x; float16_with_assign y; }; inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther) { x.SetFloat( vOther.x ); y.SetFloat( vOther.y ); return *this; } inline Vector2d32::operator Vector2D () { Vector2D tmp; tmp.x = x.GetFloat(); tmp.y = y.GetFloat(); return tmp; } inline void Vector2d32::Init( vec_t ix, vec_t iy ) { x.SetFloat(ix); y.SetFloat(iy); } //========================================================= // FAST SIMD BATCH OPERATIONS 
//========================================================= #ifdef _X360 //// Compressed vector formats: unpack Vector48 and Quaternion48 onto SIMD registers. // Only available on 360 for now because SSE1 lacks the necessary operations. SSE2 could // do it but we can't count on that yet. // If you have many v48's or q48's to stream, please note the functions designed to // work on them many at a time. extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16]; extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16]; // unpack a single vector48 at the pointer into the x,y,z components of a fltx4. // the w is total garbage. FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec ) { // load the three 16-bit floats into the first 48 bits of ret: fltx4 ret = XMLoadVector4((const void *)&pVec->x); // shuffle the top 64 bits of ret down to the least significant (the z,w) -- 16 of those bits are garbage. ret = __vrlimi( ret, ret, 2 | 1, 2 ); // rotate left by 2 words and insert into z,w components // now unpack the 16-bit floats into 32-bit floats. This is a hardware op, woohoo! ret = __vupkd3d( ret , VPACK_FLOAT16_4 ); return ret; } // unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4 // FIXME!!! If we need a version of this that runs on 360, there is a work-in-progress version that hasn't been debugged lower in the file. 
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
	// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
	// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
	// w is inferred from 1 - the dot product of the other three components. the top bit of what would otherwise be the 16-bit z is
	// w's sign bit.
	fltx4 q16s = XMLoadVector3((const void *)pVec);
	fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z.
	fltx4 permute = __lvx(&g_SIMD_Quat48_Unpack_Permute0, 0); // load the permute word that shuffles x,y,z into their own words
	bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.

	q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
	q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)

	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);

	/*
	fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.

	// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
	*/
	fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );

	// now, work out what w must be.
	fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
	dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones ); // keeps the sqrt argument in [0,1]

	fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
	ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
	if (wneg)
	{
		ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the negated ww vector into the w component of ret
	}
	else
	{
		ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
	}
	return ret;
}

// Many-at-a-time unpackers.

/// Unpack eight consecutive Vector48's in memory onto eight SIMD registers.
/// The Vector48 pointer must be 16-byte aligned. Eight Vector48s add up
/// to 48 bytes long. You should maybe think about prefetching.
FORCEINLINE void UnpackEightVector48SIMD( fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4,
	fltx4 &out5, fltx4 &out6, fltx4 &out7, fltx4 &out8,
	Vector48 * RESTRICT pVecs )
{
	// reinterpret_cast needs an explicit integer target type to test the low address bits
	AssertMsg((reinterpret_cast<size_t>(pVecs) & 0x0F) == 0, "Input to UnpackEightVector48SIMD is not 16-byte aligned." );

	// first load the data onto three packed SIMD vectors, which contain eight Vector48s between them.
	// I've named them very explicitly so you can follow the movement of the input data.
	fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
	x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reinterpret_cast<fltx4 *>(pVecs) + 0
	z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reinterpret_cast<fltx4 *>(pVecs) + 1
	y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reinterpret_cast<fltx4 *>(pVecs) + 2

	// Now, start unpacking. The __vupkd3d operation can turn 16-bit floats into 32-bit floats in a single op!
	// It converts the contents of the z and w words of the input fltx4 , so we need to process a word to do
	// one half, then rotate it to do the other half.
	fltx4 y1z1x2y2 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
	x0y0z0x1y1z1x2y2 = __vrlimi( x0y0z0x1y1z1x2y2, x0y0z0x1y1z1x2y2, 0xf, 2 ); // actually y1z1x2y2x0y0z0x1 now. For perf it's important that the first param to vrlimi also be the assignee.
	fltx4 x4y4z4x5 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
	z2x3y3z3x4y4z4x5 = __vrlimi( z2x3y3z3x4y4z4x5, z2x3y3z3x4y4z4x5, 0xf, 2 );
	fltx4 z6x7y7z7 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
	y5z5x6y6z6x7y7z7 = __vrlimi( y5z5x6y6z6x7y7z7, y5z5x6y6z6x7y7z7, 0xf, 2 );
	fltx4 x0y0z0x1 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
	fltx4 z2x3y3z3 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
	fltx4 y5z5x6y6 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );

	// permute to populate the out-registers with part of their vectors:
	out1 = x0y0z0x1; // DONE
	out2 = __vpermwi( y1z1x2y2, VPERMWI_CONST(0, 0, 1, 0) ); // __y1z1__
	out3 = __vpermwi( y1z1x2y2, VPERMWI_CONST(2, 3, 0, 0) ); // x2y2____
	out4 = __vpermwi( z2x3y3z3, VPERMWI_CONST(1, 2, 3, 0) ); // x3y3z3__ // DONE
	out5 = x4y4z4x5; // DONE
	out6 = __vpermwi( y5z5x6y6, VPERMWI_CONST(0, 0, 1, 0) ); // __y5z5__
	out7 = __vpermwi( y5z5x6y6, VPERMWI_CONST(2, 3, 0, 0) ); // x6y6____
	out8 = __vpermwi( z6x7y7z7, VPERMWI_CONST(1, 2, 3, 0) ); // x7y7z7__ // DONE

	// there are four more to finish, which we do with a masked insert
	out2 = __vrlimi( out2, x0y0z0x1, 8, 3 ); // x1y1z1__
	out3 = __vrlimi( out3, z2x3y3z3, 2, 2 ); // x2y2z2__
	out6 = __vrlimi( out6, x4y4z4x5, 8, 3 ); // x5y5z5__
	out7 = __vrlimi( out7, z6x7y7z7, 2, 2 ); // x6y6z6__

	// and we're done!
}

/// Unpack eight consecutive Quaternion48's in memory onto eight SIMD registers.
/// The Quaternion48 pointer must be 16-byte aligned. Eight Quaternion48s add up
/// to 48 bytes long. You should maybe think about prefetching.
//
// This could be improved with verticalization, so that the W sqrts happen
// on two rather than eight vectors, and then transposing. This would make
// the initial permutation even more complicated.
FORCEINLINE void UnpackEightQuaternion48SIMD( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4, fltx4 &out5, fltx4 &out6, fltx4 &out7, Quaternion48 * RESTRICT pVecs ) { AssertMsg((reinterpret_cast(pVecs) & 0x0F) == 0, "Input to UnpackEightQuaternion48SIMD is not 16-byte aligned." ); // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1 const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); const fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z left by one bit. // first load the data onto three packed SIMD vectors, which contain eight Quaternion48s between them. // I've named them very explicitly so you can follow the movement of the input data. fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7; x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reintrepret_cast(pVecs) + 0 z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast(pVecs) + 1 y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast(pVecs) + 2 // shove each quat onto its own fltx4, by using the permute operation // each halfword argument goes into the bottom 16 bits of the floating // point rep of 3.0f, then we use a magic constant to scale them. 
out0 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute0) ); // __x0__y0__z0____ out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____ // postpone 2 since it straddles two words, we'll get back to it out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2 out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5 // 5 straddles two words out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____ out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____ // now get back to the straddlers, which we make by blending together a prior output and the other source word out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____ out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast(&g_SIMD_Quat48_Unpack_Permute5) ); // __x5__y5__z5____ // the top bit of the z component in each word isn't part of the number; it's // a flag indicating whether the eventual w component should be negative. // so, we need to move the 0x00008000 bit of the z word onto the top bit // of the w word, which is a rotation two bytes right, or 14 bytes left. 
fltx4 wneg[8]; // juggle all the z halfwords left one bit (toss the wneg sign bit, multiply by two) wneg[0] = __vsldoi( out0, out0, 14 ); out0 = __vslh(out0, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[1] = __vsldoi( out1, out1, 14 ); out1 = __vslh(out1, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[2] = __vsldoi( out2, out2, 14 ); out2 = __vslh(out2, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[3] = __vsldoi( out3, out3, 14 ); out3 = __vslh(out3, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[4] = __vsldoi( out4, out4, 14 ); out4 = __vslh(out4, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[5] = __vsldoi( out5, out5, 14 ); out5 = __vslh(out5, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[6] = __vsldoi( out6, out6, 14 ); out6 = __vslh(out6, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) wneg[7] = __vsldoi( out7, out7, 14 ); out7 = __vslh(out7, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) // create a mask that is just the sign bit of the w word. 
fltx4 vAllOneBits = __vspltisw(-1); // Shift 31 fltx4 signMask = __vslw(vAllOneBits, vAllOneBits); // all the sign bits signMask = __vrlimi( signMask, Four_Zeros, 14, 0 ); // zero out x,y,z words // this macro defines the operations that will be performed on each of the eight words: // * scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0); // * take the xyz dot product to get 1 - w^2 // * subtract from one to get w^2 // * square root to get zero // * OR in the wneg sign mask to get sign for zero. // though the macro makes it look like these are being done in serial, // in fact the compiler will reorder them to minimize stalls. fltx4 ONE = Four_Ones; fltx4 dotxyz[8]; fltx4 ww[8]; // out0 = __vmaddfp( out0, vUpkMul, vUpkAdd ); // dotxyz[0] = Dot3SIMD( out0, out0 ); // clamnp dotxyz if it's more than 1.0 // all components are 1 - dotxyz // clear all but w's sign bit in wneg // all components are sqrt(1-dotxyz) // toggle w's sign where necessary // insert one element from the ww vector into the w component of ret #define COMPUTE( target, number ) \ target ## number = __vmaddfp( target ## number, vUpkMul, vUpkAdd ); \ dotxyz[number] = Dot3SIMD( target ## number, target ## number ); \ dotxyz[number] = __vminfp( dotxyz[number], ONE ); \ ww[number] = SubSIMD( ONE, dotxyz[number] ); \ wneg[number] = AndSIMD( wneg[number], signMask ) ; \ ww[number] = SqrtSIMD(ww[number]); \ ww[number] = OrSIMD( ww[number], wneg[number] ); \ target ## number = __vrlimi( target ## number, ww[number], 1, 0 ); COMPUTE(out, 0); COMPUTE(out, 1); COMPUTE(out, 2); COMPUTE(out, 3); COMPUTE(out, 4); COMPUTE(out, 5); COMPUTE(out, 6); COMPUTE(out, 7); #undef COMPUTE } #elif defined(_PS3) // unpack a single vector48 at the pointer into the x,y,z components of a fltx4. // the w is total garbage. 
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec ) { // PS3 libs just give us this Vectormath::Aos::Vector3 ret; Vectormath::Aos::loadHalfFloats( ret, reinterpret_cast(&pVec->x) ); return ret.get128(); } extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16]; extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16]; extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16]; // unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4 FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec ) { // A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 . // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 . // w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is // w's sign bit. fltx4 q16s = LoadUnaligned3SIMD((const void *)pVec); #if defined(__SPU__) vec_ushort8 shift = vec_ld( 0, (short unsigned int *)g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z. vec_uchar16 permute = vec_ld(0, (unsigned char *)g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words #else vec_ushort8 shift = vec_ld( 0, g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z. 
vec_uchar16 permute = vec_ld(0, g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words #endif bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS. q16s = vec_perm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f #if defined(__SPU__) q16s = (fltx4) vec_sl( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) #else q16s = (fltx4) vec_vslh( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16) #endif // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1 const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s }; const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd ); // now, work out what w must be. fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self. dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones ); fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz) // insert one element from the ww vector into the w component of ret ret = MaskedAssign( LoadAlignedSIMD(g_SIMD_ComponentMask[3]), wneg ? NegSIMD(ww) : ww, ret ); return ret; } #endif #if defined( _X360 ) #pragma bitfield_order( pop ) #elif defined( _PS3 ) #pragma ms_struct off #pragma reverse_bitfields off #endif #endif