Counter Strike : Global Offensive Source Code


  1. //========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose:
  4. //
  5. // $NoKeywords: $
  6. //
  7. //=============================================================================//
  8. #ifndef COMPRESSED_VECTOR_H
  9. #define COMPRESSED_VECTOR_H
  10. #ifdef _WIN32
  11. #pragma once
  12. #endif
  13. #include <math.h>
  14. #include <float.h>
  15. // For vec_t, put this somewhere else?
  16. #include "basetypes.h"
  17. // For rand(). We really need a library!
  18. #include <stdlib.h>
  19. #include "tier0/dbg.h"
  20. #include "mathlib/vector.h"
  21. #include "mathlib/mathlib.h"
  22. #include "mathlib/ssemath.h"
  23. #ifdef _PS3
  24. #if defined(__SPU__)
  25. #include <spu_intrinsics.h>
  26. #include <vmx2spu.h>
  27. #endif
  28. #include <vectormath/cpp/vectormath_aos.h>
  29. #endif
  30. #if defined( _X360 )
  31. #pragma bitfield_order( push, lsb_to_msb )
  32. #elif defined( _PS3 )
  33. #pragma ms_struct on
  34. #pragma reverse_bitfields on
  35. #endif
  36. #ifdef OSX
  37. #pragma GCC diagnostic ignored "-Wtautological-compare"
  38. #endif
  39. class Quaternion48;
  40. FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec );
  41. //=========================================================
  42. // fit a 3D vector into 32 bits
  43. //=========================================================
  44. class Vector32
  45. {
  46. public:
  47. // Construction/destruction:
  48. Vector32(void);
  49. Vector32(vec_t X, vec_t Y, vec_t Z);
  50. // assignment
  51. Vector32& operator=(const Vector &vOther);
  52. operator Vector ();
  53. private:
  54. unsigned short x:10;
  55. unsigned short y:10;
  56. unsigned short z:10;
  57. unsigned short exp:2;
  58. };
  59. inline Vector32& Vector32::operator=(const Vector &vOther)
  60. {
  61. CHECK_VALID(vOther);
  62. static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
  63. float fmax = MAX( fabs( vOther.x ), fabs( vOther.y ) );
  64. fmax = fpmax( fmax, fabs( vOther.z ) );
  65. for (exp = 0; exp < 3; exp++)
  66. {
  67. if (fmax < expScale[exp])
  68. break;
  69. }
  70. Assert( fmax < expScale[exp] );
  71. float fexp = 512.0f / expScale[exp];
  72. x = clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
  73. y = clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
  74. z = clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
  75. return *this;
  76. }
  77. inline Vector32::operator Vector ()
  78. {
  79. Vector tmp;
  80. static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
  81. float fexp = expScale[exp] / 512.0f;
  82. tmp.x = (((int)x) - 512) * fexp;
  83. tmp.y = (((int)y) - 512) * fexp;
  84. tmp.z = (((int)z) - 512) * fexp;
  85. return tmp;
  86. }
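// Usage sketch (illustrative annotation; not from the original header). Vector32 picks the
// smallest of the four ranges {4, 16, 32, 64} that contains the largest component, then
// stores each component as a 10-bit offset from 512:
//
//   Vector in( 1.5f, -2.25f, 3.0f );   // fmax = 3.0 -> exp = 0, fexp = 512/4 = 128
//   Vector32 packed;
//   packed = in;                       // x = 1.5*128 + 512 = 704, y = 224, z = 896
//   Vector out = packed;               // (704-512)/128 = 1.5, -2.25, 3.0 (exact here)
//
// The quantization step is expScale[exp]/512: about 0.008 units at the finest range and
// 0.125 units at the coarsest.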
  87. //=========================================================
  88. // Fit a unit vector into 32 bits
  89. //=========================================================
  90. class Normal32
  91. {
  92. public:
  93. // Construction/destruction:
  94. Normal32(void);
  95. Normal32(vec_t X, vec_t Y, vec_t Z);
  96. // assignment
  97. Normal32& operator=(const Vector &vOther);
  98. operator Vector ();
  99. private:
  100. unsigned short x:15;
  101. unsigned short y:15;
  102. unsigned short zneg:1;
  103. };
  104. inline Normal32& Normal32::operator=(const Vector &vOther)
  105. {
  106. CHECK_VALID(vOther);
  107. x = clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 );
  108. y = clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 );
  109. zneg = (vOther.z < 0);
  110. //x = vOther.x;
  111. //y = vOther.y;
  112. //z = vOther.z;
  113. return *this;
  114. }
  115. inline Normal32::operator Vector ()
  116. {
  117. Vector tmp;
  118. tmp.x = ((int)x - 16384) * (1 / 16384.0);
  119. tmp.y = ((int)y - 16384) * (1 / 16384.0);
  120. tmp.z = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y );
  121. if (zneg)
  122. tmp.z = -tmp.z;
  123. return tmp;
  124. }
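// Usage sketch (illustrative annotation; not from the original header). Normal32 assumes the
// input is unit length: x and y are kept at 15-bit precision and |z| is rebuilt from
// sqrt(1 - x*x - y*y), so only z's sign needs to be stored:
//
//   Vector n( 0.6f, 0.0f, -0.8f );     // unit normal
//   Normal32 packed;
//   packed = n;                        // x = 0.6*16384 + 16384 = 26214, y = 16384, zneg = 1
//   Vector out = packed;               // approximately (0.600, 0.000, -0.800)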
  125. //=========================================================
  126. // 64 bit Quaternion
  127. //=========================================================
  128. class Quaternion64
  129. {
  130. public:
  131. // Construction/destruction:
  132. Quaternion64(void);
  133. Quaternion64(vec_t X, vec_t Y, vec_t Z);
  134. // assignment
  135. // Quaternion& operator=(const Quaternion64 &vOther);
  136. Quaternion64& operator=(const Quaternion &vOther);
  137. operator Quaternion () const;
  138. inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
  139. private:
  140. Quaternion64( uint64 xx, uint64 yy, uint64 zz, uint64 ww ) : x(xx), y(yy), z(zz), wneg(ww) {}; // strictly for static construction
  141. uint64 x:21;
  142. uint64 y:21;
  143. uint64 z:21;
  144. uint64 wneg:1;
  145. };
  146. inline Quaternion64::operator Quaternion () const
  147. {
  148. #if defined(__SPU__)
  149. fltx4 tmpV;
  150. QuaternionAligned tmpQ;
  151. tmpV = LoadUnalignedSIMD();
  152. StoreAlignedSIMD( (float *)&tmpQ, tmpV );
  153. return tmpQ;
  154. #else
  155. Quaternion tmp;
  156. // shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0
  157. tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
  158. tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
  159. tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
  160. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  161. if (wneg)
  162. tmp.w = -tmp.w;
  163. return tmp;
  164. #endif
  165. }
  166. inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)
  167. {
  168. CHECK_VALID(vOther);
  169. x = clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 );
  170. y = clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 );
  171. z = clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 );
  172. wneg = (vOther.w < 0);
  173. return *this;
  174. }
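// Worked example (illustrative annotation; not from the original header). Quaternion64 keeps
// x, y and z as 21-bit fixed point on roughly [-1, 1] (step ~ 2/2^21 ~ 9.5e-7) and rebuilds
// |w| from sqrt(1 - x*x - y*y - z*z); the remaining bit stores w's sign:
//
//   Quaternion identity( 0, 0, 0, 1 );
//   Quaternion64 packed;
//   packed = identity;                 // x = y = z = 1048576, wneg = 0
//   Quaternion out = packed;           // x = y = z = 0 exactly, w = sqrt(1) = 1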
  175. inline fltx4 Quaternion64::LoadUnalignedSIMD() const
  176. {
  177. #ifdef _PS3 // assume little endian packing
  178. #if 1
  179. const static u32x4 xmask = { 0x00000000, 0x001fffff, 0, 0 }; // bottom 21 bits ( 0 .. 20 ) true
  180. const static u32x4 ymask = { 0x000003ff, 0xffe00000, 0, 0 }; // bits 21 .. 41 true
  181. const static u32x4 zmask = { 0x7ffffC00, 0x00000000, 0, 0 }; // bits 42 .. 62 true
  182. const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; // only bit 63 is true
  183. const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
  184. // fish x, y, and z and put them into the first words of their respective vec registers
  185. // the end type for these registers must be signed for the following subtract, BUT!
  186. // the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
  187. // the code as present assumes that the fused multiply-add operation has an intermediate
  188. // precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
  189. // int op because of course 21 bits is right at the limit of floating point precision.
  190. i32x4 ix = (i32x4) (ShiftLeftByBits<32>(vec_and( qbits, xmask ))); // shift x left by one full word (32 bits) so its 21 bits of precision are sitting at the low end of the first word
  191. i32x4 iy = (i32x4) (ShiftLeftByBits<11>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 11 bits so its 21 bits of precision are sitting at the low end of the first word
  192. i32x4 iz = (i32x4) (ShiftRightByBits<10>(vec_and( qbits, zmask ))); // shift z right by 10 bits so its 21 bits of precision are sitting at the low end of the first word
  193. /* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
  194. i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
  195. i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
  196. */
  197. i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
  198. // convert each of the vectors from int to float. (because of the way the pipeline is organized,
  199. // it's as fast to do this as it would have been to do by combining them into one register above
  200. // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
  201. // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
  202. // by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
  203. const fltx4 ONE = LoadOneSIMD();
  204. #if defined(__SPU__)
  205. fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
  206. fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
  207. fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
  208. #else
  209. fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
  210. fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
  211. fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
  212. #endif
  213. // compute the dot product
  214. fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
  215. fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
  216. fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
  217. fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
  218. fltx4 result = Compress4SIMD( fx, fy, fz, fw );
  219. // and for the coup de grace, set the sign bit of fw appropriately
  220. result = OrSIMD( result, (fltx4)wsignbit );
  221. return result;
  222. #else
  223. // original version
  224. /*
  225. union Qmask {
  226. struct qq {
  227. Quaternion64 mask;
  228. uint64 padding;
  229. } asQ ;
  230. u32x4 asVec;
  231. Qmask( const Quaternion64 &m ) : mask(m) {}
  232. };
  233. */
  234. const static u32x4 xmask = { 0xfffff800, 0x00000000, 0, 0 }; // top 21 bits ( 0 .. 20 ) true
  235. const static u32x4 ymask = { 0x000007ff, 0xffc00000, 0, 0 }; // bits 21 .. 41 true
  236. const static u32x4 zmask = { 0x00000000, 0x003ffffe, 0, 0 }; // bits 42 .. 62 true
  237. const static u32x4 wmask = { 0x00000000, 0x00000001, 0, 0 }; // only bit 63 is true
  238. const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
  239. // fish x, y, and z and put them into the first words of their respective vec registers
  240. // the end type for these registers must be signed for the following subtract, BUT!
  241. // the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
  242. // the code as present assumes that the fused multiply-add operation has an intermediate
  243. // precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
  244. // int op because of course 21 bits is right at the limit of floating point precision.
  245. i32x4 ix = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word
  246. i32x4 iy = (i32x4) (ShiftLeftByBits<10>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word
  247. i32x4 iz = (i32x4) (ShiftLeftByBits<31>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word
  248. /* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
  249. i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
  250. i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
  251. */
  252. i32x4 wsignbit = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
  253. // convert each of the vectors from int to float. (because of the way the pipeline is organized,
  254. // it's as fast to do this as it would have been to do by combining them into one register above
  255. // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
  256. // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
  257. // by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
  258. const fltx4 ONE = LoadOneSIMD();
  259. #if defined(__SPU__)
  260. fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
  261. fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
  262. fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
  263. #else
  264. fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
  265. fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
  266. fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
  267. #endif
  268. // compute the dot product
  269. fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
  270. fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
  271. fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
  272. fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
  273. fltx4 result = Compress4SIMD( fx, fy, fz, fw );
  274. // and for the coup de grace, set the sign bit of fw appropriately
  275. result = OrSIMD( result, (fltx4)wsignbit );
  276. return result;
  277. #endif
  278. #elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
  279. struct { float x; float y; float z; float w; } tmp;
  280. tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
  281. tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
  282. tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
  283. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  284. if (wneg)
  285. tmp.w = -tmp.w;
  286. fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
  287. return ret;
  288. #else // naive implementation (which ends up being faster than the explicit C implementation above)
  289. const QuaternionAligned q(Quaternion(*this)) ;
  290. return LoadAlignedSIMD( &q );
  291. #endif
  292. }
  293. //=========================================================
  294. // 48 bit Quaternion
  295. //=========================================================
  296. class Quaternion48
  297. {
  298. public:
  299. // Construction/destruction:
  300. Quaternion48(void);
  301. Quaternion48(vec_t X, vec_t Y, vec_t Z);
  302. // assignment
  303. // Quaternion& operator=(const Quaternion48 &vOther);
  304. Quaternion48& operator=(const Quaternion &vOther);
  305. operator Quaternion () const;
  306. inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
  307. //private:
  308. unsigned short x:16;
  309. unsigned short y:16;
  310. unsigned short z:15;
  311. unsigned short wneg:1;
  312. };
  313. inline Quaternion48::operator Quaternion () const
  314. {
  315. #if defined(__SPU__)
  316. fltx4 tmpV;
  317. QuaternionAligned tmpQ;
  318. tmpV = LoadUnalignedSIMD();
  319. StoreAlignedSIMD( (float *)&tmpQ, tmpV );
  320. tmpV = UnpackQuaternion48SIMD( this );
  321. StoreAlignedSIMD( (float *)&tmpQ, tmpV );
  322. return tmpQ;
  323. #else
  324. Quaternion tmp;
  325. tmp.x = ((int)x - 32768) * (1 / 32768.5);
  326. tmp.y = ((int)y - 32768) * (1 / 32768.5);
  327. tmp.z = ((int)z - 16384) * (1 / 16384.5);
  328. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  329. if (wneg)
  330. tmp.w = -tmp.w;
  331. return tmp;
  332. #endif
  333. }
  334. inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)
  335. {
  336. CHECK_VALID(vOther);
  337. x = clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 );
  338. y = clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 );
  339. z = clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 );
  340. wneg = (vOther.w < 0);
  341. return *this;
  342. }
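// Worked example (illustrative annotation; not from the original header). Quaternion48 gives
// x and y 16 bits, z 15 bits, and stores only w's sign; |w| comes from the unit-length
// constraint, so most of the reconstruction error lands on w when w is small:
//
//   Quaternion q( 0.0f, 0.70710678f, 0.0f, 0.70710678f );   // 90 degrees about Y
//   Quaternion48 packed;
//   packed = q;                        // x = 32768, y = 55938, z = 16384, wneg = 0
//   Quaternion out = packed;           // y ~= 0.70709, w = sqrt(1 - y*y) ~= 0.70712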
  343. inline fltx4 Quaternion48::LoadUnalignedSIMD() const
  344. {
  345. #ifdef _PS3 // assume little endian packing
  346. const static u32x4 xmask = { 0x00000000, 0xffff0000, 0, 0 };
  347. const static u32x4 ymask = { 0x0000ffff, 0x00000000, 0, 0 };
  348. const static u32x4 zmask = { 0x7fff0000, 0x00000000, 0, 0 };
  349. const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 };
  350. const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
  351. // fish x, y, and z and put them into the first words of their respective vec registers
  352. i32x4 ix = (i32x4) (ShiftLeftByBits<16>(vec_and( qbits, xmask )));
  353. i32x4 iy = (i32x4) ((vec_and( qbits, ymask )));
  354. i32x4 iz = (i32x4) (ShiftRightByBits<16>(vec_and( qbits, zmask )));
  355. // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
  356. i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));
  357. // convert each of the vectors from int to float. (because of the way the pipeline is organized,
  358. // it's as fast to do this as it would have been to do by combining them into one register above
  359. // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
  360. // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
  361. // by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
  362. const fltx4 ONE = LoadOneSIMD();
  363. #if defined(__SPU__)
  364. fltx4 fx = SubSIMD( vec_ctf( ix, 15 ), ONE);
  365. fltx4 fy = SubSIMD( vec_ctf( iy, 15 ), ONE);
  366. fltx4 fz = SubSIMD( vec_ctf( iz, 14 ), ONE);
  367. #else
  368. fltx4 fx = SubSIMD( vec_vcfsx( ix, 15 ), ONE);
  369. fltx4 fy = SubSIMD( vec_vcfsx( iy, 15 ), ONE);
  370. fltx4 fz = SubSIMD( vec_vcfsx( iz, 14 ), ONE);
  371. #endif
  372. // compute the dot product
  373. fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
  374. fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
  375. fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
  376. fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
  377. fltx4 result = Compress4SIMD( fx, fy, fz, fw );
  378. // and for the coup de grace, set the sign bit of fw appropriately
  379. result = OrSIMD( result, (fltx4)wsignbit );
  380. return result;
  381. #elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
  382. struct { float x; float y; float z; float w; } tmp;
  383. tmp.x = ((int)x - 32768) * (1 / 32768.5);
  384. tmp.y = ((int)y - 32768) * (1 / 32768.5);
  385. tmp.z = ((int)z - 16384) * (1 / 16384.5);
  386. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  387. if (wneg)
  388. tmp.w = -tmp.w;
  389. fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
  390. return ret;
  391. #else // naive implementation (which ends up being faster than the explicit C implementation above)
  392. const QuaternionAligned q(Quaternion(*this)) ;
  393. return LoadAlignedSIMD( &q );
  394. #endif
  395. }
  396. //=========================================================
  397. // 48 bit sorted Quaternion
  398. //=========================================================
  399. class Quaternion48S
  400. {
  401. public:
  402. // Construction/destruction:
  403. Quaternion48S(void);
  404. Quaternion48S(vec_t X, vec_t Y, vec_t Z);
  405. // assignment
  406. // Quaternion& operator=(const Quaternion48 &vOther);
  407. Quaternion48S& operator=(const Quaternion &vOther);
  408. operator Quaternion () const;
  409. operator fltx4 () const RESTRICT ;
  410. //private:
  411. // shift the quaternion so that the largest value is recreated by the sqrt()
  412. // abcd maps modulo into quaternion xyzw starting at "offset"
  413. // "offset" is split into two 1 bit fields so that the data packs into 6 bytes (3 shorts)
  414. unsigned short a:15; // first of the 3 consecutive smallest quaternion elements
  415. unsigned short offsetH:1; // high bit of "offset"
  416. unsigned short b:15;
  417. unsigned short offsetL:1; // low bit of "offset"
  418. unsigned short c:15;
  419. unsigned short dneg:1; // sign of the largest quaternion element
  420. };
  421. #define SCALE48S 23168.0f // needs to fit 2*sqrt(0.5) into 15 bits.
  422. #define SHIFT48S 16384 // half of 2^15 bits.
  423. inline Quaternion48S::operator Quaternion () const
  424. {
  425. #if defined(__SPU__)
  426. fltx4 tmpV;
  427. QuaternionAligned tmpQ;
  428. tmpV = *this;
  429. StoreAlignedSIMD( (float *)&tmpQ, tmpV );
  430. return tmpQ;
  431. #else
  432. Quaternion tmp;
  433. COMPILE_TIME_ASSERT( sizeof( Quaternion48S ) == 6 );
  434. float *ptmp = &tmp.x;
  435. int ia = offsetL + offsetH * 2;
  436. int ib = ( ia + 1 ) % 4;
  437. int ic = ( ia + 2 ) % 4;
  438. int id = ( ia + 3 ) % 4;
  439. ptmp[ia] = ( (int)a - SHIFT48S ) * ( 1.0f / SCALE48S );
  440. ptmp[ib] = ( (int)b - SHIFT48S ) * ( 1.0f / SCALE48S );
  441. ptmp[ic] = ( (int)c - SHIFT48S ) * ( 1.0f / SCALE48S );
  442. ptmp[id] = sqrt( 1.0f - ptmp[ia] * ptmp[ia] - ptmp[ib] * ptmp[ib] - ptmp[ic] * ptmp[ic] );
  443. if (dneg)
  444. ptmp[id] = -ptmp[id];
  445. return tmp;
  446. #endif
  447. }
  448. inline Quaternion48S& Quaternion48S::operator=(const Quaternion &vOther)
  449. {
  450. CHECK_VALID(vOther);
  451. const float *ptmp = &vOther.x;
  452. // find largest field, make sure that one is recreated by the sqrt to minimize error
  453. int i = 0;
  454. if ( fabs( ptmp[i] ) < fabs( ptmp[1] ) )
  455. {
  456. i = 1;
  457. }
  458. if ( fabs( ptmp[i] ) < fabs( ptmp[2] ) )
  459. {
  460. i = 2;
  461. }
  462. if ( fabs( ptmp[i] ) < fabs( ptmp[3] ) )
  463. {
  464. i = 3;
  465. }
  466. int offset = ( i + 1 ) % 4; // make "a" so that "d" is the largest element
  467. offsetL = offset & 1;
  468. offsetH = offset > 1;
  469. a = clamp( (int)(ptmp[ offset ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
  470. b = clamp( (int)(ptmp[ ( offset + 1 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
  471. c = clamp( (int)(ptmp[ ( offset + 2 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
  472. dneg = ( ptmp[ ( offset + 3 ) % 4 ] < 0.0f );
  473. return *this;
  474. }
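// Worked example (illustrative annotation; not from the original header) of the index shuffle
// above. The largest-magnitude component is the one recreated by the sqrt(), which minimizes
// the error: the sensitivity of sqrt(1 - a*a - b*b - c*c) to errors in a, b, c is |a|/|d|
// and so on, smallest when the rebuilt value d is the largest component.
//
//   Quaternion q( 0.1f, 0.2f, 0.3f, 0.9274f );   // roughly unit; |w| is largest, so i = 3
//   Quaternion48S packed;
//   packed = q;                        // offset = (3+1)%4 = 0 -> offsetH = 0, offsetL = 0,
//                                      // a/b/c hold x/y/z, dneg = 0
//   Quaternion out = packed;           // ia..ic map back onto x..z, w rebuilt by the sqrt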
  475. // decode onto a SIMD register
  476. inline Quaternion48S::operator fltx4 () const RESTRICT
  477. {
  478. AssertMsg1( (((uintp) this) & 1) == 0, "Quaternion48S is unaligned at %p\n", this );
  479. #ifdef PLATFORM_PPC // this algorithm depends heavily on the Altivec permute op, for which there is no analogue in SSE. This function should not be used on PC.
  480. // define some vector constants. the shift-scale will be done as a fused multiply-add,
  481. // with the scale already distributed onto the shift (the part subtracted)
  482. const static fltx4 vrSCALE48S = { (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S) };
  483. const static fltx4 vrSHIFT48S = { ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S };
  484. // start by hoisting the q48 onto a SIMD word.
  485. u32x4 source = (u32x4) LoadUnalignedSIMD( this );
  486. const u32x4 ZERO = (u32x4) LoadZeroSIMD();
  487. // also hoist the offset into an int word. Hopefully this executes in parallel with the vector ops thanks to SUPERSCALAR!
  488. const unsigned int offset = offsetL | ( offsetH << 1 );
  489. const bi32x4 vDMask = (bi32x4) LoadAlignedSIMD( g_SIMD_ComponentMask[(offset+3)%4] ); // lets vsel poke D into the right word
  490. #if 0 // This code can be used to deal with a situation where LoadUnalignedSIMD() fails to properly load
  491. // vectors starting on halfword boundaries (rather than 32-bit aligned). Because this is a 48-bit
  492. // structure, sometimes it'll only be 16-bit aligned. I expected that lvlx would always load from
  493. // a word boundary, requiring me to shift misaligned vectors over by 16 bits, but evidently,
  494. // lvlx actually works even on halfword boundaries. Who knew!
  495. // Anyway, this code is still here in case the problem crops up, as a hint to both cause and solution.
  496. if ( ((unsigned int) this) & 2 )
  497. {
  498. source = ShiftLeftByBits<16>(source);
  499. }
  500. #endif
  501. // mask out the offset and dneg bits. Because of the packing #pragmas, the one-bit fields are actually at the MSB
  502. // of the halfwords, not the LSB as you might expect.
  503. ALIGN16 const static uint32 vMaskTopBits[4] = { 0x80008000, 0x80000000, 0, 0 }; // just the MSB of each of the first three halfwords
  504. u32x4 abc = AndNotSIMD( (u32x4) LoadAlignedSIMD(vMaskTopBits), source ); // now this is just the A, B, C halfwords.
  505. // Next, unpack abc as unsigned numbers. We can do this with a permute op. In fact, we can exploit
  506. // the integer pipe and load the offset while we're loading the SIMD numbers, then use the integer offset to select
  507. // the permute, which will therefore also perform the rotate that maps abc to their rightful destinations.
  508. // the masks below are for the vperm instruction, which is a byte-by-byte mapping from source to destination.
  509. // it's assumed that the FIRST parameter to vperm will be ZERO, and the second the data. (that makes the masks a little clearer)
  510. // in the simplest case -- imagine each letter below represents one byte; the source vector looks like
  511. // AABB CCxx xxxx xxxx. We're going to permute it onto the work register like
  512. // 00AA 00BB 00CC 0000
  513. ALIGN16 const static uint32 vPermutations[4][4] = {
  514. // offset = 0 means a->x, b->y, c->z, d->w
  515. { 0x00001011, 0x00001213, 0x00001415, 0x00000000 },
  516. // offset = 1 means a->y, b->z, c->w, d->a
  517. { 0x00000000, 0x00001011, 0x00001213, 0x00001415 },
  518. { 0x00001415, 0x00000000, 0x00001011, 0x00001213 },
  519. { 0x00001213, 0x00001415, 0x00000000, 0x00001011 }
  520. };
  521. // compute two permutations on the input data: one where the zero-word is always in the w component,
  522. // which lets us do a 3-way rather than 4-way dot product; and another where the zero-word corresponds to
  523. // wherever D is supposed to go.
  524. // Even though this seems redundant, the duplicated work ends up fitting into the pipeline bubbles,
  525. // and the savings between a 4-way and 3-way dot seem to be about 3ns.
  526. u32x4 abcfordot = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[0] ) );
  527. abc = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[offset] ) );
  528. // turn each of the ints into floats. Because we masked out the one-bit field at the top,
  529. // We can think of this as a conversion from fixed-point where there's no fractional bit.
  530. // This is done in line with the shift-scale operation, which is itself fused.
  531. // we do this twice: once for the vector with the guaranteed zero w-word, and
  532. // once for the vector rotated by the offset.
  533. fltx4 vfDest = AndNotSIMD( vDMask, MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abc, 0 ), vrSCALE48S, vrSHIFT48S ) );
  534. fltx4 vfDestForDot = MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abcfordot, 0 ), vrSCALE48S, vrSHIFT48S ) ;
  535. // compute magnitude of the vector we know to have a 0 in the w word.
  536. const fltx4 vDot = Dot3SIMD( vfDestForDot, vfDestForDot );
  537. // recover the "D" word
  538. const fltx4 vD = SqrtSIMD( SubSIMD( LoadOneSIMD(), vDot ) );
  539. // mask D into the converted-and-offset vector, then return.
  540. return MaskedAssign( vDMask, dneg ? NegSIMD(vD) : vD, vfDest );
  541. #else
  542. AssertMsg( false, "Quaternion48S::operator fltx4 is slow on this platform and should not be used.\n" );
  543. QuaternionAligned q( (Quaternion) *this );
  544. return LoadAlignedSIMD( &q );
  545. #endif
  546. }
  547. //=========================================================
  548. // 32 bit Quaternion
  549. //=========================================================
  550. class Quaternion32
  551. {
  552. public:
  553. // Construction/destruction:
  554. Quaternion32(void);
  555. Quaternion32(vec_t X, vec_t Y, vec_t Z);
  556. // assignment
  557. // Quaternion& operator=(const Quaternion48 &vOther);
  558. Quaternion32& operator=(const Quaternion &vOther);
  559. operator Quaternion ();
  560. inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
  561. private:
  562. unsigned int x:11;
  563. unsigned int y:10;
  564. unsigned int z:10;
  565. unsigned int wneg:1;
  566. };
  567. inline Quaternion32::operator Quaternion ()
  568. {
  569. #if defined(__SPU__)
  570. fltx4 tmpV;
  571. QuaternionAligned tmpQ;
  572. tmpV = LoadUnalignedSIMD();
  573. StoreAlignedSIMD( (float *)&tmpQ, tmpV );
  574. return tmpQ;
  575. #else
  576. Quaternion tmp;
  577. tmp.x = ((int)x - 1024) * (1 / 1024.0);
  578. tmp.y = ((int)y - 512) * (1 / 512.0);
  579. tmp.z = ((int)z - 512) * (1 / 512.0);
  580. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  581. if (wneg)
  582. tmp.w = -tmp.w;
  583. return tmp;
  584. #endif
  585. }
  586. inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)
  587. {
  588. CHECK_VALID(vOther);
  589. x = clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 );
  590. y = clamp( (int)(vOther.y * 512) + 512, 0, 1023 );
  591. z = clamp( (int)(vOther.z * 512) + 512, 0, 1023 );
  592. wneg = (vOther.w < 0);
  593. return *this;
  594. }
  595. inline fltx4 Quaternion32::LoadUnalignedSIMD() const
  596. {
  597. #ifdef _PS3 // assume little endian packing
  598. const static u32x4 xmask = { 0x000007ff, 0, 0, 0 };
  599. const static u32x4 ymask = { 0x001ff800, 0, 0, 0 };
  600. const static u32x4 zmask = { 0x7fe00000, 0, 0, 0 };
  601. const static u32x4 wmask = { 0x80000000, 0, 0, 0 };
  602. const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
  603. // fish x, y, and z and put them into the first words of their respective vec registers
  604. i32x4 ix = (i32x4) ((vec_and( qbits, xmask )));
  605. i32x4 iy = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, ymask )));
  606. i32x4 iz = (i32x4) (ShiftRightByBits<21>(vec_and( qbits, zmask )));
  607. // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
  608. i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));
  609. // convert each of the vectors from int to float. (because of the way the pipeline is organized,
  610. // it's as fast to do this as it would have been to do by combining them into one register above
  611. // and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
  612. // map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints, unlike the float-
  613. // by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
  614. const fltx4 ONE = LoadOneSIMD();
  615. #if defined(__SPU__)
  616. fltx4 fx = SubSIMD( vec_ctf( ix, 10 ), ONE);
  617. fltx4 fy = SubSIMD( vec_ctf( iy, 9 ), ONE);
  618. fltx4 fz = SubSIMD( vec_ctf( iz, 9 ), ONE);
  619. #else
  620. fltx4 fx = SubSIMD( vec_vcfsx( ix, 10 ), ONE);
  621. fltx4 fy = SubSIMD( vec_vcfsx( iy, 9 ), ONE); // y is a 10-bit field, so convert with a 2^9 scale (matches the SPU path above)
  622. fltx4 fz = SubSIMD( vec_vcfsx( iz, 9 ), ONE);
  623. #endif
  624. // compute the dot product
  625. fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
  626. fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
  627. fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
  628. fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
  629. fltx4 result = Compress4SIMD( fx, fy, fz, fw );
  630. // and for the coup de grace, set the sign bit of fw appropriately
  631. result = OrSIMD( result, (fltx4)wsignbit );
  632. return result;
  633. #else
  634. struct { float x; float y; float z; float w; } tmp;
  635. tmp.x = ((int)x - 1024) * (1 / 1024.0);
  636. tmp.y = ((int)y - 512) * (1 / 512.0);
  637. tmp.z = ((int)z - 512) * (1 / 512.0);
  638. tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
  639. if (wneg)
  640. tmp.w = -tmp.w;
  641. fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
  642. return ret;
  643. #endif
  644. }
  645. //=========================================================
  646. // 16 bit float
  647. //=========================================================
  648. const int float32bias = 127;
  649. const int float16bias = 15;
  650. const float maxfloat16bits = 65504.0f;
  651. class float16
  652. {
  653. public:
  654. // float16() {};
  655. //float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
  656. float16& operator=(const unsigned short &other) { m_storage.rawWord = other; return *this; };
  657. void Init() { m_storage.rawWord = 0; }
  658. // float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; }
  659. // float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
  660. // operator unsigned short () { return m_storage.rawWord; }
  661. // operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
  662. unsigned short GetBits() const
  663. {
  664. return m_storage.rawWord;
  665. }
  666. float GetFloat() const
  667. {
  668. return Convert16bitFloatTo32bits( m_storage.rawWord );
  669. }
  670. void SetFloat( float in )
  671. {
  672. m_storage.rawWord = ConvertFloatTo16bits( in );
  673. }
  674. bool IsInfinity() const
  675. {
  676. return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0;
  677. }
  678. bool IsNaN() const
  679. {
  680. return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0;
  681. }
  682. bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; }
  683. bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; }
  684. // bool operator< (const float other) const { return GetFloat() < other; }
  685. // bool operator> (const float other) const { return GetFloat() > other; }
  686. template< bool BRANCHLESS > // allows you to force branchy/branchless implementation regardless of the current platform
  687. static unsigned short ConvertFloatTo16bitsNonDefault( float input );
  688. static float Convert16bitFloatTo32bits( unsigned short input );
  689. // a special case useful for the pixel writer: take four input float values, which are already in memory (not on registers),
  690. // convert them all at once and write them sequentially through the output pointer.
  691. static void ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
  692. const float *a, const float *b, const float *c, const float *d );
  693. // unfortunately, function templates can't have default template parameters in 2010-era C++
  694. inline static unsigned short ConvertFloatTo16bits( float input )
  695. { // default to branchless on ppc and branchy on x86
  696. #ifdef PLATFORM_PPC
  697. return ConvertFloatTo16bitsNonDefault<true>(input);
  698. #else
  699. return ConvertFloatTo16bitsNonDefault<false>(input);
  700. #endif
  701. }
  702. protected:
  703. union float32bits
  704. {
  705. float rawFloat;
  706. uint32 rawAsInt;
  707. struct
  708. {
  709. unsigned int mantissa : 23;
  710. unsigned int biased_exponent : 8;
  711. unsigned int sign : 1;
  712. } bits;
  713. };
  714. union float16bits
  715. {
  716. unsigned short rawWord;
  717. struct
  718. {
  719. unsigned short mantissa : 10;
  720. unsigned short biased_exponent : 5;
  721. unsigned short sign : 1;
  722. } bits;
  723. };
  724. static bool IsNaN( float16bits in )
  725. {
  726. return in.bits.biased_exponent == 31 && in.bits.mantissa != 0;
  727. }
  728. static bool IsInfinity( float16bits in )
  729. {
  730. return in.bits.biased_exponent == 31 && in.bits.mantissa == 0;
  731. }
  732. // 0x0001 - 0x03ff
  733. float16bits m_storage;
  734. };
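// Worked example (illustrative annotation; not from the original header). The packing above is
// IEEE 754 half precision (1 sign, 5 exponent, 10 mantissa bits, bias 15), except that the
// converters clamp infinities to +/-65504 and collapse NaNs to zero:
//
//   float16 h;
//   h.SetFloat( 1.0f );                // exponent 127 - 127 + 15 = 15, mantissa 0 -> 0x3C00
//   float f = h.GetFloat();            // exactly 1.0f
//   h.SetFloat( 1e6f );                // clamped to maxfloat16bits = 65504.0f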
  735. class float16_with_assign : public float16
  736. {
  737. public:
  738. float16_with_assign() {}
  739. float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
  740. float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; }
  741. float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
  742. // operator unsigned short () const { return m_storage.rawWord; }
  743. operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
  744. };
  745. //=========================================================
  746. // Fit a 3D vector in 48 bits
  747. //=========================================================
  748. class Vector48
  749. {
  750. public:
  751. // Construction/destruction:
  752. Vector48(void) {}
  753. Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); }
  754. // assignment
  755. Vector48& operator=(const Vector &vOther);
  756. operator Vector ();
  757. const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); }
  758. float16 x;
  759. float16 y;
  760. float16 z;
  761. };
  762. // The uses of isel below are malformed because the first expression is unsigned and thus always >= 0,
  763. // so this whole expression maps to a simple assignment. This was found through a noisy clang
  764. // warning. I am preprocessing this out until it is needed.
  765. #if 0
  766. inline void float16::ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
  767. const float *a, const float *b, const float *c, const float *d )
  768. {
  769. COMPILE_TIME_ASSERT( sizeof(float) == 4 );
  770. // being meant for use on the PPC, this is tuned for that.
  771. // it is mostly branchless, except for the large outer for loop,
  772. // since there's enough instructions inside that unrolling is
  773. // a bad idea. This function is four-at-once to simplify SIMDifying in the
  774. // future should a convenient SIMD way to decimate emerge
  775. // Also, because this is only used for the special case of converting
  776. // float arrays into float16 GPU textures, this turns denorms into zeroes
  777. // and infinities into MAXFLTs, since the shader can't deal with nonfinite
  778. // numbers anyway.
  779. // alias the input floats onto a union giving their mantissa etc
  780. const float32bits * const inFloat[4] = {
  781. reinterpret_cast<const float32bits *>(a),
  782. reinterpret_cast<const float32bits *>(b),
  783. reinterpret_cast<const float32bits *>(c),
  784. reinterpret_cast<const float32bits *>(d) };
  785. const static unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
  786. const static unsigned int SIGNBIT = 0x80000000;
  787. for ( int i = 0 ; i < 4 ; ++i ) // performs better not unrolled (less stack spilling)
  788. {
  789. unsigned int onGPR = inFloat[i]->rawAsInt;
  790. // make a mask for each word; will be all 1's if the float is
  791. // negative, all 0s if it is positive. Can do this just by
  792. // using arithmetic shift to smear out the sign bit.
  793. int isNegative = ((int) onGPR) >> 31;
  794. // clamp to be within -maxfloat16bits, maxfloat16bits
  795. // can't just use isel because IEEE754 floats are sign-magnitude, not two's comp. However,
  796. // positive IEEE754s can be compared as if they were ints. So, we need to do a little extra
  797. // work to test the negative case efficiently.
  798. // clamp to -maxfloat16
  799. #error See above for explanation of why this and other uses of isel in this file are broken.
  800. int clampedNeg = isel( ((int)(onGPR & ~SIGNBIT)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
  801. maxfloat16bitsAsInt | SIGNBIT, // -65504.0f
  802. onGPR );
  803. // clamp to +maxfloat16
  804. int clampedPos = isel( ((int)(onGPR)) - maxfloat16bitsAsInt, // in >= maxfloatbits
  805. maxfloat16bitsAsInt , // -65504.0f
  806. onGPR );
  807. // take advantage of PPC's andc operator to effectively do a masked-move
  808. onGPR = ( clampedNeg & isNegative ) | ( clampedPos & ~isNegative );
  809. // fish out the input exponent and mantissa fields directly (using the union induces an LHS)
  810. int inExponent = (onGPR & 0x7f800000) >> 23;
  811. unsigned int inMantissa = (onGPR & 0x007FFFFF);
  812. int exponent = inExponent - 127 + 15; // rebias the exponent
  813. unsigned int mantissa = isel( exponent, inMantissa >> 13, (unsigned) 0 ); // squash the mantissa to zero if the number is too small to represent (no denorms)
  814. float16bits output;
  815. // saturate the mantissa if rebiased exponent >= 31 (too big to store)
  816. output.bits.mantissa = isel( exponent - 31, (unsigned) 0x3ff, mantissa );
  817. // clamp the exponent to 0..30
  818. output.bits.biased_exponent = isel( exponent, isel( exponent - 31, 30, exponent ), 0 );
  819. output.bits.sign = isNegative; // this doesn't lhs, but instead issues the insrdi op to a word on GPR
  820. pOut[i].m_storage.rawWord = output.rawWord;
  821. }
  822. }
  823. #endif
  824. #ifdef _X360
  825. #define __cntlzw _CountLeadingZeros
  826. #endif
  827. template< bool BRANCHLESS >
  828. inline unsigned short float16::ConvertFloatTo16bitsNonDefault( float input )
  829. {
  830. float16bits output;
  831. float32bits inFloat;
  832. //if ( !BRANCHLESS ) // x86 code
  833. {
  834. if ( input > maxfloat16bits )
  835. input = maxfloat16bits;
  836. else if ( input < -maxfloat16bits )
  837. input = -maxfloat16bits;
  838. inFloat.rawFloat = input;
  839. }
  840. /*
  841. // The use of isel is incorrect because the first expression is unsigned and therefore always passes
  842. // the test.
  843. else // PPC code
  844. {
  845. // force the float onto the stack and then a GPR so we eat the LHS only once.
  846. // you can't just write to one union member and then read back another;
  847. // the compiler is inconsistent about supporting that kind of type-punning.
  848. // (ie, it will work in one file, but not another.)
  849. memcpy(&inFloat.rawFloat, &input, sizeof(inFloat.rawFloat));
  850. // inFloat.rawFloat = input;
  851. // clamp using the GPR
  852. {
  853. const unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
  854. // clamp to be <= maxfloat16bits
  855. uint32 &rawint = inFloat.rawAsInt; // <--- lhs
  856. if ( rawint & 0x80000000 ) // negative
  857. {
  858. // because floats are sign-magnitude, not two's comp, need to
  859. // flip the int positive briefly to do the isel comparison
  860. #error See above for explanation of why this and other uses of isel in this file are broken.
  861. rawint = isel( ((int)(rawint & ~0x80000000)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
  862. maxfloat16bitsAsInt | 0x80000000, // -65504.0f
  863. rawint );
  864. }
  865. else // positive
  866. {
  867. rawint = isel( ((int)(rawint)) - maxfloat16bitsAsInt, // in >= maxfloatbits
  868. maxfloat16bitsAsInt , // -65504.0f
  869. rawint );
  870. }
  871. }
  872. }
  873. */
  874. output.bits.sign = inFloat.bits.sign;
  875. if ( (inFloat.bits.biased_exponent==0) )
  876. {
  877. // zero and denorm both map to zero
  878. output.bits.mantissa = 0;
  879. output.bits.biased_exponent = 0;
  880. }
  881. else if ( inFloat.bits.biased_exponent==0xff )
  882. {
  883. if ( !BRANCHLESS )
  884. {
  885. if ( (inFloat.bits.mantissa==0) )
  886. {
  887. /*
  888. // infinity
  889. output.bits.mantissa = 0;
  890. output.bits.biased_exponent = 31;
  891. */
  892. // infinity maps to maxfloat
  893. output.bits.mantissa = 0x3ff;
  894. output.bits.biased_exponent = 0x1e;
  895. }
  896. else if ( (inFloat.bits.mantissa!=0) )
  897. {
  898. /*
  899. // NaN
  900. output.bits.mantissa = 1;
  901. output.bits.biased_exponent = 31;
  902. */
  903. // NaN maps to zero
  904. output.bits.mantissa = 0;
  905. output.bits.biased_exponent = 0;
  906. }
  907. }
  908. else // branchless; only really meant for PPC because it needs the cntlzw op.
  909. {
  910. // else if ( inFloat.bits.biased_exponent==0xff ) // either infinity (biased_exponent is 0xff) or NaN.
  911. {
  912. #ifdef PLATFORM_PPC
  913. #if defined(__SPU__)
  914. int mantissamask = __builtin_clz( inFloat.bits.mantissa ) - 32; // this is 0 if the input mantissa is zero (infinity), and negative otherwise (NaN)
  915. #else
  916. int mantissamask = __cntlzw( inFloat.bits.mantissa ) - 32; // this is 0 if the input mantissa is zero (infinity), and negative otherwise (NaN)
  917. #endif
  918. #else
  919. int mantissamask = inFloat.bits.mantissa ? -1 : 0;
  920. #endif
  921. output.bits.mantissa = isel( mantissamask, 0x3ff, 0 ); //infinity maps to maxfloat, NaN to zero
  922. output.bits.biased_exponent = isel( mantissamask, 0x1e, 0 );
  923. output.bits.sign = inFloat.bits.sign;
  924. }
  925. }
  926. }
  927. else
  928. {
  929. // regular number
  930. int new_exp = inFloat.bits.biased_exponent-float32bias;
  931. // it's actually better to branch in these cases on PPC,
  932. // because the variable bit shift is such a massive penalty
  933. // that it's worth a branch penalty to avoid it.
  934. if (new_exp<-24)
  935. {
  936. // this maps to 0
  937. output.bits.mantissa = 0;
  938. output.bits.biased_exponent = 0;
  939. }
  940. if (new_exp<-14)
  941. {
  942. // this maps to a denorm
  943. output.bits.biased_exponent = 0;
  944. unsigned int exp_val = ( unsigned int )( -14 - new_exp );
  945. if( exp_val > 0 && exp_val < 11 )
  946. {
  947. output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) );
  948. }
  949. }
  950. else if (new_exp>15)
  951. {
  952. #if 0
  953. // map this value to infinity
  954. output.bits.mantissa = 0;
  955. output.bits.biased_exponent = 31;
  956. #else
  957. // too big . . . maps to maxfloat
  958. output.bits.mantissa = 0x3ff;
  959. output.bits.biased_exponent = 0x1e;
  960. #endif
  961. }
  962. else
  963. {
  964. output.bits.biased_exponent = new_exp+15;
  965. output.bits.mantissa = (inFloat.bits.mantissa >> 13);
  966. }
  967. }
  968. return output.rawWord;
  969. }
  970. inline float float16::Convert16bitFloatTo32bits( unsigned short input )
  971. {
  972. float32bits output;
  973. const float16bits &inFloat = *((float16bits *)&input);
  974. if( IsInfinity( inFloat ) )
  975. {
  976. return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f );
  977. }
  978. if( IsNaN( inFloat ) )
  979. {
  980. return 0.0;
  981. }
  982. if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 )
  983. {
  984. // denorm
  985. const float half_denorm = (1.0f/16384.0f); // 2^-14
  986. float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f;
  987. float sgn = (inFloat.bits.sign)? -1.0f :1.0f;
  988. output.rawFloat = sgn*mantissa*half_denorm;
  989. }
  990. else
  991. {
  992. // regular number
  993. unsigned mantissa = inFloat.bits.mantissa;
  994. unsigned biased_exponent = inFloat.bits.biased_exponent;
  995. unsigned sign = ((unsigned)inFloat.bits.sign) << 31;
  996. biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23;
  997. mantissa <<= (23-10);
  998. *((unsigned *)&output) = ( mantissa | biased_exponent | sign );
  999. }
  1000. return output.rawFloat;
  1001. }
  1002. inline Vector48& Vector48::operator=(const Vector &vOther)
  1003. {
  1004. CHECK_VALID(vOther);
  1005. x.SetFloat( vOther.x );
  1006. y.SetFloat( vOther.y );
  1007. z.SetFloat( vOther.z );
  1008. return *this;
  1009. }
  1010. inline Vector48::operator Vector ()
  1011. {
  1012. Vector tmp;
  1013. tmp.x = x.GetFloat();
  1014. tmp.y = y.GetFloat();
  1015. tmp.z = z.GetFloat();
  1016. return tmp;
  1017. }
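// Usage sketch (illustrative annotation; not from the original header). Vector48 is just three
// half-precision floats, so it inherits float16's behaviour: ~11 significant bits, a maximum
// magnitude of 65504, and an absolute step that grows with magnitude (e.g. 1.0 between 1024
// and 2048):
//
//   Vector48 packed;
//   packed = Vector( 100.0f, 0.25f, -1.5f );   // all three values are exactly representable
//   Vector v = packed;                         // (100.0, 0.25, -1.5)
//   float y = packed[1];                       // 0.25f via operator[] above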
  1018. //=========================================================
  1019. // Fit a 2D vector in 32 bits
  1020. //=========================================================
  1021. class Vector2d32
  1022. {
  1023. public:
  1024. // Construction/destruction:
  1025. Vector2d32(void) {}
  1026. Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); }
  1027. // assignment
  1028. Vector2d32& operator=(const Vector &vOther);
  1029. Vector2d32& operator=(const Vector2D &vOther);
  1030. operator Vector2D ();
  1031. void Init( vec_t ix = 0.f, vec_t iy = 0.f);
  1032. float16_with_assign x;
  1033. float16_with_assign y;
  1034. };
  1035. inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther)
  1036. {
  1037. x.SetFloat( vOther.x );
  1038. y.SetFloat( vOther.y );
  1039. return *this;
  1040. }
  1041. inline Vector2d32::operator Vector2D ()
  1042. {
  1043. Vector2D tmp;
  1044. tmp.x = x.GetFloat();
  1045. tmp.y = y.GetFloat();
  1046. return tmp;
  1047. }
  1048. inline void Vector2d32::Init( vec_t ix, vec_t iy )
  1049. {
  1050. x.SetFloat(ix);
  1051. y.SetFloat(iy);
  1052. }
  1053. //=========================================================
  1054. // FAST SIMD BATCH OPERATIONS
  1055. //=========================================================
  1056. #ifdef _X360
  1057. //// Compressed vector formats: unpack Vector48 and Quaternion48 onto SIMD registers.
  1058. // Only available on 360 for now because SSE1 lacks the necessary operations. SSE2 could
  1059. // do it but we can't count on that yet.
  1060. // If you have many v48's or q48's to stream, please note the functions designed to
  1061. // work on them many at a time.
  1062. extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
  1063. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
  1064. extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
  1065. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
  1066. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
  1067. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
  1068. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
  1069. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
  1070. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
  1071. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
  1072. // unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
  1073. // the w is total garbage.
  1074. FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
  1075. {
  1076. // load the three 16-bit floats into the first 48 bits of ret:
  1077. fltx4 ret = XMLoadVector4((const void *)&pVec->x);
  1078. // shuffle the top 64 bits of ret down to the least significant (the z,w) -- 16 of those bits are garbage.
  1079. ret = __vrlimi( ret, ret, 2 | 1, 2 ); // rotate left by 2 words and insert into z,w components
  1080. // now unpack the 16-bit floats into 32-bit floats. This is a hardware op, woohoo!
  1081. ret = __vupkd3d( ret , VPACK_FLOAT16_4 );
  1082. return ret;
  1083. }
  1084. // unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
  1085. // FIXME!!! If we need a version of this that runs on 360, there is a work-in-progress version that hasn't been debugged lower in the file.
  1086. FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
  1087. {
  1088. // A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
  1089. // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
  1090. // w is inferred from 1 - the dot product of the other three components. the top bit of what would otherwise be the 16-bit z is
  1091. // w's sign bit.
  1092. fltx4 q16s = XMLoadVector3((const void *)pVec);
  1093. fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z.
  1094. fltx4 permute = __lvx(&g_SIMD_Quat48_Unpack_Permute0, 0); // load the permute word that shuffles x,y,z into their own words
  1095. bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
  1096. q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
  1097. q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1098. // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
  1099. const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
  1100. const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
  1101. /*
  1102. fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.
  1103. // scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
  1104. ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
  1105. */
  1106. fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );
  1107. // now, work out what w must be.
  1108. fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
  1109. dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
  1110. fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
  1111. ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
  1112. if (wneg)
  1113. {
  1114. ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
  1115. }
  1116. else
  1117. {
  1118. ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
  1119. }
  1120. return ret;
  1121. }
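// Usage sketch (illustrative annotation; not from the original header) for the single-quaternion
// unpacker above, e.g. while decompressing animation data on 360:
//
//   const Quaternion48 *pPacked = ...;                  // some packed bone rotation
//   fltx4 qSIMD = UnpackQuaternion48SIMD( pPacked );    // x, y, z, w on one register
//   QuaternionAligned q;
//   StoreAlignedSIMD( (float *)&q, qSIMD );             // spill to memory if needed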
  1122. // Many-at-a-time unpackers.
  1123. /// Unpack eight consecutive Vector48's in memory onto eight SIMD registers.
  1124. /// The Vector48 pointer must be 16-byte aligned. Eight Vector48s add up
  1125. /// to 48 bytes long. You should maybe think about prefetching.
  1126. FORCEINLINE void UnpackEightVector48SIMD( fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4,
  1127. fltx4 &out5, fltx4 &out6, fltx4 &out7, fltx4 &out8,
  1128. Vector48 * RESTRICT pVecs )
  1129. {
  1130. AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightVector48SIMD is not 16-byte aligned." );
  1131. // first load the data onto three packed SIMD vectors, which contain eight Vector48s between them.
  1132. // I've named them very explicitly so you can follow the movement of the input data.
  1133. fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
  1134. x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reinterpret_cast<fltx4 *>(pVecs) + 0
  1135. z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reinterpret_cast<fltx4 *>(pVecs) + 1
  1136. y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reinterpret_cast<fltx4 *>(pVecs) + 2
  1137. // Now, start unpacking. The __vupkd3d operation can turn 16-bit floats into 32-bit floats in a single op!
  1138. // It converts the contents of the z and w words of the input fltx4 , so we need to process a word to do
  1139. // one half, then rotate it to do the other half.
  1140. fltx4 y1z1x2y2 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
  1141. x0y0z0x1y1z1x2y2 = __vrlimi( x0y0z0x1y1z1x2y2, x0y0z0x1y1z1x2y2, 0xf, 2 ); // actually y1z1x2y2x0y0z0x1 now. For perf it's important that the first param to vrlimi also be the assignee.
  1142. fltx4 x4y4z4x5 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
  1143. z2x3y3z3x4y4z4x5 = __vrlimi( z2x3y3z3x4y4z4x5, z2x3y3z3x4y4z4x5, 0xf, 2 );
  1144. fltx4 z6x7y7z7 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
  1145. y5z5x6y6z6x7y7z7 = __vrlimi( y5z5x6y6z6x7y7z7, y5z5x6y6z6x7y7z7, 0xf, 2 );
  1146. fltx4 x0y0z0x1 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
  1147. fltx4 z2x3y3z3 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
  1148. fltx4 y5z5x6y6 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
  1149. // permute to populate the out-registers with part of their vectors:
  1150. out1 = x0y0z0x1; // DONE
  1151. out2 = __vpermwi( y1z1x2y2, VPERMWI_CONST(0, 0, 1, 0) ); // __y1z1__
  1152. out3 = __vpermwi( y1z1x2y2, VPERMWI_CONST(2, 3, 0, 0) ); // x2y2____
  1153. out4 = __vpermwi( z2x3y3z3, VPERMWI_CONST(1, 2, 3, 0) ); // x3y3z3__ // DONE
  1154. out5 = x4y4z4x5; // DONE
  1155. out6 = __vpermwi( y5z5x6y6, VPERMWI_CONST(0, 0, 1, 0) ); // __y5z5__
  1156. out7 = __vpermwi( y5z5x6y6, VPERMWI_CONST(2, 3, 0, 0) ); // x6y6____
  1157. out8 = __vpermwi( z6x7y7z7, VPERMWI_CONST(1, 2, 3, 0) ); // x7y7z7__ // DONE
  1158. // there are four more to finish, which we do with a masked insert
  1159. out2 = __vrlimi( out2, x0y0z0x1, 8, 3 ); // x1y1z1__
1160. out3 = __vrlimi( out3, z2x3y3z3, 2, 2 ); // x2y2z2__
  1161. out6 = __vrlimi( out6, x4y4z4x5, 8, 3 ); // x5y5z5__
  1162. out7 = __vrlimi( out7, z6x7y7z7, 2, 2 ); // x6y6z6__
  1163. // and we're done!
  1164. }
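// A minimal scalar sketch (not part of the original interface) of the 16-bit to 32-bit float
// conversion that each __vupkd3d above performs per component, assuming the components are
// plain IEEE 754 binary16 values (1 sign, 5 exponent, 10 mantissa bits); the hardware path
// may treat denormals and NaNs differently.
inline float Float16To32_Sketch( unsigned short h )
{
	unsigned int sign = ( h >> 15 ) & 0x1;
	unsigned int exp = ( h >> 10 ) & 0x1F;
	unsigned int mant = h & 0x3FF;
	if ( exp == 0 )
	{
		float f = mant * ( 1.0f / 16777216.0f ); // zero or denormal: mant * 2^-24
		return sign ? -f : f;
	}
	union { unsigned int u; float f; } pun;
	if ( exp == 31 )
		pun.u = ( sign << 31 ) | 0x7F800000 | ( mant << 13 ); // infinity / NaN
	else
		pun.u = ( sign << 31 ) | ( ( exp + 112 ) << 23 ) | ( mant << 13 ); // rebias exponent 15 -> 127, widen mantissa 10 -> 23 bits
	return pun.f;
}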
1165. /// Unpack eight consecutive Quaternion48s in memory onto eight SIMD registers.
1166. /// The Quaternion48 pointer must be 16-byte aligned. Eight Quaternion48s add up
1167. /// to 48 bytes. You should maybe think about prefetching.
  1168. //
  1169. // This could be improved with verticalization, so that the W sqrts happen
  1170. // on two rather than eight vectors, and then transposing. This would make
1171. // the initial permutation even more complicated.
  1172. FORCEINLINE void UnpackEightQuaternion48SIMD( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3,
  1173. fltx4 &out4, fltx4 &out5, fltx4 &out6, fltx4 &out7,
  1174. Quaternion48 * RESTRICT pVecs )
  1175. {
  1176. AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightQuaternion48SIMD is not 16-byte aligned." );
1177. // after the permutes below, each word of the unpacked quats contains 3.0 + n * 2^-22 -- convert this so that we get numbers in the range -1..1
  1178. const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
  1179. const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
  1180. const fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z left by one bit.
  1181. // first load the data onto three packed SIMD vectors, which contain eight Quaternion48s between them.
  1182. // I've named them very explicitly so you can follow the movement of the input data.
  1183. fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
1184. x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reinterpret_cast<fltx4 *>(pVecs) + 0
1185. z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reinterpret_cast<fltx4 *>(pVecs) + 1
1186. y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reinterpret_cast<fltx4 *>(pVecs) + 2
  1187. // shove each quat onto its own fltx4, by using the permute operation
  1188. // each halfword argument goes into the bottom 16 bits of the floating
  1189. // point rep of 3.0f, then we use a magic constant to scale them.
  1190. out0 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute0) ); // __x0__y0__z0____
  1191. out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
  1192. // postpone 2 since it straddles two words, we'll get back to it
  1193. out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2
  1194. out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5
  1195. // 5 straddles two words
  1196. out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
  1197. out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
  1198. // now get back to the straddlers, which we make by blending together a prior output and the other source word
  1199. out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____
  1200. out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute5) ); // __x5__y5__z5____
  1201. // the top bit of the z component in each word isn't part of the number; it's
  1202. // a flag indicating whether the eventual w component should be negative.
  1203. // so, we need to move the 0x00008000 bit of the z word onto the top bit
  1204. // of the w word, which is a rotation two bytes right, or 14 bytes left.
  1205. fltx4 wneg[8];
1206. // for each quat: rotate the wneg bit into place, then shift the z halfwords left one bit (toss the wneg sign bit, multiply z by two)
  1207. wneg[0] = __vsldoi( out0, out0, 14 );
  1208. out0 = __vslh(out0, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1209. wneg[1] = __vsldoi( out1, out1, 14 );
  1210. out1 = __vslh(out1, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1211. wneg[2] = __vsldoi( out2, out2, 14 );
  1212. out2 = __vslh(out2, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1213. wneg[3] = __vsldoi( out3, out3, 14 );
  1214. out3 = __vslh(out3, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1215. wneg[4] = __vsldoi( out4, out4, 14 );
  1216. out4 = __vslh(out4, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1217. wneg[5] = __vsldoi( out5, out5, 14 );
  1218. out5 = __vslh(out5, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1219. wneg[6] = __vsldoi( out6, out6, 14 );
  1220. out6 = __vslh(out6, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1221. wneg[7] = __vsldoi( out7, out7, 14 );
  1222. out7 = __vslh(out7, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1223. // create a mask that is just the sign bit of the w word.
1224. fltx4 vAllOneBits = __vspltisw(-1); // splat -1 into every word: all bits set, and the low five bits double as a shift count of 31
1225. fltx4 signMask = __vslw(vAllOneBits, vAllOneBits); // shift each word left by 31 so only the sign bits remain
  1226. signMask = __vrlimi( signMask, Four_Zeros, 14, 0 ); // zero out x,y,z words
  1227. // this macro defines the operations that will be performed on each of the eight words:
  1228. // * scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
  1229. // * take the xyz dot product to get 1 - w^2
  1230. // * subtract from one to get w^2
1231. // * square root to get w
1232. // * OR in the wneg sign mask to give w its sign.
  1233. // though the macro makes it look like these are being done in serial,
  1234. // in fact the compiler will reorder them to minimize stalls.
  1235. fltx4 ONE = Four_Ones;
  1236. fltx4 dotxyz[8];
  1237. fltx4 ww[8];
  1238. // out0 = __vmaddfp( out0, vUpkMul, vUpkAdd );
  1239. // dotxyz[0] = Dot3SIMD( out0, out0 );
1240. // clamp dotxyz if it's more than 1.0
  1241. // all components are 1 - dotxyz
  1242. // clear all but w's sign bit in wneg
  1243. // all components are sqrt(1-dotxyz)
  1244. // toggle w's sign where necessary
  1245. // insert one element from the ww vector into the w component of ret
  1246. #define COMPUTE( target, number ) \
  1247. target ## number = __vmaddfp( target ## number, vUpkMul, vUpkAdd ); \
  1248. dotxyz[number] = Dot3SIMD( target ## number, target ## number ); \
  1249. dotxyz[number] = __vminfp( dotxyz[number], ONE ); \
  1250. ww[number] = SubSIMD( ONE, dotxyz[number] ); \
  1251. wneg[number] = AndSIMD( wneg[number], signMask ) ; \
  1252. ww[number] = SqrtSIMD(ww[number]); \
  1253. ww[number] = OrSIMD( ww[number], wneg[number] ); \
  1254. target ## number = __vrlimi( target ## number, ww[number], 1, 0 );
  1255. COMPUTE(out, 0);
  1256. COMPUTE(out, 1);
  1257. COMPUTE(out, 2);
  1258. COMPUTE(out, 3);
  1259. COMPUTE(out, 4);
  1260. COMPUTE(out, 5);
  1261. COMPUTE(out, 6);
  1262. COMPUTE(out, 7);
  1263. #undef COMPUTE
  1264. }
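// A minimal scalar sketch (not part of the original interface) of the "3.0f magic constant"
// trick the unpackers above rely on: dropping a 16-bit value n into the low mantissa bits of
// 3.0f yields the float 3.0 + n * 2^-22, so one multiply-add maps 0..65535 onto roughly -1..1.
// The multiplier and addend below are derived from that identity and are assumed to match
// what g_SIMD_Quat48_Unpack_Magic_Constants holds.
inline float UnpackU16ViaMagic_Sketch( unsigned short n )
{
	union { unsigned int u; float f; } pun;
	pun.u = 0x40400000 | n; // bit pattern of 3.0f with n in the low 16 mantissa bits
	const float mul = 128.0f; // 2^22 / 32768
	const float add = -385.0f; // -(3.0 * 128 + 1)
	return pun.f * mul + add; // == ((int)n - 32768) * (1 / 32768.0)
}
// Only 15 bits are stored for z; the vslh shifts above double it first, so the same mapping
// yields (2*z - 32768) / 32768, i.e. roughly (z - 16384) / 16384, up to quantization.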
  1265. #elif defined(_PS3)
  1266. // unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
  1267. // the w is total garbage.
  1268. FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
  1269. {
  1270. // PS3 libs just give us this
  1271. Vectormath::Aos::Vector3 ret;
  1272. Vectormath::Aos::loadHalfFloats( ret, reinterpret_cast<const uint16_t *>(&pVec->x) );
  1273. return ret.get128();
  1274. }
  1275. extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
  1276. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
  1277. extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
  1278. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
  1279. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
  1280. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
  1281. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
  1282. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
  1283. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
  1284. extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
  1285. // unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
  1286. FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
  1287. {
1288. // A Quaternion48 stores the x and y components as 0..65535, which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5.
1289. // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5.
1290. // w is inferred from 1 minus the dot product of the other three components with themselves. The top bit of what
1291. // would otherwise be the 16-bit z is w's sign bit.
  1292. fltx4 q16s = LoadUnaligned3SIMD((const void *)pVec);
  1293. #if defined(__SPU__)
  1294. vec_ushort8 shift = vec_ld( 0, (short unsigned int *)g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
  1295. vec_uchar16 permute = vec_ld(0, (unsigned char *)g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
  1296. #else
  1297. vec_ushort8 shift = vec_ld( 0, g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
  1298. vec_uchar16 permute = vec_ld(0, g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
  1299. #endif
  1300. bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
  1301. q16s = vec_perm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
  1302. #if defined(__SPU__)
  1303. q16s = (fltx4) vec_sl( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1304. #else
  1305. q16s = (fltx4) vec_vslh( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  1306. #endif
1307. // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers in the range -1..1
  1308. const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
  1309. const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
  1310. fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );
  1311. // now, work out what w must be.
  1312. fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
  1313. dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
  1314. fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
  1315. ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
  1316. // insert one element from the ww vector into the w component of ret
  1317. ret = MaskedAssign( LoadAlignedSIMD(g_SIMD_ComponentMask[3]), wneg ? NegSIMD(ww) : ww, ret );
  1318. return ret;
  1319. }
  1320. #endif
  1321. #if defined( _X360 )
  1322. #pragma bitfield_order( pop )
  1323. #elif defined( _PS3 )
  1324. #pragma ms_struct off
  1325. #pragma reverse_bitfields off
  1326. #endif
  1327. #endif