Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1264 lines
39 KiB

  1. //===== Copyright � 1996-2005, Valve Corporation, All rights reserved. ======//
  2. //
  3. // Purpose: - defines SIMD "structure of arrays" classes and functions.
  4. //
  5. //===========================================================================//
  6. #ifndef SSEQUATMATH_H
  7. #define SSEQUATMATH_H
  8. #ifdef _WIN32
  9. #pragma once
  10. #endif
  11. #include "mathlib/ssemath.h"
  12. // Use this #define to allow SSE versions of Quaternion math
  13. // to exist on PC.
  14. // On PC, certain horizontal vector operations are not supported.
  15. // This causes the SSE implementation of quaternion math to mix the
  16. // vector and scalar floating point units, which is extremely
  17. // performance negative if you don't compile to native SSE2 (which
  18. // we don't as of Sept 1, 2007). So, it's best not to allow these
  19. // functions to exist at all. It's not good enough to simply replace
  20. // the contents of the functions with scalar math, because each call
// to LoadAligned and StoreAligned will result in an unnecessary copy
  22. // of the quaternion, and several moves to and from the XMM registers.
  23. //
  24. // Basically, the problem you run into is that for efficient SIMD code,
  25. // you need to load the quaternions and vectors into SIMD registers and
  26. // keep them there as long as possible while doing only SIMD math,
  27. // whereas for efficient scalar code, each time you copy onto or ever
  28. // use a fltx4, it hoses your pipeline. So the difference has to be
  29. // in the management of temporary variables in the calling function,
  30. // not inside the math functions.
  31. //
  32. // If you compile assuming the presence of SSE2, the MSVC will abandon
  33. // the traditional x87 FPU operations altogether and make everything use
  34. // the SSE2 registers, which lessens this problem a little.
  35. // permitted only on 360, as we've done careful tuning on its Altivec math.
  36. // FourQuaternions, however, are always allowed, because vertical ops are
  37. // fine on SSE.
  38. #ifdef PLATFORM_PPC
  39. #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC!
  40. #endif
  41. //---------------------------------------------------------------------
  42. // Load/store quaternions
  43. //---------------------------------------------------------------------
  44. #ifndef _X360
  45. // Using STDC or SSE
  46. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
  47. {
  48. fltx4 retval = LoadAlignedSIMD( pSIMD.Base() );
  49. return retval;
  50. }
  51. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
  52. {
  53. fltx4 retval = LoadAlignedSIMD( pSIMD->Base() );
  54. return retval;
  55. }
// Store a SIMD register out to a 16-byte-aligned quaternion in memory.
FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
{
	StoreAlignedSIMD( pSIMD->Base(), a );
}
  60. #else
  61. // for the transitional class -- load a QuaternionAligned
// X360: load an aligned quaternion via the VMX aligned-load intrinsic.
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
{
	fltx4 retval = XMLoadVector4A( pSIMD.Base() );
	return retval;
}
// X360: load an aligned quaternion (by pointer) via the VMX aligned-load intrinsic.
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
{
	// NOTE(review): pSIMD is passed directly rather than pSIMD->Base() as in the
	// other overload -- presumably fine because XMLoadVector4A takes a raw
	// pointer, but worth confirming the two spellings are equivalent here.
	fltx4 retval = XMLoadVector4A( pSIMD );
	return retval;
}
// X360: store a SIMD register out to an aligned quaternion via the VMX aligned-store intrinsic.
FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
{
	XMStoreVector4A( pSIMD->Base(), a );
}
  76. // From a RadianEuler packed onto a fltx4, to a quaternion
  77. fltx4 AngleQuaternionSIMD( FLTX4 vAngles );
  78. #endif
  79. #if ALLOW_SIMD_QUATERNION_MATH
  80. //---------------------------------------------------------------------
  81. // Make sure quaternions are within 180 degrees of one another, if not, reverse q
  82. //---------------------------------------------------------------------
  83. FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
  84. {
  85. // decide if one of the quaternions is backwards
  86. fltx4 a = SubSIMD( p, q );
  87. fltx4 b = AddSIMD( p, q );
  88. a = Dot4SIMD( a, a );
  89. b = Dot4SIMD( b, b );
  90. fltx4 cmp = (fltx4) CmpGtSIMD( a, b );
  91. fltx4 result = MaskedAssign( cmp, NegSIMD(q), q );
  92. return result;
  93. }
  94. //---------------------------------------------------------------------
  95. // Normalize Quaternion
  96. //---------------------------------------------------------------------
  97. #if USE_STDC_FOR_SIMD
  98. FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
  99. {
  100. fltx4 radius, result;
  101. radius = Dot4SIMD( q, q );
  102. if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
  103. {
  104. float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) );
  105. result = ReplicateX4( iradius );
  106. result = MulSIMD( result, q );
  107. return result;
  108. }
  109. return q;
  110. }
  111. #else
  112. // SSE + X360 implementation
  113. FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
  114. {
  115. fltx4 radius, result, mask;
  116. radius = Dot4SIMD( q, q );
  117. mask = (fltx4) CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
  118. result = ReciprocalSqrtSIMD( radius );
  119. result = MulSIMD( result, q );
  120. return MaskedAssign( mask, q, result ); // if radius was 0, just return q
  121. }
  122. #endif
  123. //---------------------------------------------------------------------
  124. // 0.0 returns p, 1.0 return q.
  125. //---------------------------------------------------------------------
  126. FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
  127. {
  128. fltx4 sclp, sclq, result;
  129. sclq = ReplicateX4( t );
  130. sclp = SubSIMD( Four_Ones, sclq );
  131. result = MulSIMD( sclp, p );
  132. result = MaddSIMD( sclq, q, result );
  133. return QuaternionNormalizeSIMD( result );
  134. }
  135. //---------------------------------------------------------------------
  136. // Blend Quaternions
  137. //---------------------------------------------------------------------
  138. FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
  139. {
  140. // decide if one of the quaternions is backwards
  141. fltx4 q2, result;
  142. q2 = QuaternionAlignSIMD( p, q );
  143. result = QuaternionBlendNoAlignSIMD( p, q2, t );
  144. return result;
  145. }
  146. //---------------------------------------------------------------------
  147. // Multiply Quaternions
  148. //---------------------------------------------------------------------
  149. #ifndef _X360
  150. // SSE and STDC
// Quaternion product p * q2, computed per-component on the scalar FPU
// (component order in the register is x, y, z, w -- lanes 0..3).
// q is first aligned to p's hemisphere so the product takes the shorter arc.
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	// decide if one of the quaternions is backwards
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	// x = px*qw + py*qz - pz*qy + pw*qx
	SubFloat( result, 0 ) = SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 );
	// y = -px*qz + py*qw + pz*qx + pw*qy
	SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 );
	// z = px*qy - py*qx + pz*qw + pw*qz
	SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 );
	// w = -px*qx - py*qy - pz*qz + pw*qw
	SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 );
	return result;
}
  162. #else
  163. // X360
  164. extern const fltx4 g_QuatMultRowSign[4];
// X360: quaternion product p * q2 built row-by-row. Each output lane is a
// 4-way dot product of p against a swizzled, sign-flipped copy of q2
// (sign patterns come from g_QuatMultRowSign); __vrlimi merges each dot
// product result into the destination one lane at a time.
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	fltx4 q2, row, result;
	// flip q if needed so we multiply along the shorter arc
	q2 = QuaternionAlignSIMD( p, q );
	// lane 0 (x): dot(p, swizzle(q2, w z y x) * rowsign0)
	row = XMVectorSwizzle( q2, 3, 2, 1, 0 );
	row = MulSIMD( row, g_QuatMultRowSign[0] );
	result = Dot4SIMD( row, p );
	// lane 1 (y): dot(p, swizzle(q2, z w x y) * rowsign1)
	row = XMVectorSwizzle( q2, 2, 3, 0, 1 );
	row = MulSIMD( row, g_QuatMultRowSign[1] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 4, 0 );
	// lane 2 (z): dot(p, swizzle(q2, y x w z) * rowsign2)
	row = XMVectorSwizzle( q2, 1, 0, 3, 2 );
	row = MulSIMD( row, g_QuatMultRowSign[2] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 2, 0 );
	// lane 3 (w): dot(p, q2 * rowsign3) -- no swizzle needed
	row = MulSIMD( q2, g_QuatMultRowSign[3] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 1, 0 );
	return result;
}
  185. #endif
  186. //---------------------------------------------------------------------
  187. // Quaternion scale
  188. //---------------------------------------------------------------------
  189. #ifdef _X360
  190. // X360
// X360: scale a quaternion's rotation angle by t. For q = (sin(w)*N, cos(w)),
// produces a quaternion of angle t*w about the same axis N.
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	// sin of the half-angle = length of the vector part, clamped to [0,1]
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	// sin of the scaled half-angle: sin(asin(sinom) * t)
	fltx4 sinsom = ArcSinSIMD( sinom );
	fltx4 t4 = ReplicateX4( t );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	// rescale the vector part by sinsom/sinom; epsilon guards divide-by-zero
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	t4 = MulSIMD( sinsom, sinom );
	fltx4 result = MulSIMD( p, t4 );
	// rescale rotation: w = sqrt(1 - sin^2), clamped against negative round-off
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );
	// keep sign of rotation
	fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
	r = MaskedAssign( cmp, r, NegSIMD( r ) );
	// merge just the w lane of r into result
	result = __vrlimi(result, r, 1, 0);
	return result;
}
  215. // X360
  216. // assumes t4 contains a float replicated to each slot
// X360: same as the float overload above, but the caller supplies the scale
// already replicated across all four lanes of t4.
// assumes t4 contains a float replicated to each slot
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, const fltx4 &t4 )
{
	// sin of the half-angle = length of the vector part, clamped to [0,1]
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	// sin of the scaled half-angle
	fltx4 sinsom = ArcSinSIMD( sinom );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	// rescale the vector part by sinsom/sinom; epsilon guards divide-by-zero
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	fltx4 result = MulSIMD( p, MulSIMD( sinsom, sinom ) );
	// rescale rotation: w = sqrt(1 - sin^2), clamped against negative round-off
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );
	// keep sign of rotation
	fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
	r = MaskedAssign( cmp, r, NegSIMD( r ) );
	// merge just the w lane of r into result
	result = __vrlimi(result, r, 1, 0);
	return result;
}
  239. #elif defined(_PS3)
// PS3
// PS3: scale a quaternion's rotation angle by t (same algorithm as the X360
// version, but the w lane is merged with a component mask instead of __vrlimi).
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	// sin of the half-angle = length of the vector part, clamped to [0,1]
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	// sin of the scaled half-angle: sin(asin(sinom) * t)
	fltx4 sinsom = ArcSinSIMD( sinom );
	fltx4 t4 = ReplicateX4( t );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	// rescale the vector part by sinsom/sinom; epsilon guards divide-by-zero
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	t4 = MulSIMD( sinsom, sinom );
	fltx4 result = MulSIMD( p, t4 );
	// rescale rotation: w = sqrt(1 - sin^2), clamped against negative round-off
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );
	// keep sign of rotation
	r = MaskedAssign( CmpGeSIMD( p, Four_Zeros ), r, NegSIMD( r ) );
	// set just the w component of result
	result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), r, result );
	return result;
}
// PS3
  266. // assumes t4 contains a float replicated to each slot
// PS3: same as the float overload above, but the caller supplies the scale
// already replicated across all four lanes of t4.
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, const fltx4 &t4 )
{
	// sin of the half-angle = length of the vector part, clamped to [0,1]
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	// sin of the scaled half-angle
	fltx4 sinsom = ArcSinSIMD( sinom );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	// rescale the vector part by sinsom/sinom; epsilon guards divide-by-zero
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	fltx4 result = MulSIMD( p, MulSIMD( sinsom, sinom ) );
	// rescale rotation: w = sqrt(1 - sin^2), clamped against negative round-off
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );
	// keep sign of rotation
	r = MaskedAssign( CmpGeSIMD( p, Four_Zeros ), r, NegSIMD( r ) );
	// set just the w component of result
	result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), r, result );
	return result;
}
  289. #else
  290. // SSE and STDC
  291. FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
  292. {
  293. float r;
  294. fltx4 q;
  295. // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
  296. // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
  297. float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );
  298. sinom = fmin( sinom, 1.f );
  299. float sinsom = sin( asin( sinom ) * t );
  300. t = sinsom / (sinom + FLT_EPSILON);
  301. SubFloat( q, 0 ) = t * SubFloat( p, 0 );
  302. SubFloat( q, 1 ) = t * SubFloat( p, 1 );
  303. SubFloat( q, 2 ) = t * SubFloat( p, 2 );
  304. // rescale rotation
  305. r = 1.0f - sinsom * sinsom;
  306. // Assert( r >= 0 );
  307. if (r < 0.0f)
  308. r = 0.0f;
  309. r = sqrt( r );
  310. // keep sign of rotation
  311. SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
  312. return q;
  313. }
  314. #endif
  315. //-----------------------------------------------------------------------------
// Quaternion spherical linear interpolation
  317. //-----------------------------------------------------------------------------
  318. #ifndef _X360
  319. // SSE and STDC
// SSE and STDC: spherical linear interpolation between p and q on the scalar
// FPU, without hemisphere alignment. t = 0.0 returns p, t = 1.0 returns q.
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	float omega, cosom, sinom, sclp, sclq;
	fltx4 result;

	// 0.0 returns p, 1.0 return q.
	// cosine of the angle between the two quaternions (4D dot product)
	cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) +
		SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );

	if ( (1.0f + cosom ) > 0.000001f )
	{
		// quaternions are not opposed
		if ( (1.0f - cosom ) > 0.000001f )
		{
			// general case: true spherical interpolation weights
			omega = acos( cosom );
			sinom = sin( omega );
			sclp = sin( (1.0f - t)*omega) / sinom;
			sclq = sin( t*omega ) / sinom;
		}
		else
		{
			// quaternions are nearly identical: fall back to linear weights
			// TODO: add short circuit for cosom == 1.0f?
			sclp = 1.0f - t;
			sclq = t;
		}
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 );
		SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 );
	}
	else
	{
		// quaternions are (nearly) opposite: interpolate toward a quaternion
		// perpendicular to q, using sin weights over a quarter circle
		SubFloat( result, 0 ) = -SubFloat( q, 1 );
		SubFloat( result, 1 ) = SubFloat( q, 0 );
		SubFloat( result, 2 ) = -SubFloat( q, 3 );
		SubFloat( result, 3 ) = SubFloat( q, 2 );
		sclp = sin( (1.0f - t) * (0.5f * M_PI));
		sclq = sin( t * (0.5f * M_PI));
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 );
		// NOTE(review): the w lane keeps the perpendicular value assigned above;
		// it is not blended like x/y/z -- matches the original scalar code.
	}

	return result;
}
  361. #else
  362. // X360
// X360: defer to XNA math's slerp implementation.
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	return XMQuaternionSlerp( p, q, t );
}
  367. #endif
  368. FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
  369. {
  370. fltx4 q2, result;
  371. q2 = QuaternionAlignSIMD( p, q );
  372. result = QuaternionSlerpNoAlignSIMD( p, q2, t );
  373. return result;
  374. }
  375. #endif // ALLOW_SIMD_QUATERNION_MATH
/// class FourQuaternions stores 4 independent quaternions for use in SIMD processing. They are
/// stored in the format x x x x y y y y z z z z w w w w so that they can be efficiently SIMD-accelerated.
/// class FourQuaternions stores 4 independent quaternions in "structure of
/// arrays" format -- x x x x  y y y y  z z z z  w w w w, one fltx4 per
/// component -- so that operations on all four quaternions can be performed
/// with vertical SIMD math.
class ALIGN16 FourQuaternions
{
public:
	fltx4 x,y,z,w;	// each register holds one component of all four quaternions

	// default ctor leaves the components uninitialized
	FourQuaternions(void)
	{
	}

	FourQuaternions( const fltx4 &_x,
		const fltx4 &_y,
		const fltx4 &_z,
		const fltx4 &_w )
		: x(_x), y(_y), z(_z), w(_w)
	{}

#if !defined(__SPU__)
	// four rotations around the same axis. angles should be in radians.
	FourQuaternions ( const fltx4 &axis,
		const float &angle0, const float &angle1, const float &angle2, const float &angle3)
	{
		FromAxisAndAngles( axis, angle0, angle1, angle2, angle3 );
	}
#endif

	FourQuaternions( FourQuaternions const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
		w=src.w;
	}

	FORCEINLINE void operator=( FourQuaternions const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
		w=src.w;
	}

	/// this = this * q;
	FORCEINLINE FourQuaternions Mul( FourQuaternions const &q ) const;

	/// negate the vector part
	FORCEINLINE FourQuaternions Conjugate() const;

	/// for a quaternion representing a rotation of angle theta, return
	/// one of angle s*theta
	/// scale is four floats -- one for each quat
	FORCEINLINE FourQuaternions ScaleAngle( const fltx4 &scale ) const;

	/// ret = this * ( s * q )
	/// In other words, for a quaternion representing a rotation of angle theta, return
	/// one of angle s*theta
	/// s is four floats in a fltx4 -- one for each quaternion
	FORCEINLINE FourQuaternions MulAc( const fltx4 &s, const FourQuaternions &q ) const;

	/// ret = ( s * this ) * q
	FORCEINLINE FourQuaternions ScaleMul( const fltx4 &s, const FourQuaternions &q ) const;

	/// Slerp four quaternions at once, FROM me TO the specified out.
	FORCEINLINE FourQuaternions Slerp( const FourQuaternions &to, const fltx4 &t );

	FORCEINLINE FourQuaternions SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t );

#if !defined(__SPU__)
	/// given an axis and four angles, populate this quaternion with the equivalent rotations
	/// (ie, make these four quaternions represent four different rotations around the same axis)
	/// angles should be in RADIANS
	FORCEINLINE FourQuaternions &FromAxisAndAngles( const fltx4 &axis,
		const float &angle0, const float &angle1, const float &angle2, const float &angle3 );
	FORCEINLINE FourQuaternions &FromAxisAndAngles( const fltx4 &axis, const fltx4 &angles );

	// one convenience imp if you're doing this in degrees
	FORCEINLINE FourQuaternions &FromAxisAndAnglesInDegrees( const fltx4 &axis, const fltx4 &angles )
	{
		return FromAxisAndAngles( axis, MulSIMD(angles, Four_DegToRad));
	}
#endif

	// rotate (in place) a FourVectors by this quaternion. there's a corresponding RotateBy in FourVectors.
	FORCEINLINE void RotateFourVectors( FourVectors * RESTRICT vecs ) const RESTRICT ;

	/// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op.
	/// all 4 vectors must be 128 bit boundary
	FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
	{
#if defined( _X360 )
		// load the four quats, then 4x4-transpose them with VMX merge-high /
		// merge-low ops so each member register ends up holding one component
		fltx4 tx = LoadAlignedSIMD(a);
		fltx4 ty = LoadAlignedSIMD(b);
		fltx4 tz = LoadAlignedSIMD(c);
		fltx4 tw = LoadAlignedSIMD(d);
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);
		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
		w = __vmrglw(r2, r3);
#else
		x = LoadAlignedSIMD(a);
		y = LoadAlignedSIMD(b);
		z = LoadAlignedSIMD(c);
		w = LoadAlignedSIMD(d);
		// now, matrix is:
		// x y z w
		// x y z w
		// x y z w
		// x y z w
		TransposeSIMD(x, y, z, w);
#endif
	}

	FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned * RESTRICT a,
		const QuaternionAligned * RESTRICT b,
		const QuaternionAligned * RESTRICT c,
		const QuaternionAligned * RESTRICT d)
	{
		LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base() );
	}

	/// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions,
	/// performing transpose op.
	/// all 4 vectors must be 128 bit boundary
	FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned *qs)
	{
#if defined( _X360 )
		// load the four consecutive quats, then 4x4-transpose with merge ops
		fltx4 tx = LoadAlignedSIMD(qs++);
		fltx4 ty = LoadAlignedSIMD(qs++);
		fltx4 tz = LoadAlignedSIMD(qs++);
		fltx4 tw = LoadAlignedSIMD(qs);
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);
		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
		w = __vmrglw(r2, r3);
#else
		x = LoadAlignedSIMD(qs++);
		y = LoadAlignedSIMD(qs++);
		z = LoadAlignedSIMD(qs++);
		w = LoadAlignedSIMD(qs++);
		// now, matrix is:
		// x y z w
		// x y z w
		// x y z w
		// x y z w
		TransposeSIMD(x, y, z, w);
#endif
	}

	// Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory.
	FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *a, QuaternionAligned *b, QuaternionAligned *c, QuaternionAligned *d)
	{
#if defined( _X360 )
		// transpose back from structure-of-arrays to four xyzw quats, then store
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);
		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);
		StoreAlignedSIMD(a, rx);
		StoreAlignedSIMD(b, ry);
		StoreAlignedSIMD(c, rz);
		StoreAlignedSIMD(d, rw);
#else
		// transpose copies so the members themselves are left untouched
		fltx4 dupes[4] = { x, y, z, w };
		TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]);
		StoreAlignedSIMD(a, dupes[0]);
		StoreAlignedSIMD(b, dupes[1]);
		StoreAlignedSIMD(c, dupes[2]);
		StoreAlignedSIMD(d, dupes[3]);
#endif
	}

	// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.
	FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *qs)
	{
#if defined( _X360 )
		// transpose back from structure-of-arrays to four xyzw quats, then store
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);
		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);
		StoreAlignedSIMD(qs, rx);
		StoreAlignedSIMD(++qs, ry);
		StoreAlignedSIMD(++qs, rz);
		StoreAlignedSIMD(++qs, rw);
#else
		SwizzleAndStoreAligned(qs, qs+1, qs+2, qs+3);
#endif
	}

	// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.
	// The mask specifies which of the quaternions are actually written out -- each
	// word in the fltx4 should be all binary ones or zeros. Ones means the corresponding
	// quat will be written.
	FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned * RESTRICT qs, const bi32x4 &controlMask)
	{
		// read back what's already in memory so masked-out quats are preserved
		fltx4 originals[4];
		originals[0] = LoadAlignedSIMD(qs);
		originals[1] = LoadAlignedSIMD(qs+1);
		originals[2] = LoadAlignedSIMD(qs+2);
		originals[3] = LoadAlignedSIMD(qs+3);

		// one write-enable mask per quaternion, splatted from the control lanes
		bi32x4 masks[4] = { SplatXSIMD(controlMask),
			SplatYSIMD(controlMask),
			SplatZSIMD(controlMask),
			SplatWSIMD(controlMask) };

#if defined( _X360 )
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);
		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);
#else
		fltx4 rx = x;
		fltx4 ry = y;
		fltx4 rz = z;
		fltx4 rw = w;
		TransposeSIMD( rx, ry, rz, rw );
#endif

		// blend new data with the originals lane-wise, then store
		StoreAlignedSIMD( qs+0, MaskedAssign(masks[0], rx, originals[0]));
		StoreAlignedSIMD( qs+1, MaskedAssign(masks[1], ry, originals[1]));
		StoreAlignedSIMD( qs+2, MaskedAssign(masks[2], rz, originals[2]));
		StoreAlignedSIMD( qs+3, MaskedAssign(masks[3], rw, originals[3]));
	}
};
  596. FORCEINLINE FourQuaternions FourQuaternions::Conjugate( ) const
  597. {
  598. return FourQuaternions( NegSIMD(x), NegSIMD(y), NegSIMD(z), w );
  599. }
  600. FORCEINLINE const fltx4 Dot(const FourQuaternions &a, const FourQuaternions &b)
  601. {
  602. return
  603. MaddSIMD(a.x, b.x,
  604. MaddSIMD(a.y, b.y,
  605. MaddSIMD(a.z,b.z, MulSIMD(a.w,b.w))
  606. )
  607. );
  608. }
  609. FORCEINLINE const FourQuaternions Madd(const FourQuaternions &a, const fltx4 &scale, const FourQuaternions &c)
  610. {
  611. FourQuaternions ret;
  612. ret.x = MaddSIMD(a.x,scale,c.x);
  613. ret.y = MaddSIMD(a.y,scale,c.y);
  614. ret.z = MaddSIMD(a.z,scale,c.z);
  615. ret.w = MaddSIMD(a.w,scale,c.w);
  616. return ret;
  617. }
  618. FORCEINLINE const FourQuaternions Mul(const FourQuaternions &a, const fltx4 &scale)
  619. {
  620. FourQuaternions ret;
  621. ret.x = MulSIMD(a.x,scale);
  622. ret.y = MulSIMD(a.y,scale);
  623. ret.z = MulSIMD(a.z,scale);
  624. ret.w = MulSIMD(a.w,scale);
  625. return ret;
  626. }
  627. FORCEINLINE const FourQuaternions Add(const FourQuaternions &a,const FourQuaternions &b)
  628. {
  629. FourQuaternions ret;
  630. ret.x = AddSIMD(a.x,b.x);
  631. ret.y = AddSIMD(a.y,b.y);
  632. ret.z = AddSIMD(a.z,b.z);
  633. ret.w = AddSIMD(a.w,b.w);
  634. return ret;
  635. }
  636. FORCEINLINE const FourQuaternions Sub(const FourQuaternions &a,const FourQuaternions &b)
  637. {
  638. FourQuaternions ret;
  639. ret.x = SubSIMD(a.x,b.x);
  640. ret.y = SubSIMD(a.y,b.y);
  641. ret.z = SubSIMD(a.z,b.z);
  642. ret.w = SubSIMD(a.w,b.w);
  643. return ret;
  644. }
  645. FORCEINLINE const FourQuaternions Neg(const FourQuaternions &q)
  646. {
  647. FourQuaternions ret;
  648. ret.x = NegSIMD(q.x);
  649. ret.y = NegSIMD(q.y);
  650. ret.z = NegSIMD(q.z);
  651. ret.w = NegSIMD(q.w);
  652. return ret;
  653. }
  654. FORCEINLINE const FourQuaternions MaskedAssign(const bi32x4 &mask, const FourQuaternions &a, const FourQuaternions &b)
  655. {
  656. FourQuaternions ret;
  657. ret.x = MaskedAssign(mask,a.x,b.x);
  658. ret.y = MaskedAssign(mask,a.y,b.y);
  659. ret.z = MaskedAssign(mask,a.z,b.z);
  660. ret.w = MaskedAssign(mask,a.w,b.w);
  661. return ret;
  662. }
  663. #ifdef DIFFERENT_NATIVE_VECTOR_TYPES
// Convenience overload for platforms where mask vectors and float vectors are
// distinct native types: reinterpret the fltx4 mask as bi32x4 and defer.
FORCEINLINE const FourQuaternions MaskedAssign(const fltx4 &mask, const FourQuaternions &a, const FourQuaternions &b)
{
	return MaskedAssign( ( bi32x4 )mask, a, b );
}
  668. #endif
  669. FORCEINLINE FourQuaternions QuaternionAlign( const FourQuaternions &p, const FourQuaternions &q )
  670. {
  671. // decide if one of the quaternions is backwards
  672. bi32x4 cmp = CmpLtSIMD( Dot(p,q), Four_Zeros );
  673. return MaskedAssign( cmp, Neg(q), q );
  674. }
  675. FORCEINLINE const FourQuaternions QuaternionNormalize( const FourQuaternions &q )
  676. {
  677. fltx4 radius = Dot( q, q );
  678. bi32x4 mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
  679. fltx4 invRadius = ReciprocalSqrtSIMD( radius );
  680. FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius));
  681. return ret;
  682. }
  683. #if !defined(__SPU__)
// Gather the four scalar angles (radians) into one fltx4 and defer to the
// vector overload below.
FORCEINLINE FourQuaternions &FourQuaternions::FromAxisAndAngles( const fltx4 &axis,
	const float &angle0, const float &angle1, const float &angle2, const float &angle3 )
{
	return FromAxisAndAngles( axis, LoadGatherSIMD(angle0,angle1,angle2,angle3) );
}
// Build four quaternions representing rotations of the four given angles
// (radians, one per lane) about the single shared axis.
FORCEINLINE FourQuaternions &FourQuaternions::FromAxisAndAngles( const fltx4 &axis,
	const fltx4 &angles )
{
	// compute the half theta
	fltx4 theta = MulSIMD( angles, Four_PointFives );
	// compute the sine and cosine of each angle simultaneously
	fltx4 vsines; fltx4 vcoses;
	SinCosSIMD( vsines, vcoses, theta );
	// quaternion = ( sin(theta/2)*axis, cos(theta/2) ): splat each axis
	// component across its register and scale by the four sines; the w
	// register takes the four cosines directly
	x = MulSIMD( vsines, SplatXSIMD( axis ) ); // sin(t0) * x, sin(t1) * x, etc
	y = MulSIMD( vsines, SplatYSIMD( axis ) );
	z = MulSIMD( vsines, SplatZSIMD( axis ) );
	w = vcoses;
	return *this;
}
  706. #endif
  707. /// this = this * q;
/// this = this * q;
// Multiplies all four quaternion pairs vertically. While accumulating the
// product it also accumulates the 4D dot product this.q per lane, and flips
// the sign of any result lane where the dot was negative (i.e. where q was
// "backwards" relative to this) -- the vertical equivalent of QuaternionAlign.
FORCEINLINE FourQuaternions FourQuaternions::Mul( FourQuaternions const &q ) const
{
	// W = w1w2 - x1x2 - y1y2 - z1z2
	FourQuaternions ret;
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask );
	// as we do the multiplication, also do a dot product, so we know whether
	// one of the quats is backwards and if we therefore have to negate at the end
	fltx4 dotProduct = MulSIMD( w, q.w );

	ret.w = MulSIMD( w, q.w ); // W = w1w2
	ret.x = MulSIMD( w, q.x ); // X = w1x2
	ret.y = MulSIMD( w, q.y ); // Y = w1y2
	ret.z = MulSIMD( w, q.z ); // Z = w1z2

	dotProduct = MaddSIMD( x, q.x, dotProduct );
	ret.w = MsubSIMD( x, q.x, ret.w ); // W = w1w2 - x1x2
	ret.x = MaddSIMD( x, q.w, ret.x ); // X = w1x2 + x1w2
	ret.y = MsubSIMD( x, q.z, ret.y ); // Y = w1y2 - x1z2
	ret.z = MaddSIMD( x, q.y, ret.z ); // Z = w1z2 + x1y2

	dotProduct = MaddSIMD( y, q.y, dotProduct );
	ret.w = MsubSIMD( y, q.y, ret.w ); // W = w1w2 - x1x2 - y1y2
	ret.x = MaddSIMD( y, q.z, ret.x ); // X = w1x2 + x1w2 + y1z2
	ret.y = MaddSIMD( y, q.w, ret.y ); // Y = w1y2 - x1z2 + y1w2
	ret.z = MsubSIMD( y, q.x, ret.z ); // Z = w1z2 + x1y2 - y1x2

	dotProduct = MaddSIMD( z, q.z, dotProduct );
	ret.w = MsubSIMD( z, q.z, ret.w ); // W = w1w2 - x1x2 - y1y2 - z1z2
	ret.x = MsubSIMD( z, q.y, ret.x ); // X = w1x2 + x1w2 + y1z2 - z1y2
	ret.y = MaddSIMD( z, q.x, ret.y ); // Y = w1y2 - x1z2 + y1w2 + z1x2
	ret.z = MaddSIMD( z, q.w, ret.z ); // Z = w1z2 + x1y2 - y1x2 + z1w2

	// build a sign-bit mask that is nonzero only in lanes where the dot was
	// negative, then xor it in to negate those result lanes
	fltx4 Zero = Four_Zeros;
	bi32x4 control = CmpLtSIMD( dotProduct, Four_Zeros );
	signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0
	ret.w = XorSIMD( signMask, ret.w );
	ret.x = XorSIMD( signMask, ret.x );
	ret.y = XorSIMD( signMask, ret.y );
	ret.z = XorSIMD( signMask, ret.z );

	return ret;
}
// Rotate (in place) the four vectors in *vecs by the corresponding
// quaternions, using an expanded multiply/add form of the quaternion
// rotation (see the declaration comment in the class; tmpW is the dot of
// the quaternion's vector part with the input vector).
FORCEINLINE void FourQuaternions::RotateFourVectors( FourVectors * RESTRICT vecs ) const RESTRICT
{
	fltx4 tmpX, tmpY, tmpZ, tmpW;
	fltx4 outX, outY, outZ;

	// intermediate product: combines w-weighted input with cross terms
	tmpX = SubSIMD( MaddSIMD( w, vecs->x , MulSIMD( y, vecs->z ) ),
		MulSIMD( z, vecs->y ) );
	tmpY = SubSIMD( MaddSIMD( w, vecs->y, MulSIMD( z, vecs->x ) ),
		MulSIMD( x, vecs->z ) );
	tmpZ = SubSIMD( MaddSIMD( w, vecs->z, MulSIMD( x, vecs->y ) ),
		MulSIMD( y, vecs->x ) );
	// tmpW = (x,y,z) . v
	tmpW = AddSIMD( MaddSIMD( x, vecs->x, MulSIMD( y, vecs->y ) ),
		MulSIMD( z, vecs->z ) );

	// combine the intermediates back into the rotated vector
	outX = AddSIMD( SubSIMD( MaddSIMD( tmpW, x, MulSIMD( tmpX, w ) ),
		MulSIMD( tmpY, z ) ),
		MulSIMD( tmpZ, y ) );
	outY = AddSIMD( SubSIMD( MaddSIMD( tmpW, y, MulSIMD( tmpY, w ) ),
		MulSIMD( tmpZ, x ) ),
		MulSIMD( tmpX, z ) );
	outZ = AddSIMD( SubSIMD( MaddSIMD( tmpW, z, MulSIMD( tmpZ, w ) ),
		MulSIMD( tmpX, y ) ),
		MulSIMD( tmpY, x ) );

	// although apparently redundant, assigning the results to intermediate local variables
	// seems to improve code scheduling slightly in SN.
	vecs->x = outX;
	vecs->y = outY;
	vecs->z = outZ;
}
  771. /*
  772. void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
  773. {
  774. Assert( s_bMathlibInitialized );
  775. float r;
  776. // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
  777. // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
  778. float sinom = sqrt( DotProduct( &p.x, &p.x ) );
  779. sinom = min( sinom, 1.f );
  780. float sinsom = sin( asin( sinom ) * t );
  781. t = sinsom / (sinom + FLT_EPSILON);
  782. VectorScale( &p.x, t, &q.x );
  783. // rescale rotation
  784. r = 1.0f - sinsom * sinsom;
  785. // Assert( r >= 0 );
  786. if (r < 0.0f)
  787. r = 0.0f;
  788. r = sqrt( r );
  789. // keep sign of rotation
  790. if (p.w < 0)
  791. q.w = -r;
  792. else
  793. q.w = r;
  794. Assert( q.IsValid() );
  795. return;
  796. }
  797. */
//-----------------------------------------------------------------------------
// Scale the rotation angle of each of the four quaternions by the matching
// lane of "scale", returning the result. The half-angle is recovered from the
// w component via acos, multiplied by the scale, and the quaternion rebuilt
// from the sine/cosine of the new angle with a renormalized axis.
// Lanes where |w| is within ~1e-6 of 1 (i.e. near-null rotations, which are
// numerically unstable here) are passed through unchanged.
// NOTE: statement order is deliberately interleaved to hide SIMD latencies;
// do not reorder casually.
//-----------------------------------------------------------------------------
FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle( const fltx4 &scale ) const
{
	FourQuaternions ret;
	static const fltx4 OneMinusEpsilon = {1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f };
	const fltx4 Zero = Four_Zeros;
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); // sign bit set in every lane; xor to negate
	// work out if there are any tiny scales or angles, which are unstable
	bi32x4 tinyAngles = CmpGtSIMD(w,OneMinusEpsilon);
	bi32x4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down
	// figure out the theta (half-angle) from the w component
	fltx4 angles = ArcCosSIMD( w );
	// test also if w > -1: |w| near 1 from either side is a degenerate lane
	fltx4 negativeWs = XorSIMD(signMask, w);
	tinyAngles = OrSIMD( CmpGtSIMD(negativeWs, OneMinusEpsilon ), tinyAngles );
	// meanwhile start working on computing the dot product of the
	// vector component, and trust in the scheduler to interleave them
	fltx4 vLenSq = MulSIMD( x, x );
	vLenSq = MaddSIMD( y, y, vLenSq );
	vLenSq = MaddSIMD( z, z, vLenSq );
	// scale the angles
	angles = MulSIMD( angles, scale );
	// clear out the sign mask where w>=0 (only negative-w lanes get negated below)
	signMask = MaskedAssign( negativeRotations, signMask, Zero);
	// work out the new w component and vector length
	fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies
	fltx4 sine;
	SinCosSIMD( sine, ret.w, angles ); // sine = sin(scaled angle), ret.w = cos(scaled angle)
	ret.x = MulSIMD( x, vLenRecip ); // renormalize so the vector length + w = 1
	ret.y = MulSIMD( y, vLenRecip ); // renormalize so the vector length + w = 1
	ret.z = MulSIMD( z, vLenRecip ); // renormalize so the vector length + w = 1
	ret.x = MulSIMD( ret.x, sine ); // scale the unit axis by the new sine
	ret.y = MulSIMD( ret.y, sine );
	ret.z = MulSIMD( ret.z, sine );
	// negate where necessary (lanes whose original w was negative)
	ret.x = XorSIMD(ret.x, signMask);
	ret.y = XorSIMD(ret.y, signMask);
	ret.z = XorSIMD(ret.z, signMask);
	ret.w = XorSIMD(ret.w, signMask);
	// finally, toss results from where cos(theta) is close to 1 -- these are non rotations.
	ret.x = MaskedAssign(tinyAngles, x, ret.x);
	ret.y = MaskedAssign(tinyAngles, y, ret.y);
	ret.z = MaskedAssign(tinyAngles, z, ret.z);
	ret.w = MaskedAssign(tinyAngles, w, ret.w);
	return ret;
}
  843. //-----------------------------------------------------------------------------
  844. // Purpose: return = this * ( s * q )
  845. // In other words, for a quaternion representing a rotation of angle theta, return
  846. // one of angle s*theta
  847. // s is four floats in a fltx4 -- one for each quaternion
  848. //-----------------------------------------------------------------------------
  849. FORCEINLINE FourQuaternions FourQuaternions::MulAc( const fltx4 &s, const FourQuaternions &q ) const
  850. {
  851. /*
  852. void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt )
  853. {
  854. Quaternion p1, q1;
  855. QuaternionScale( q, s, q1 );
  856. QuaternionMult( p, q1, p1 );
  857. QuaternionNormalize( p1 );
  858. qt[0] = p1[0];
  859. qt[1] = p1[1];
  860. qt[2] = p1[2];
  861. qt[3] = p1[3];
  862. }
  863. */
  864. return Mul(q.ScaleAngle(s));
  865. }
  866. FORCEINLINE FourQuaternions FourQuaternions::ScaleMul( const fltx4 &s, const FourQuaternions &q ) const
  867. {
  868. return ScaleAngle(s).Mul(q);
  869. }
//-----------------------------------------------------------------------------
// Spherical linear interpolation from this to "originalto" by parameter t,
// four quaternions at a time, with hemisphere alignment: lanes whose dot
// product with the target is negative have the target's weight negated so
// interpolation takes the short way around. Lanes where the quaternions are
// (nearly) identical fall back to plain lerp weights to avoid the sin(0)
// singularity.
//-----------------------------------------------------------------------------
FORCEINLINE FourQuaternions FourQuaternions::Slerp( const FourQuaternions &originalto, const fltx4 &t )
{
	FourQuaternions ret;
	static const fltx4 OneMinusEpsilon = {1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f };
	// align if necessary.
	// actually, before we even do that, start by computing the dot product of
	// the quaternions. it has lots of dependent ops and we can sneak it into
	// the pipeline bubbles as we figure out alignment. Of course we don't know
	// yet if we need to realign, so compute them both -- there's plenty of
	// space in the bubbles. They're roomy, those bubbles.
	fltx4 cosineOmega;
#if 0 // Maybe I don't need to do alignment separately, using the xb360 technique...
	FourQuaternions to;
	{
		fltx4 diffs[4], sums[4], originalToNeg[4];
		fltx4 dotIfAligned, dotIfNotAligned;
		// compute negations of the TO quaternion.
		originalToNeg[0] = NegSIMD(originalto.x);
		originalToNeg[1] = NegSIMD(originalto.y);
		originalToNeg[2] = NegSIMD(originalto.z);
		originalToNeg[3] = NegSIMD(originalto.w);
		dotIfAligned = MulSIMD(x, originalto.x);
		dotIfNotAligned = MulSIMD(x, originalToNeg[0]);
		diffs[0] = SubSIMD(x, originalto.x);
		diffs[1] = SubSIMD(y, originalto.y);
		diffs[2] = SubSIMD(z, originalto.z);
		diffs[3] = SubSIMD(w, originalto.w);
		sums[0] = AddSIMD(x, originalto.x);
		sums[1] = AddSIMD(y, originalto.y);
		sums[2] = AddSIMD(z, originalto.z);
		sums[3] = AddSIMD(w, originalto.w);
		dotIfAligned = MaddSIMD(y, originalto.y, dotIfAligned);
		dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned);
		fltx4 diffsDot, sumsDot;
		diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2
		sumsDot = MulSIMD(sums[0], sums[0] ); // x^2
		// do some work on the dot products while letting the multiplies cook
		dotIfAligned = MaddSIMD(z, originalto.z, dotIfAligned);
		dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned);
		diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2
		sumsDot = MaddSIMD(sums[1], sums[1], sumsDot );
		diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2
		sumsDot = MaddSIMD(sums[2], sums[2], sumsDot );
		diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2
		sumsDot = MaddSIMD(sums[3], sums[3], sumsDot );
		// do some work on the dot products while letting the multiplies cook
		dotIfAligned = MaddSIMD(w, originalto.w, dotIfAligned);
		dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned);
		// are the differences greater than the sums?
		// if so, we need to negate that quaternion
		fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere
		to.x = MaskedAssign(mask, originalToNeg[0], originalto.x);
		to.y = MaskedAssign(mask, originalToNeg[1], originalto.y);
		to.z = MaskedAssign(mask, originalToNeg[2], originalto.z);
		to.w = MaskedAssign(mask, originalToNeg[3], originalto.w);
		cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned);
	}
	// right, now to is aligned to be the short way round, and we computed
	// the dot product while we were figuring all that out.
#else
	// live path: straightforward four-lane dot product of the two quaternions
	const FourQuaternions &to = originalto;
	cosineOmega = MulSIMD(x, to.x);
	cosineOmega = MaddSIMD(y, to.y, cosineOmega);
	cosineOmega = MaddSIMD(z, to.z, cosineOmega);
	cosineOmega = MaddSIMD(w, to.w, cosineOmega);
#endif
	fltx4 Zero = Four_Zeros;
	bi32x4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero);
	// fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones );
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); // contains a one in the sign bit -- xor against a number to negate it
	fltx4 sinOmega = Four_Ones;
	// negate cosineOmega where necessary so every lane works with the short arc
	cosineOmega = MaskedAssign( cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega );
	fltx4 oneMinusT = SubSIMD(Four_Ones,t);
	bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps
	// figure out the sin component of the diff quaternion.
	// since sin^2(t) + cos^2(t) = 1...
	sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)
	fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega ); // 1/sin(t)
	sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)
	// use the arctangent technique to work out omega from tan^-1(sin/cos)
	fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);
	// alpha = sin(omega * (1-T))/sin(omega)
	// beta = sin(omega * T)/sin(omega)
	fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T)
	fltx4 beta = MulSIMD(omega, t); // w(T)
	signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero); // keep the sign bit only in lanes that needed realignment
	alpha = SinSIMD(alpha); // sin(w(1-T))
	beta = SinSIMD(beta); // sin(wT)
	alpha = MulSIMD(alpha, invSinOmega);
	beta = MulSIMD(beta, invSinOmega);
	// depending on whether the dot product was less than zero, negate beta, or not
	beta = XorSIMD(beta, signMask);
	// mask out singularities (where omega = 1): those lanes use plain lerp weights
	alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );
	beta = MaskedAssign( bCosOmegaLessThanOne, beta , t );
	ret.x = MulSIMD(x, alpha);
	ret.y = MulSIMD(y, alpha);
	ret.z = MulSIMD(z, alpha);
	ret.w = MulSIMD(w, alpha);
	ret.x = MaddSIMD(to.x, beta, ret.x);
	ret.y = MaddSIMD(to.y, beta, ret.y);
	ret.z = MaddSIMD(to.z, beta, ret.z);
	ret.w = MaddSIMD(to.w, beta, ret.w);
	return ret;
}
  976. FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t )
  977. {
  978. FourQuaternions ret;
  979. static const fltx4 OneMinusEpsilon = {1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f, 1.0f - 0.000001f };
  980. // align if necessary.
  981. // actually, before we even do that, start by computing the dot product of
  982. // the quaternions. it has lots of dependent ops and we can sneak it into
  983. // the pipeline bubbles as we figure out alignment. Of course we don't know
  984. // yet if we need to realign, so compute them both -- there's plenty of
  985. // space in the bubbles. They're roomy, those bubbles.
  986. fltx4 cosineOmega;
  987. const FourQuaternions &to = originalto;
  988. cosineOmega = MulSIMD(x, to.x);
  989. cosineOmega = MaddSIMD(y, to.y, cosineOmega);
  990. cosineOmega = MaddSIMD(z, to.z, cosineOmega);
  991. cosineOmega = MaddSIMD(w, to.w, cosineOmega);
  992. fltx4 sinOmega = Four_Ones;
  993. fltx4 oneMinusT = SubSIMD(Four_Ones,t);
  994. bi32x4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps
  995. // figure out the sin component of the diff quaternion.
  996. // since sin^2(t) + cos^2(t) = 1...
  997. sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)
  998. fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega ); // 1/sin(t)
  999. sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)
  1000. // use the arctangent technique to work out omega from tan^-1(sin/cos)
  1001. fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);
  1002. // alpha = sin(omega * (1-T))/sin(omega)
  1003. // beta = sin(omega * T)/sin(omega)
  1004. fltx4 alpha = MulSIMD(omega, oneMinusT); // w(1-T)
  1005. fltx4 beta = MulSIMD(omega, t); // w(T)
  1006. alpha = SinSIMD(alpha); // sin(w(1-T))
  1007. beta = SinSIMD(beta); // sin(wT)
  1008. alpha = MulSIMD(alpha, invSinOmega);
  1009. beta = MulSIMD(beta, invSinOmega);
  1010. // mask out singularities (where omega = 1)
  1011. alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );
  1012. beta = MaskedAssign( bCosOmegaLessThanOne, beta , t );
  1013. ret.x = MulSIMD(x, alpha);
  1014. ret.y = MulSIMD(y, alpha);
  1015. ret.z = MulSIMD(z, alpha);
  1016. ret.w = MulSIMD(w, alpha);
  1017. ret.x = MaddSIMD(to.x, beta, ret.x);
  1018. ret.y = MaddSIMD(to.y, beta, ret.y);
  1019. ret.z = MaddSIMD(to.z, beta, ret.z);
  1020. ret.w = MaddSIMD(to.w, beta, ret.w);
  1021. return ret;
  1022. }
  1023. /***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function:
  1024. inline void FourVectors::RotateBy( const FourQuaternions &quats )
  1025. {
  1026. quats.RotateFourVectors( this );
  1027. }
  1028. */
  1029. #endif // SSEQUATMATH_H