Team Fortress 2 Source Code as on 22/4/2020
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

367 lines
11 KiB

  1. //========= Copyright Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose: - defines SIMD "structure of arrays" classes and functions.
  4. //
  5. //===========================================================================//
  6. #ifndef SSEQUATMATH_H
  7. #define SSEQUATMATH_H
  8. #ifdef _WIN32
  9. #pragma once
  10. #endif
  11. #include "mathlib/ssemath.h"
  12. // Use this #define to allow SSE versions of Quaternion math
  13. // to exist on PC.
  14. // On PC, certain horizontal vector operations are not supported.
  15. // This causes the SSE implementation of quaternion math to mix the
  16. // vector and scalar floating point units, which is extremely
  17. // performance negative if you don't compile to native SSE2 (which
  18. // we don't as of Sept 1, 2007). So, it's best not to allow these
  19. // functions to exist at all. It's not good enough to simply replace
  20. // the contents of the functions with scalar math, because each call
  21. // to LoadAligned and StoreAligned will result in an unnecessary copy
  22. // of the quaternion, and several moves to and from the XMM registers.
  23. //
  24. // Basically, the problem you run into is that for efficient SIMD code,
  25. // you need to load the quaternions and vectors into SIMD registers and
  26. // keep them there as long as possible while doing only SIMD math,
  27. // whereas for efficient scalar code, each time you copy onto or ever
  28. // use a fltx4, it hoses your pipeline. So the difference has to be
  29. // in the management of temporary variables in the calling function,
  30. // not inside the math functions.
  31. //
  32. // If you compile assuming the presence of SSE2, the MSVC will abandon
  33. // the traditional x87 FPU operations altogether and make everything use
  34. // the SSE2 registers, which lessens this problem a little.
  35. // SIMD quaternion math is permitted only on 360, as we've done careful tuning on its Altivec math:
  36. #ifdef _X360
  37. #define ALLOW_SIMD_QUATERNION_MATH 1 // not on PC!
  38. #endif
  39. //---------------------------------------------------------------------
  40. // Load/store quaternions
  41. //---------------------------------------------------------------------
  42. #ifndef _X360
  43. #if ALLOW_SIMD_QUATERNION_MATH
  44. // Using STDC or SSE
  45. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
  46. {
  47. fltx4 retval = LoadAlignedSIMD( pSIMD.Base() );
  48. return retval;
  49. }
  50. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
  51. {
  52. fltx4 retval = LoadAlignedSIMD( pSIMD );
  53. return retval;
  54. }
  55. FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
  56. {
  57. StoreAlignedSIMD( pSIMD->Base(), a );
  58. }
  59. #endif
  60. #else
  61. // for the transitional class -- load a QuaternionAligned
  62. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
  63. {
  64. fltx4 retval = XMLoadVector4A( pSIMD.Base() );
  65. return retval;
  66. }
  67. FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
  68. {
  69. fltx4 retval = XMLoadVector4A( pSIMD );
  70. return retval;
  71. }
  72. FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
  73. {
  74. XMStoreVector4A( pSIMD->Base(), a );
  75. }
  76. #endif
  77. #if ALLOW_SIMD_QUATERNION_MATH
  78. //---------------------------------------------------------------------
  79. // Make sure quaternions are within 180 degrees of one another, if not, reverse q
  80. //---------------------------------------------------------------------
  81. FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
  82. {
  83. // decide if one of the quaternions is backwards
  84. fltx4 a = SubSIMD( p, q );
  85. fltx4 b = AddSIMD( p, q );
  86. a = Dot4SIMD( a, a );
  87. b = Dot4SIMD( b, b );
  88. fltx4 cmp = CmpGtSIMD( a, b );
  89. fltx4 result = MaskedAssign( cmp, NegSIMD(q), q );
  90. return result;
  91. }
  92. //---------------------------------------------------------------------
  93. // Normalize Quaternion
  94. //---------------------------------------------------------------------
  95. #if USE_STDC_FOR_SIMD
  96. FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
  97. {
  98. fltx4 radius, result;
  99. radius = Dot4SIMD( q, q );
  100. if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
  101. {
  102. float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) );
  103. result = ReplicateX4( iradius );
  104. result = MulSIMD( result, q );
  105. return result;
  106. }
  107. return q;
  108. }
  109. #else
  110. // SSE + X360 implementation
  111. FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
  112. {
  113. fltx4 radius, result, mask;
  114. radius = Dot4SIMD( q, q );
  115. mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
  116. result = ReciprocalSqrtSIMD( radius );
  117. result = MulSIMD( result, q );
  118. return MaskedAssign( mask, q, result ); // if radius was 0, just return q
  119. }
  120. #endif
  121. //---------------------------------------------------------------------
  122. // 0.0 returns p, 1.0 returns q.
  123. //---------------------------------------------------------------------
  124. FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
  125. {
  126. fltx4 sclp, sclq, result;
  127. sclq = ReplicateX4( t );
  128. sclp = SubSIMD( Four_Ones, sclq );
  129. result = MulSIMD( sclp, p );
  130. result = MaddSIMD( sclq, q, result );
  131. return QuaternionNormalizeSIMD( result );
  132. }
  133. //---------------------------------------------------------------------
  134. // Blend Quaternions
  135. //---------------------------------------------------------------------
  136. FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
  137. {
  138. // decide if one of the quaternions is backwards
  139. fltx4 q2, result;
  140. q2 = QuaternionAlignSIMD( p, q );
  141. result = QuaternionBlendNoAlignSIMD( p, q2, t );
  142. return result;
  143. }
  144. //---------------------------------------------------------------------
  145. // Multiply Quaternions
  146. //---------------------------------------------------------------------
  147. #ifndef _X360
  148. // SSE and STDC
  149. FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
  150. {
  151. // decide if one of the quaternions is backwards
  152. fltx4 q2, result;
  153. q2 = QuaternionAlignSIMD( p, q );
  154. SubFloat( result, 0 ) = SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 );
  155. SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 );
  156. SubFloat( result, 2 ) = SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 );
  157. SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 );
  158. return result;
  159. }
  160. #else
  161. // X360
  162. extern const fltx4 g_QuatMultRowSign[4];
  163. FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
  164. {
  165. fltx4 q2, row, result;
  166. q2 = QuaternionAlignSIMD( p, q );
  167. row = XMVectorSwizzle( q2, 3, 2, 1, 0 );
  168. row = MulSIMD( row, g_QuatMultRowSign[0] );
  169. result = Dot4SIMD( row, p );
  170. row = XMVectorSwizzle( q2, 2, 3, 0, 1 );
  171. row = MulSIMD( row, g_QuatMultRowSign[1] );
  172. row = Dot4SIMD( row, p );
  173. result = __vrlimi( result, row, 4, 0 );
  174. row = XMVectorSwizzle( q2, 1, 0, 3, 2 );
  175. row = MulSIMD( row, g_QuatMultRowSign[2] );
  176. row = Dot4SIMD( row, p );
  177. result = __vrlimi( result, row, 2, 0 );
  178. row = MulSIMD( q2, g_QuatMultRowSign[3] );
  179. row = Dot4SIMD( row, p );
  180. result = __vrlimi( result, row, 1, 0 );
  181. return result;
  182. }
  183. #endif
  184. //---------------------------------------------------------------------
  185. // Quaternion scale
  186. //---------------------------------------------------------------------
  187. #ifndef _X360
  188. // SSE and STDC
  189. FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
  190. {
  191. float r;
  192. fltx4 q;
  193. // FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to
  194. // use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
  195. float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );
  196. sinom = min( sinom, 1.f );
  197. float sinsom = sin( asin( sinom ) * t );
  198. t = sinsom / (sinom + FLT_EPSILON);
  199. SubFloat( q, 0 ) = t * SubFloat( p, 0 );
  200. SubFloat( q, 1 ) = t * SubFloat( p, 1 );
  201. SubFloat( q, 2 ) = t * SubFloat( p, 2 );
  202. // rescale rotation
  203. r = 1.0f - sinsom * sinsom;
  204. // Assert( r >= 0 );
  205. if (r < 0.0f)
  206. r = 0.0f;
  207. r = sqrt( r );
  208. // keep sign of rotation
  209. SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
  210. return q;
  211. }
  212. #else
  213. // X360
  214. FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
  215. {
  216. fltx4 sinom = Dot3SIMD( p, p );
  217. sinom = SqrtSIMD( sinom );
  218. sinom = MinSIMD( sinom, Four_Ones );
  219. fltx4 sinsom = ArcSinSIMD( sinom );
  220. fltx4 t4 = ReplicateX4( t );
  221. sinsom = MulSIMD( sinsom, t4 );
  222. sinsom = SinSIMD( sinsom );
  223. sinom = AddSIMD( sinom, Four_Epsilons );
  224. sinom = ReciprocalSIMD( sinom );
  225. t4 = MulSIMD( sinsom, sinom );
  226. fltx4 result = MulSIMD( p, t4 );
  227. // rescale rotation
  228. sinsom = MulSIMD( sinsom, sinsom );
  229. fltx4 r = SubSIMD( Four_Ones, sinsom );
  230. r = MaxSIMD( r, Four_Zeros );
  231. r = SqrtSIMD( r );
  232. // keep sign of rotation
  233. fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
  234. r = MaskedAssign( cmp, r, NegSIMD( r ) );
  235. result = __vrlimi(result, r, 1, 0);
  236. return result;
  237. }
  238. #endif
  239. //-----------------------------------------------------------------------------
  240. // Quaternion spherical linear interpolation
  241. //-----------------------------------------------------------------------------
  242. #ifndef _X360
  243. // SSE and STDC
  244. FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
  245. {
  246. float omega, cosom, sinom, sclp, sclq;
  247. fltx4 result;
  248. // 0.0 returns p, 1.0 return q.
  249. cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) +
  250. SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );
  251. if ( (1.0f + cosom ) > 0.000001f )
  252. {
  253. if ( (1.0f - cosom ) > 0.000001f )
  254. {
  255. omega = acos( cosom );
  256. sinom = sin( omega );
  257. sclp = sin( (1.0f - t)*omega) / sinom;
  258. sclq = sin( t*omega ) / sinom;
  259. }
  260. else
  261. {
  262. // TODO: add short circuit for cosom == 1.0f?
  263. sclp = 1.0f - t;
  264. sclq = t;
  265. }
  266. SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 );
  267. SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 );
  268. SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 );
  269. SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 );
  270. }
  271. else
  272. {
  273. SubFloat( result, 0 ) = -SubFloat( q, 1 );
  274. SubFloat( result, 1 ) = SubFloat( q, 0 );
  275. SubFloat( result, 2 ) = -SubFloat( q, 3 );
  276. SubFloat( result, 3 ) = SubFloat( q, 2 );
  277. sclp = sin( (1.0f - t) * (0.5f * M_PI));
  278. sclq = sin( t * (0.5f * M_PI));
  279. SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 );
  280. SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 );
  281. SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 );
  282. }
  283. return result;
  284. }
  285. #else
  286. // X360
  287. FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
  288. {
  289. return XMQuaternionSlerp( p, q, t );
  290. }
  291. #endif
  292. FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
  293. {
  294. fltx4 q2, result;
  295. q2 = QuaternionAlignSIMD( p, q );
  296. result = QuaternionSlerpNoAlignSIMD( p, q2, t );
  297. return result;
  298. }
  299. #endif // ALLOW_SIMD_QUATERNION_MATH
  300. #endif // SSEQUATMATH_H