Counter-Strike: Global Offensive Source Code

//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: Implementation of our SIMD functions for the x86 using SSE
//==============================================================//
#ifndef _MATH_PFNS_H_
#include "mathlib/math_pfns.h"
#endif
#if defined( PLATFORM_WINDOWS_PC )
#include <intrin.h>
#else
#include <xmmintrin.h>
#include <pmmintrin.h>
#endif
//---------------------------------------------------------------------
// Intel/SSE implementation
//---------------------------------------------------------------------
FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
    _mm_store_ps( pSIMD, a );
}
FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
    _mm_storeu_ps( pSIMD, a );
}
FORCEINLINE void StoreUnalignedSIMD( int * RESTRICT pSIMD, const i32x4 &a )
{
    _mm_storeu_si128( ( __m128i * ) pSIMD, a );
}
FORCEINLINE fltx4 RotateLeft( const fltx4 & a );
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    _mm_store_ss(pSIMD, a);
    _mm_store_ss(pSIMD+1, RotateLeft(a));
    _mm_store_ss(pSIMD+2, RotateLeft2(a));
}
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) );
}
FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
{
    return _mm_load_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) );
}
FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
{
    return _mm_loadu_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) );
}
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
{
    return _mm_and_ps( a, b );
}
FORCEINLINE i32x4 AndSIMD( const i32x4 &a, const i32x4 &b )
{
    return _mm_and_si128( a, b );
}
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
{
    return _mm_andnot_ps( a, b );
}
FORCEINLINE i32x4 AndNotSIMD( const i32x4 & a, const i32x4 & b ) // ~a & b
{
    return _mm_andnot_si128( a, b );
}
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
{
    return _mm_xor_ps( a, b );
}
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
{
    return _mm_or_ps( a, b );
}
FORCEINLINE i32x4 OrSIMD( const i32x4 &a, const i32x4 &b )
{
    return _mm_or_si128( a, b );
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) );
}
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
    return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
}
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
}
// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
    return _mm_load_ss(pFlt);
}
FORCEINLINE fltx4 CastToFltx4( i32x4 const & a )
{
    return _mm_castsi128_ps( a );
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE i32x4 ReplicateIX4( int i )
{
    return _mm_set1_epi32( i );
}
FORCEINLINE fltx4 ReplicateX4( float flValue )
{
    __m128 value = _mm_set_ss( flValue );
    return _mm_shuffle_ps( value, value, 0 );
}
// AltiVec compilers may have trouble inlining the pass-by-value variant of ReplicateX4, whereas
// they will have absolutely no problem inlining the pass-by-pointer variant. So it's better to use
// the pass-by-pointer variant unless you're mixing scalar and vector code (which is bad for perf on AltiVec anyway).
FORCEINLINE fltx4 ReplicateX4( const float *pValue )
{
    return ReplicateX4( *pValue );
}
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
    // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
#ifndef POSIX
    return a.m128_f32[ idx ];
#else
    return (reinterpret_cast<float const *>(&a))[idx];
#endif
}
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
#ifndef POSIX
    return a.m128_f32[ idx ];
#else
    return (reinterpret_cast<float *>(&a))[idx];
#endif
}
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
    return (uint32)SubFloat(a,idx);
}
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
#ifndef POSIX
    return a.m128_u32[idx];
#else
    return (reinterpret_cast<uint32 const *>(&a))[idx];
#endif
}
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
#ifndef POSIX
    return a.m128_u32[idx];
#else
    return (reinterpret_cast<uint32 *>(&a))[idx];
#endif
}
FORCEINLINE uint32 SubInt( i32x4 const & a, int idx )
{
#ifndef POSIX
    return a.m128i_u32[idx];
#else
    return (reinterpret_cast<const uint32 *>(&a))[idx];
#endif
}
FORCEINLINE uint32 & SubInt( i32x4 & a, int idx )
{
#ifndef POSIX
    return a.m128i_u32[idx];
#else
    return (reinterpret_cast<uint32 *>(&a))[idx];
#endif
}
// gather from array. Indices are in units of float size
FORCEINLINE fltx4 GatherFltX4SIMD( float const *pData, i32x4 n4Indices )
{
    fltx4 fl4Ret;
    SubFloat( fl4Ret, 0 ) = pData[SubInt(n4Indices,0)];
    SubFloat( fl4Ret, 1 ) = pData[SubInt(n4Indices,1)];
    SubFloat( fl4Ret, 2 ) = pData[SubInt(n4Indices,2)];
    SubFloat( fl4Ret, 3 ) = pData[SubInt(n4Indices,3)];
    return fl4Ret;
}
// gather from array. Indices are in units of float size
FORCEINLINE fltx4 GatherFltX4SIMD( fltx4 const *pData, i32x4 n4Indices )
{
    return GatherFltX4SIMD( ( float const * ) pData, n4Indices );
}
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
    return Four_Zeros;
}
// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
    return Four_Ones;
}
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return OrSIMD(
        AndSIMD( ReplacementMask, NewValue ),
        AndNotSIMD( ReplacementMask, OldValue ) );
}
// remember, SSE numbers its words 3 2 1 0
// The way we want to specify shuffles is backwards from the default
// MM_SHUFFLE_REV is in array index order (default is reversed)
#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
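// Worked example (illustrative sketch, not part of the original interface): because
// MM_SHUFFLE_REV takes its selectors in array-index order, passing 0,1,2,3 is the
// identity shuffle and 3,2,1,0 reverses the vector. The helper name is hypothetical.
FORCEINLINE fltx4 ReverseSIMD_Example( const fltx4 & a ) // a b c d -> d c b a
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 2, 1, 0 ) );
}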
FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
}
FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
}
FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
}
FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 3, 3, 3 ) );
}
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
    fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a );
    return result;
}
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
    fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a );
    return result;
}
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
    fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a );
    return result;
}
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
    fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a );
    return result;
}
/// Set one component of a SIMD word with the given float value.
/// This function is a template because the native implementation of
/// this on PPC platforms requires that the component be given as a
/// compiler immediate -- not a function parameter, not a const function
/// parameter, not even a load from a const static array. It has to be
/// a real immediate.
/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
/// \note This function is not particularly performant on any platform (because of
/// the load from float), so prefer a masked assign from a fltx4 wherever
/// possible.
template < unsigned int NCOMPONENT >
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
{
    fltx4 val = ReplicateX4( flValue );
    fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[NCOMPONENT] ), val, a );
    return result;
}
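// Usage sketch (illustrative; the helper name is hypothetical): NCOMPONENT must be a
// compile-time constant, so the component index goes in the template argument list,
// e.g. forcing the w slot to 1.0f before a 4-wide dot product.
FORCEINLINE fltx4 SetWToOneSIMD_Example( const fltx4 & a )
{
    return SetComponentSIMD<3>( a, 1.0f );
}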
// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
}
// a b c d -> d a b c
FORCEINLINE fltx4 RotateRight( const fltx4 & a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) );
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
{
    return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
}
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b
{
    return _mm_add_ps( a, b );
}
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
{
    return _mm_sub_ps( a, b );
}
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
{
    return _mm_mul_ps( a, b );
}
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
{
    return _mm_div_ps( a, b );
}
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
{
    return AddSIMD( MulSIMD(a,b), c );
}
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
{
    return SubSIMD( c, MulSIMD(a,b) );
}
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
    fltx4 m = MulSIMD( a, b );
    return AddSIMD( AddSIMD( SplatXSIMD(m), SplatYSIMD(m) ), SplatZSIMD(m) );
}
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
    // 4 instructions, serial; the order of addition varies, so individual elements may differ in the LSB on some CPUs
    fltx4 fl4Product = MulSIMD( a, b );
    fltx4 fl4YXWZ = _mm_shuffle_ps( fl4Product, fl4Product, MM_SHUFFLE_REV(1,0,3,2) );
    fltx4 fl4UUVV = AddSIMD( fl4Product, fl4YXWZ ); // U = X+Y; V = Z+W
    fltx4 fl4VVUU = RotateLeft2( fl4UUVV );
    return AddSIMD( fl4UUVV, fl4VVUU );
}
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    // FIXME: Make a fast SSE version
    SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
    SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
    SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
}
// TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    // FIXME: Make a fast SSE version
    fltx4 result;
    SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
    SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
    SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
    SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
    return result;
}
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    fltx4 result;
    SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
    SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
    SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
    SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
    return result;
}
// tan^-1(a/b), i.e. pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    fltx4 result;
    SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return result;
}
/// [ a.x+a.y a.z+a.w b.x+b.y b.z+b.w ] from sse3
FORCEINLINE fltx4 PairwiseHorizontalAddSIMD( const fltx4 &a, const fltx4 &b )
{
    return _mm_hadd_ps( a, b );
}
FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
    return SubSIMD(LoadZeroSIMD(),a);
}
FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    return _mm_movemask_ps( a );
}
FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD( a ));
}
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
    return _mm_cmpeq_ps( a, b );
}
FORCEINLINE fltx4 CmpEqSIMD( const i32x4 & a, const i32x4 & b ) // (a==b) ? ~0:0 for 32-bit ints; the mask is returned as an fltx4
{
    return _mm_castsi128_ps( _mm_cmpeq_epi32( a, b ) );
}
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
    return _mm_cmpgt_ps( a, b );
}
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
    return _mm_cmpge_ps( a, b );
}
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
    return _mm_cmplt_ps( a, b );
}
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
    return _mm_cmple_ps( a, b );
}
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
    return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) );
}
FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? 1.0:0
{
    return AndSIMD( Four_Ones, _mm_cmpeq_ps( a, b ) );
}
FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? 1.0:0
{
    return AndSIMD( Four_Ones, _mm_cmpgt_ps( a, b ) );
}
FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? 1.0:0
{
    return AndSIMD( Four_Ones, _mm_cmpge_ps( a, b ) );
}
FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? 1.0:0
{
    return AndSIMD( Four_Ones, _mm_cmplt_ps( a, b ) );
}
FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? 1.0:0
{
    return AndSIMD( Four_Ones, _mm_cmple_ps( a, b ) );
}
FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? 1.0 : 0
{
    return AndSIMD( Four_Ones, AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) ) );
}
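// Branch-free select sketch (illustrative; the helper name is hypothetical): the
// ~0/0 masks returned by the Cmp*SIMD functions above are intended to feed
// MaskedAssign, e.g. a per-component "clamp negatives to zero".
FORCEINLINE fltx4 ClampToZeroSIMD_Example( const fltx4 & a )
{
    return MaskedAssign( CmpLtSIMD( a, Four_Zeros ), Four_Zeros, a );
}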
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0;
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
    return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf;
}
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
{
    return _mm_min_ps( a, b );
}
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
{
    return _mm_max_ps( a, b );
}
// SSE lacks rounding operations.
// Really.
// You can emulate them by setting the rounding mode for the
// whole processor and then converting to int, and then back again.
// But every time you set the rounding mode, you clear out the
// entire pipeline. So, I can't do them per operation. You
// have to do it once, before the loop that would call these.
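// Caller-side sketch of what the comment above implies (illustrative; the helper
// name is hypothetical): switch MXCSR once outside the hot loop and restore it
// afterwards, rather than paying the pipeline flush per operation.
FORCEINLINE unsigned int SetRoundingModeOnce_Example( unsigned int nNewMode )
{
    unsigned int nOldMode = _MM_GET_ROUNDING_MODE(); // e.g. _MM_ROUND_NEAREST
    _MM_SET_ROUNDING_MODE( nNewMode );               // e.g. _MM_ROUND_DOWN before a floor-heavy loop
    return nOldMode;                                 // restore later with _MM_SET_ROUNDING_MODE( nOldMode )
}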
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
    return retVal;
}
fltx4 fabs( const fltx4 & x );
// Round towards negative infinity
// This is the implementation that was here before; it assumes
// you are in round-to-floor mode, which I guess is usually the
// case for us vis-a-vis SSE. It's totally unnecessary on
// VMX, which has a native floor op.
FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
{
    fltx4 fl4Abs = fabs( val );
    fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
    ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
    return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits
}
FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
{
    return TestSignSIMD( CmpEqSIMD( a, Four_Zeros ) ) != 0;
}
inline bool IsAllZeros( const fltx4 & var )
{
    return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF;
}
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
{
    return _mm_sqrt_ps( a );
}
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
{
    return _mm_sqrt_ps( a );
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
{
    return _mm_rsqrt_ps( a );
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
    fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    ret = ReciprocalSqrtEstSIMD( ret );
    return ret;
}
/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    fltx4 guess = ReciprocalSqrtEstSIMD( a );
    // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
    guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess ))));
    guess = MulSIMD( Four_PointFives, guess);
    return guess;
}
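// Usage sketch (illustrative; the helper name is hypothetical): Dot3SIMD replicates
// the 3-component dot product across all four lanes, so one ReciprocalSqrtSIMD and
// a multiply normalize a direction vector (the w lane ends up scaled along with xyz).
FORCEINLINE fltx4 Normalize3SIMD_Example( const fltx4 & vIn )
{
    return MulSIMD( vIn, ReciprocalSqrtSIMD( Dot3SIMD( vIn, vIn ) ) );
}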
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
{
    return _mm_rcp_ps( a );
}
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    fltx4 ret = ReciprocalEstSIMD( a );
    // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
    ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) );
    return ret;
}
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) );
    SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) );
    SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) );
    SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) );
    return retval;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
    return MaxSIMD( min, MinSIMD( max, in ) );
}
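// Usage sketch (illustrative; the helper name is hypothetical): a saturate, i.e.
// clamping every component to [0,1] using the globals already relied on above.
FORCEINLINE fltx4 SaturateSIMD_Example( FLTX4 in )
{
    return ClampVectorSIMD( in, Four_Zeros, Four_Ones );
}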
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
{
    _MM_TRANSPOSE4_PS( x, y, z, w );
}
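// Usage sketch (illustrative; the helper and parameter names are hypothetical):
// TransposeSIMD is the usual way to turn four packed xyzw rows (AoS) into
// per-component x/y/z/w lane vectors (SoA) before doing 4-wide math on them.
FORCEINLINE void LoadXYZWToSoA_Example( const float *pFourRows, fltx4 &x, fltx4 &y, fltx4 &z, fltx4 &w )
{
    x = LoadUnalignedSIMD( pFourRows + 0 );
    y = LoadUnalignedSIMD( pFourRows + 4 );
    z = LoadUnalignedSIMD( pFourRows + 8 );
    w = LoadUnalignedSIMD( pFourRows + 12 );
    TransposeSIMD( x, y, z, w ); // rows in, component lanes out
}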
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = RotateLeft( a );
    // compareOne is [y,z,G,x]
    fltx4 retval = MinSIMD( a, compareOne );
    // retVal is [min(x,y), ... ]
    compareOne = RotateLeft2( a );
    // compareOne is [z, G, x, y]
    retval = MinSIMD( retval, compareOne );
    // retVal = [ min(min(x,y),z)..]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = RotateLeft( a );
    // compareOne is [y,z,G,x]
    fltx4 retval = MaxSIMD( a, compareOne );
    // retVal is [max(x,y), ... ]
    compareOne = RotateLeft2( a );
    // compareOne is [z, G, x, y]
    retval = MaxSIMD( retval, compareOne );
    // retVal = [ max(max(x,y),z)..]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
#if 0 /* pc does not have these ops */
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
{
    // CHRISG: SSE2 has this, but not SSE1. What to do?
    fltx4 retval;
    SubInt( retval, 0 ) = to;
    SubInt( retval, 1 ) = to;
    SubInt( retval, 2 ) = to;
    SubInt( retval, 3 ) = to;
    return retval;
}
#endif
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
{
    return _mm_load_si128( reinterpret_cast<const __m128i *>(pSIMD) );
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
{
    return _mm_loadu_si128( reinterpret_cast<const __m128i *>(pSIMD) );
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
    _mm_store_ps( reinterpret_cast<float *>(pSIMD), a );
}
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
    _mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a );
}
FORCEINLINE void StoreAlignedSIMD( short * RESTRICT pSIMD, const shortx8 & a )
{
    _mm_store_si128( (shortx8 *)pSIMD, a );
}
FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
    _mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a );
}
FORCEINLINE void StoreUnalignedSIMD( short * RESTRICT pSIMD, const shortx8 & a )
{
    _mm_storeu_si128( (shortx8 *)pSIMD, a );
}
// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
{
    return _mm_shuffle_ps( a, b, MM_SHUFFLE_REV( 0, 2, 0, 2 ) );
}
// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
#ifdef POSIX
    fltx4 retval;
    SubFloat( retval, 0 ) = pInts[0];
    SubFloat( retval, 1 ) = pInts[1];
    SubFloat( retval, 2 ) = pInts[2];
    SubFloat( retval, 3 ) = pInts[3];
    return retval;
#else
    __m128i inA = _mm_loadl_epi64( (__m128i const*) pInts); // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result.
    inA = _mm_unpacklo_epi16( inA, _mm_setzero_si128() ); // unpack unsigned 16's to signed 32's
    return _mm_cvtepi32_ps(inA);
#endif
}
// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
{
    fltx4 aacc = _mm_shuffle_ps( a, c, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
    fltx4 bbdd = _mm_shuffle_ps( b, d, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
    return MaskedAssign( LoadAlignedSIMD( g_SIMD_EveryOtherMask ), bbdd, aacc );
}
// outa={ a.x, a.x, a.y, a.y }, outb={ a.z, a.z, a.w, a.w }
FORCEINLINE void ExpandSIMD( fltx4 const &a, fltx4 &fl4OutA, fltx4 &fl4OutB )
{
    fl4OutA = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) );
    fl4OutB = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) );
}
// CHRISG: the conversion functions all seem to operate on m64's only...
// how do we make them work here?
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
    SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
    SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
    SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
    return retval;
}
// Convert the 4 32-bit integers to single precision floats.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
    return _mm_cvtepi32_ps( (const __m128i &)vSrcA );
}
/*
  works on fltx4's as if they are four uints.
  the first parameter contains the words to be shifted,
  the second contains the amount to shift by AS INTS

  for i = 0 to 3
    shift = vSrcB_(i*32):(i*32)+4
    vReturned_(i*32):(i*32)+31 = vSrcA_(i*32):(i*32)+31 << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
{
    i32x4 retval;
    SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
    SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
    SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
    SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
    return retval;
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to a function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
#if defined( COMPILER_MSVC64 )
    (*pDest)[0] = SubFloat(vSrc, 0);
    (*pDest)[1] = SubFloat(vSrc, 1);
    (*pDest)[2] = SubFloat(vSrc, 2);
    (*pDest)[3] = SubFloat(vSrc, 3);
#else
    __m64 bottom = _mm_cvttps_pi32( vSrc );
    __m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) );
    *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
    *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;
    _mm_empty();
#endif
}
// some sse2 packed integer intrinsic wrappers
#if _MSC_VER >= 1600 || defined(LINUX) || defined(OSX)
/// replicate a 16-bit integer value to all 8 16-bit positions in an fltx4
FORCEINLINE fltx4 ReplicateWordX8( uint16 nWord )
{
    return _mm_castsi128_ps( _mm_set_epi16( nWord, nWord, nWord, nWord, nWord, nWord, nWord, nWord ) );
}
/// Return a 16-bit mask consisting of the upper bit of each of the bytes in the input
FORCEINLINE int TestSignsOfBytesSIMD( fltx4 const &packedBytes )
{
    return _mm_movemask_epi8( _mm_castps_si128( packedBytes ) );
}
/// compare each 16-bit field of a word for equality
FORCEINLINE fltx4 CmpEqWordsSIMD( fltx4 const &flIn, fltx4 const &flValue )
{
    return _mm_castsi128_ps( _mm_cmpeq_epi16( _mm_castps_si128( flIn ), _mm_castps_si128( flValue ) ) );
}
/// grab 16 16-bit signed words from two fltx4s, and pack them into one register holding 16 bytes converted from them
FORCEINLINE fltx4 PackSignedWordsToBytesWithSaturateSIMD( fltx4 const &packedWorlds0, fltx4 const &packedWorlds1 )
{
    return _mm_castsi128_ps( _mm_packs_epi16( _mm_castps_si128( packedWorlds0 ), _mm_castps_si128( packedWorlds1 ) ) );
}
FORCEINLINE fltx4 CrossProduct3SIMD( const fltx4 &v1, const fltx4 &v2 )
{
    fltx4 v1_yzxx = _mm_shuffle_ps( v1, v1, MM_SHUFFLE_REV( 1,2,0,0 ) );
    fltx4 v2_zxyy = _mm_shuffle_ps( v2, v2, MM_SHUFFLE_REV( 2,0,1,0 ) );
    fltx4 v1_zxyy = _mm_shuffle_ps( v1, v1, MM_SHUFFLE_REV( 2,0,1,0 ) );
    fltx4 v2_yzxx = _mm_shuffle_ps( v2, v2, MM_SHUFFLE_REV( 1,2,0,0 ) );
    return SubSIMD( MulSIMD( v1_yzxx, v2_zxyy ), MulSIMD( v1_zxyy, v2_yzxx ) );
}
#endif