Counter-Strike: Global Offensive Source Code


//===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: Implementation of our SIMD functions for the 360.
//==============================================================//

#ifndef DBG_H
#include "tier0/dbg.h"
#endif

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_f32[idx];
}

FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_u32[idx];
}
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
    return __vaddfp( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
{
    return __vsubfp( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
{
    return __vmulfp( a, b );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
{
    return __vmaddfp( a, b, c );
}
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
{
    return __vnmsubfp( a, b, c );
}
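// Usage sketch (illustrative, not part of the original header): the fused
// multiply-add maps naturally onto a per-lane lerp, assuming the fltx4
// helpers declared above:
//
//   // lerp(x, y, t) = x + t*(y - x), evaluated for all four lanes at once
//   fltx4 vLerped = MaddSIMD( t, SubSIMD( y, x ), x );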
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
    return __vmsum3fp( a, b );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
    return __vmsum4fp( a, b );
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
    return XMVectorSin( radians );
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
    cosine = XMVectorCos( radians );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    return XMVectorASin( sine );
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    return XMVectorACos( cs );
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    return XMVectorATan2( a, b );
}
// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
{
    return __vmaxfp( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
{
    return __vminfp( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
{
    return __vand( a, b );
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return __vandc( b, a );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
{
    return __vxor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
{
    return __vor( a, b );
}

FORCEINLINE fltx4 NegSIMD( const fltx4 &a ) // negate: -a
{
    return XMVectorNegate( a );
}

FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
{
    unsigned int equalFlags = 0;
    __vcmpeqfpR( a, Four_Zeros, &equalFlags );
    return XMComparisonAllTrue( equalFlags );
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
{
    unsigned int conditionregister;
    XMVectorEqualR( &conditionregister, a, XMVectorZero() );
    return XMComparisonAnyTrue( conditionregister );
}

FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
{
    // copy a's x component into w, in case w was zero.
    fltx4 temp = __vrlimi( a, a, 1, 1 );
    unsigned int conditionregister;
    XMVectorEqualR( &conditionregister, temp, XMVectorZero() );
    return XMComparisonAnyTrue( conditionregister );
}
/// for branching when all of a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

/// for branching when all of a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterOrEqualR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

/// for branching when any of a.xyzw > b.xyzw
FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterR( &cr, a, b );
    return XMComparisonAnyTrue( cr );
}

/// for branching when any of a.xyzw >= b.xyzw
FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterOrEqualR( &cr, a, b );
    return XMComparisonAnyTrue( cr );
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
    unsigned int cr;
    XMVectorEqualR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
    int nRet = 0;
    const fltx4_union & a_union = (const fltx4_union &)a;
    nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3
    return nRet;
}
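// Usage sketch (illustrative, not part of the original header): the returned
// mask packs one sign bit per lane, so individual lanes can be tested with
// plain integer math. Note this is a raw sign-bit test, so -0.0f also counts.
//
//   int nSigns = TestSignSIMD( v );
//   bool bXNegative = ( nSigns & 1 ) != 0;   // bit 0 = x
//   bool bWNegative = ( nSigns & 8 ) != 0;   // bit 3 = w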
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    return __vrlimi( a, __vzero(), 1, 0 );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    // NOTE: this tests the top bits of each vector element using integer math
    // (so it ignores NaNs - it will return true for "-NaN")
    unsigned int equalFlags = 0;
    fltx4 signMask = __vspltisw( -1 );        // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
    signMask = __vslw( signMask, signMask );  // 0x80000000 0x80000000 0x80000000 0x80000000
    __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
    return !XMComparisonAllTrue( equalFlags );
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
    return __vcmpeqfp( a, b );
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
    return __vcmpgtfp( a, b );
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
    return __vcmpgefp( a, b );
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
    return __vcmpgtfp( b, a );
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
    return __vcmpgefp( b, a );
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
    return XMVectorInBounds( a, b );
}

FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? 1.0:0
{
    return AndSIMD( Four_Ones, __vcmpeqfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? 1.0:0
{
    return AndSIMD( Four_Ones, __vcmpgtfp( a, b ) );
}

FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? 1.0:0
{
    return AndSIMD( Four_Ones, __vcmpgefp( a, b ) );
}

FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? 1.0:0
{
    return AndSIMD( Four_Ones, __vcmpgtfp( b, a ) );
}

FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? 1.0:0
{
    return AndSIMD( Four_Ones, __vcmpgefp( b, a ) );
}

FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? 1.0 : 0
{
    return AndSIMD( Four_Ones, XMVectorInBounds( a, b ) );
}
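// Usage sketch (illustrative, not part of the original header): because the
// Cmp01 variants return 1.0f or 0.0f per lane, they can be accumulated as
// per-lane counters without branching, e.g. counting how often four tracked
// distances fall inside a radius:
//
//   vHitCounts = AddSIMD( vHitCounts, Cmp01LtSIMD( vDistances, vRadius ) );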
// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return __vsel( OldValue, NewValue, ReplacementMask );
}
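// Usage sketch (illustrative, not part of the original header): combined with
// the Cmp* functions above, MaskedAssign gives a branchless per-lane select,
// e.g. clamping negative lanes to zero:
//
//   fltx4 negMask = CmpLtSIMD( v, Four_Zeros );              // ~0 in lanes where v < 0
//   fltx4 clamped = MaskedAssign( negMask, Four_Zeros, v );  // 0 where the mask is set, v elsewhere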
// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
{
    // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
    float * pValue = &flValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0 );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a
{
    Assert( pValue );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
    // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
    int * pValue = &nValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0 );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
    return __vrfip( a );
}

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
{
    return __vrfin( a );
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
    return __vrfim( a );
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
{
    // This is emulated from rsqrt
    return XMVectorSqrtEst( a );
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
{
    // This is emulated from rsqrt
    return XMVectorSqrt( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
{
    return __vrsqrtefp( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    return ReciprocalSqrtEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocalSqrt( a );
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
{
    return __vrefp( a );
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocal( a );
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
{
    return MulSIMD( ReciprocalSIMD( b ), a );
}
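// Usage sketch (illustrative, not part of the original header): a typical use
// of Dot3SIMD with the reciprocal square root is normalizing a 3-vector held
// in a fltx4 (the w lane is ignored by the dot product):
//
//   fltx4 vLenSq  = Dot3SIMD( v, v );               // |v|^2 in every lane
//   fltx4 vInvLen = ReciprocalSqrtSIMD( vLenSq );   // 1/|v|, Newton-Raphson refined
//   fltx4 vUnit   = MulSIMD( v, vInvLen );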
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
{
    return XMVectorExp( toPower );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max )
{
    return XMVectorClamp( in, min, max );
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
    return XMLoadVector4( pSIMD );
}

// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    return XMLoadVector3( pSIMD );
}

// load a single unaligned float into the x component of a SIMD word
FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
{
    return __lvlx( pFlt, 0 );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
{
    return XMLoadVector4A( pSIMD );
}

FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
{
    return XMLoadVector4( pSIMD );
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
    XMStoreVector4( pSIMD, a );
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    XMStoreVector3( pSIMD, a );
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures can do fixed-point conversion when the fix depth
// is specified as an immediate, but there is no way to guarantee an immediate
// as a parameter to a function like this.
FORCEINLINE void ConvertStoreAsIntsSIMD( intx4 * RESTRICT pDest, const fltx4 &vSrc )
{
    fltx4 asInt = __vctsxs( vSrc, 0 );
    XMStoreVector4A( pDest->Base(), asInt );
}
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
    XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
    xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
    x = xyzwMatrix.r[0];
    y = xyzwMatrix.r[1];
    z = xyzwMatrix.r[2];
    w = xyzwMatrix.r[3];
}
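// Usage sketch (illustrative, not part of the original header): TransposeSIMD
// is the usual way to turn four loaded xyzw points (array-of-structures) into
// per-axis lanes (structure-of-arrays). pPoints is a hypothetical array of
// four 4-float points:
//
//   fltx4 p0 = LoadUnalignedSIMD( pPoints + 0 );   // [x0 y0 z0 w0]
//   fltx4 p1 = LoadUnalignedSIMD( pPoints + 4 );
//   fltx4 p2 = LoadUnalignedSIMD( pPoints + 8 );
//   fltx4 p3 = LoadUnalignedSIMD( pPoints + 12 );
//   TransposeSIMD( p0, p1, p2, p3 );               // p0=[x0 x1 x2 x3], p1=all y, ...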
// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
    return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
    return XMVectorSplatOne();
}
FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
    return XMVectorSplatX( a );
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
    return XMVectorSplatY( a );
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
    return XMVectorSplatZ( a );
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
    return XMVectorSplatW( a );
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
    fltx4 result = __vrlimi( a, x, 8, 0 );
    return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
    fltx4 result = __vrlimi( a, y, 4, 0 );
    return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
    fltx4 result = __vrlimi( a, z, 2, 0 );
    return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
    fltx4 result = __vrlimi( a, w, 1, 0 );
    return result;
}

FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}

FORCEINLINE fltx4 RotateRight( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 );
}

FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
    // compareOne is [y,z,G,G]
    fltx4 retval = MinSIMD( a, compareOne );
    // retVal is [min(x,y), min(y,z), G, G]
    compareOne = __vrlimi( compareOne, a, 8, 2 );
    // compareOne is [z, G, G, G]
    retval = MinSIMD( retval, compareOne );
    // retVal = [ min(min(x,y),z), G, G, G ]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
    // compareOne is [y,z,G,G]
    fltx4 retval = MaxSIMD( a, compareOne );
    // retVal is [max(x,y), max(y,z), G, G]
    compareOne = __vrlimi( compareOne, a, 8, 2 );
    // compareOne is [z, G, G, G]
    retval = MaxSIMD( retval, compareOne );
    // retVal = [ max(max(x,y),z), G, G, G ]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}
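// Usage sketch (illustrative, not part of the original header): a typical use
// is finding the longest axis of a bounding box held as two fltx4 corners:
//
//   fltx4 vExtent  = SubSIMD( vMaxs, vMins );       // per-axis box size, w is garbage
//   fltx4 vLongest = FindHighestSIMD3( vExtent );   // largest of x,y,z splatted to all lanes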
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD )
{
    return XMLoadVector4A( pSIMD );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD )
{
    return XMLoadVector4( pSIMD );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    XMStoreVector4( pSIMD, a );
}

// Load four consecutive uint16's, and turn them into floating point numbers.
// This function isn't especially fast and could be made faster if anyone is
// using it heavily.
FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
{
    return XMLoadUShort4( reinterpret_cast<const XMUSHORT4 *>( pInts ) );
}

// a={ a.x, a.z, b.x, b.z }
// combine two fltx4s by throwing away every other field.
FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
{
    return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) );
}

// a={ a.x, b.x, c.x, d.x }
// combine 4 fltx4s by throwing away 3/4s of the fields
// TODO: make more efficient by doing this in a parallel way at the caller
// Compress4SIMD(FourVectors.. )
FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
{
    fltx4 abcd = __vrlimi( a, b, 4, 3 );  // a.x, b.x, a.z, a.w
    abcd = __vrlimi( abcd, c, 2, 2 );     // a.x, b.x, c.x, a.w
    abcd = __vrlimi( abcd, d, 1, 1 );     // a.x, b.x, c.x, d.x
    return abcd;
}
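// Usage sketch (illustrative, not part of the original header): Compress4SIMD
// gathers one lane from each of four vectors, e.g. pulling the x components
// of four results into a single fltx4 for a final horizontal step:
//
//   fltx4 vAllX = Compress4SIMD( v0, v1, v2, v3 );   // [v0.x, v1.x, v2.x, v3.x]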
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
    return __vcfux( vSrcA, 0 );
}

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
    return __vcfsx( vSrcA, 0 );
}

// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. Each uint
// will be divided by 2^immed after conversion
// (eg, this is fixed point math).
/* as if:
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
{
    return __vcfux( vSrcA, uImmed );
}
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))

// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
{
    return __vcfsx( vSrcA, uImmed );
}
*/
#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
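// Worked example (illustrative, not part of the original header): with
// uImmed = 16, each lane is divided by 2^16 after conversion, i.e. the input
// is treated as 16.16 fixed point. A lane holding 0x00018000 (98304) comes
// out as 98304 / 65536 = 1.5f. pFixedData is a hypothetical aligned source;
// the shift amount must be a literal constant:
//
//   i32x4 vFixed  = LoadAlignedIntSIMD( pFixedData );
//   fltx4 vFloats = SignedFixedIntConvertToFltSIMD( vFixed, 16 );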
// set all components of a vector to a signed immediate int number.
/* as if:
FORCEINLINE fltx4 IntSetImmediateSIMD( int toImmediate )
{
    return __vspltisw( toImmediate );
}
*/
#define IntSetImmediateSIMD(x) (__vspltisw(x))
/*
    works on fltx4's as if they are four uints.
    the first parameter contains the words to be shifted,
    the second contains the amount to shift by AS INTS

    for i = 0 to 3
        shift = vSrcB[(i*32) : (i*32)+4]
        vReturned[(i*32) : (i*32)+31] = vSrcA[(i*32) : (i*32)+31] << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD( fltx4 vSrcA, fltx4 vSrcB )
{
    return __vslw( vSrcA, vSrcB );
}
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
    // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
    const fltx4_union & a_union = (const fltx4_union &)a;
    return a_union.m128_f32[ idx ];
}

FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_f32[idx];
}

/// Set one component of a SIMD word with the given float value.
/// This function is a template because the native implementation of
/// this on PPC platforms requires that the component be given as a
/// compiler immediate -- not a function parameter, not a const function
/// parameter, not even a load from a const static array. It has to be
/// a real immediate.
/// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
/// \note This function is not particularly performant on any platform (because of
///       the load from float), so prefer a masked assign from a fltx4 wherever
///       possible.
template < unsigned int NCOMPONENT >
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
{
    // vrlimi can only take an immediate opcode -- that is a constant
    // passed in from the compiler, not a function parameter, nor an
    // element loaded from an array, not even a const static array.
#define SETCOMPONENTSIMD_MASK_IMMEDIATE ( NCOMPONENT == 0 ) ? 8 :\
    ( NCOMPONENT == 1 ) ? 4 :\
    ( NCOMPONENT == 2 ) ? 2 :\
    ( NCOMPONENT == 3 ) ? 1 :\
    17 //< a meaningless immediate intended to make the compiler angry

    fltx4 val = ReplicateX4( flValue );
    fltx4 result = __vrlimi( a, val, SETCOMPONENTSIMD_MASK_IMMEDIATE, 0 );
    return result;

#undef SETCOMPONENTSIMD_MASK_IMMEDIATE
}
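// Usage sketch (illustrative, not part of the original header): the component
// index must be a compile-time constant template argument, e.g. replacing
// only z while keeping x, y, and w:
//
//   fltx4 vWithNewZ = SetComponentSIMD<2>( v, 5.0f );
//
// As the note above says, prefer SetZSIMD / MaskedAssign with a fltx4 source
// when the value is already in a vector register.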
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
    fltx4 t = __vctuxs( a, 0 );
    const fltx4_union & a_union = (const fltx4_union &)t;
    return a_union.m128_u32[idx];
}

FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
    const fltx4_union & a_union = (const fltx4_union &)a;
    return a_union.m128_u32[idx];
}

FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_u32[idx];
}