Team Fortress 2 Source Code as of 22/4/2020 — ssemath.h

//========= Copyright Valve Corporation, All rights reserved. ============//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEMATH_H
#define SSEMATH_H

#if defined( _X360 )
#include <xboxmath.h>
#else
#include <xmmintrin.h>
#endif

#include <mathlib/vector.h>
#include <mathlib/mathlib.h>

#if defined(GNUC)
#define USE_STDC_FOR_SIMD 0
#else
#define USE_STDC_FOR_SIMD 0
#endif

#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0))
#define _SSE1 1
#endif
// I thought about defining a class/union for the SIMD packed floats instead of using fltx4,
// but decided against it because (a) the nature of SIMD code which includes comparisons is to blur
// the relationship between packed floats and packed integer types and (b) not sure that the
// compiler would handle generating good code for the intrinsics.

#if USE_STDC_FOR_SIMD
typedef union
{
    float  m128_f32[4];
    uint32 m128_u32[4];
} fltx4;

typedef fltx4 i32x4;
typedef fltx4 u32x4;

#elif ( defined( _X360 ) )
typedef union
{
    // This union allows float/int access (which generally shouldn't be done in inner loops)
    __vector4 vmx;
    float     m128_f32[4];
    uint32    m128_u32[4];
} fltx4_union;

typedef __vector4 fltx4;
typedef __vector4 i32x4; // a VMX register; just a way of making it explicit that we're doing integer ops.
typedef __vector4 u32x4; // a VMX register; just a way of making it explicit that we're doing unsigned integer ops.

#else
typedef __m128 fltx4;
typedef __m128 i32x4;
typedef __m128 u32x4;
#endif
// The FLTX4 type is a fltx4 used as a parameter to a function.
// On the 360, the best way to do this is pass-by-copy on the registers.
// On the PC, the best way is to pass by const reference.
// The compiler will sometimes, but not always, replace a pass-by-const-ref
// with a pass-in-reg on the 360; to avoid this confusion, you can
// explicitly use a FLTX4 as the parameter type.
#ifdef _X360
typedef __vector4 FLTX4;
#else
typedef const fltx4 & FLTX4;
#endif
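
// Illustrative sketch (an editorial addition, not part of the original header):
// a helper declared with FLTX4 parameters gets pass-in-register on the 360 and
// pass-by-const-reference on the PC, with no #ifdef at the call site:
//
//     FORCEINLINE fltx4 LerpSIMD( FLTX4 a, FLTX4 b, FLTX4 t )
//     {
//         return MaddSIMD( t, SubSIMD( b, a ), a );  // a + t*(b - a)
//     }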
// A 16-byte aligned int32 datastructure
// (for use when writing out fltx4's as SIGNED
// ints).
struct ALIGN16 intx4
{
    int32 m_i32[4];

    inline int & operator[]( int which )
    {
        return m_i32[which];
    }

    inline const int & operator[]( int which ) const
    {
        return m_i32[which];
    }

    inline int32 *Base()
    {
        return m_i32;
    }

    inline const int32 *Base() const
    {
        return m_i32;
    }

    inline bool operator==( const intx4 &other ) const
    {
        return m_i32[0] == other.m_i32[0] &&
               m_i32[1] == other.m_i32[1] &&
               m_i32[2] == other.m_i32[2] &&
               m_i32[3] == other.m_i32[3];
    }
} ALIGN16_POST;
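
// Illustrative sketch (editorial addition): intx4 provides a 16-byte-aligned
// destination for writing a fltx4 back out as signed ints, e.g. with
// ConvertStoreAsIntsSIMD() declared below:
//
//     intx4 out;
//     ConvertStoreAsIntsSIMD( &out, v );  // v is some fltx4
//     int x = out[0];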
#if defined( _DEBUG ) && defined( _X360 )
FORCEINLINE void TestVPUFlags()
{
    // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
    __vector4 a;
    __asm
    {
        mfvscr a;
    }
    unsigned int * flags = (unsigned int *)&a;
    unsigned int controlWord = flags[3];
    Assert( controlWord == 0 );
}
#else // _DEBUG
FORCEINLINE void TestVPUFlags() {}
#endif // _DEBUG
// useful constants in SIMD packed float format:
// (note: some of these aren't stored on the 360,
// but are manufactured directly in one or two
// instructions, saving a load and possible L2
// miss.)
#ifndef _X360
extern const fltx4 Four_Zeros;         // 0 0 0 0
extern const fltx4 Four_Ones;          // 1 1 1 1
extern const fltx4 Four_Twos;          // 2 2 2 2
extern const fltx4 Four_Threes;        // 3 3 3 3
extern const fltx4 Four_Fours;         // guess.
extern const fltx4 Four_Point225s;     // .225 .225 .225 .225
extern const fltx4 Four_PointFives;    // .5 .5 .5 .5
extern const fltx4 Four_Epsilons;      // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s;     // (1<<21)..
extern const fltx4 Four_2ToThe22s;     // (1<<22)..
extern const fltx4 Four_2ToThe23s;     // (1<<23)..
extern const fltx4 Four_2ToThe24s;     // (1<<24)..
extern const fltx4 Four_Origin;        // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_NegativeOnes;  // -1 -1 -1 -1
#else
#define Four_Zeros XMVectorZero()      // 0 0 0 0
#define Four_Ones XMVectorSplatOne()   // 1 1 1 1
extern const fltx4 Four_Twos;          // 2 2 2 2
extern const fltx4 Four_Threes;        // 3 3 3 3
extern const fltx4 Four_Fours;         // guess.
extern const fltx4 Four_Point225s;     // .225 .225 .225 .225
extern const fltx4 Four_PointFives;    // .5 .5 .5 .5
extern const fltx4 Four_Epsilons;      // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
extern const fltx4 Four_2ToThe21s;     // (1<<21)..
extern const fltx4 Four_2ToThe22s;     // (1<<22)..
extern const fltx4 Four_2ToThe23s;     // (1<<23)..
extern const fltx4 Four_2ToThe24s;     // (1<<24)..
extern const fltx4 Four_Origin;        // 0 0 0 1 (origin point, like vr0 on the PS2)
extern const fltx4 Four_NegativeOnes;  // -1 -1 -1 -1
#endif

extern const fltx4 Four_FLT_MAX;           // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
extern const fltx4 Four_Negative_FLT_MAX;  // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
extern const fltx4 g_SIMD_0123;            // 0 1 2 3 as float

// external aligned integer constants
extern const ALIGN16 uint32 g_SIMD_clear_signmask[] ALIGN16_POST;    // 0x7fffffff x 4
extern const ALIGN16 uint32 g_SIMD_signmask[] ALIGN16_POST;          // 0x80000000 x 4
extern const ALIGN16 uint32 g_SIMD_lsbmask[] ALIGN16_POST;           // 0xfffffffe x 4
extern const ALIGN16 uint32 g_SIMD_clear_wmask[] ALIGN16_POST;       // -1 -1 -1 0
extern const ALIGN16 uint32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
extern const ALIGN16 uint32 g_SIMD_AllOnesMask[] ALIGN16_POST;       // ~0,~0,~0,~0
extern const ALIGN16 uint32 g_SIMD_Low16BitsMask[] ALIGN16_POST;     // 0xffff x 4
// this mask is used for skipping the tail of things. If you have N elements in an array, and wish
// to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
extern const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;
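
// Illustrative sketch (editorial addition): applying the skip-tail mask on the
// final iteration of a 4-wide loop over an N-element array, so the lanes past
// the end of the data contribute zero:
//
//     fltx4 tail = LoadAlignedSIMD( g_SIMD_SkipTailMask[ N & 3 ] );
//     result = AndSIMD( result, tail );  // zero out the lanes beyond element N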
// Define prefetch macros.
// The characteristics of cache and prefetch are completely
// different between the different platforms, so you DO NOT
// want to just define one macro that maps to every platform
// intrinsic under the hood -- you need to prefetch at different
// intervals between x86 and PPC, for example, and that is
// a higher level code change.
// On the other hand, I'm tired of typing #ifdef _X360
// all over the place, so this is just a nop on Intel, PS3.
#ifdef _X360
#define PREFETCH360(address, offset) __dcbt(offset,address)
#else
#define PREFETCH360(x,y) // nothing
#endif
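
// Illustrative sketch (editorial addition, names hypothetical): prefetching a
// cache line ahead in a streaming loop. On the 360 this issues a dcbt for the
// data 128 bytes ahead; on other platforms it compiles away entirely:
//
//     for ( unsigned int i = 0; i < count; ++i )
//     {
//         PREFETCH360( &pData[i], 128 );
//         ProcessElement( pData[i] );
//     }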
#if USE_STDC_FOR_SIMD

//---------------------------------------------------------------------
// Standard C (fallback/Linux) implementation (only there for compat - slow)
//---------------------------------------------------------------------

FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
    return a.m128_f32[ idx ];
}

FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
    return a.m128_f32[idx];
}

FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
    return a.m128_u32[idx];
}

FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
    return a.m128_u32[idx];
}
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
    return Four_Zeros;
}

// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
    return Four_Ones;
}
FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = SubFloat( a, 0 );
    SubFloat( retVal, 2 ) = SubFloat( a, 0 );
    SubFloat( retVal, 3 ) = SubFloat( a, 0 );
    return retVal;
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 1 );
    SubFloat( retVal, 1 ) = SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = SubFloat( a, 1 );
    SubFloat( retVal, 3 ) = SubFloat( a, 1 );
    return retVal;
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 2 );
    SubFloat( retVal, 1 ) = SubFloat( a, 2 );
    SubFloat( retVal, 2 ) = SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = SubFloat( a, 2 );
    return retVal;
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 3 );
    SubFloat( retVal, 1 ) = SubFloat( a, 3 );
    SubFloat( retVal, 2 ) = SubFloat( a, 3 );
    SubFloat( retVal, 3 ) = SubFloat( a, 3 );
    return retVal;
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
    fltx4 result = a;
    SubFloat( result, 0 ) = SubFloat( x, 0 );
    return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
    fltx4 result = a;
    SubFloat( result, 1 ) = SubFloat( y, 1 );
    return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
    fltx4 result = a;
    SubFloat( result, 2 ) = SubFloat( z, 2 );
    return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
    fltx4 result = a;
    SubFloat( result, 3 ) = SubFloat( w, 3 );
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
    fltx4 result = a;
    SubFloat( result, nComponent ) = flValue;
    return result;
}

// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 1 );
    SubFloat( retVal, 1 ) = SubFloat( a, 2 );
    SubFloat( retVal, 2 ) = SubFloat( a, 3 );
    SubFloat( retVal, 3 ) = SubFloat( a, 0 );
    return retVal;
}

// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = SubFloat( a, 2 );
    SubFloat( retVal, 1 ) = SubFloat( a, 3 );
    SubFloat( retVal, 2 ) = SubFloat( a, 0 );
    SubFloat( retVal, 3 ) = SubFloat( a, 1 );
    return retVal;
}
#define BINOP(op) \
    fltx4 retVal; \
    SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \
    SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \
    SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \
    SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \
    return retVal;

#define IBINOP(op) \
    fltx4 retVal; \
    SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \
    SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \
    SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \
    SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \
    return retVal;
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
    BINOP(+);
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
{
    BINOP(-);
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
{
    BINOP(*);
}

FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
{
    BINOP(/);
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
{
    return AddSIMD( MulSIMD(a,b), c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
{
    return SubSIMD( c, MulSIMD(a,b) );
}
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
    fltx4 result;
    SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
    SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
    SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
    SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
    return result;
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
    SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
    SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
    SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
    SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
    SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    fltx4 result;
    SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
    SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
    SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
    SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
    return result;
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    fltx4 result;
    SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
    SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
    SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
    SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
    return result;
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    fltx4 result;
    SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return result;
}
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
    SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
    SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
    SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
{
    IBINOP(&);
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 );
    SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 );
    SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 );
    SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 );
    return retVal;
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
{
    IBINOP(^);
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
{
    IBINOP(|);
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
    fltx4 retval;
    SubFloat( retval, 0 ) = -SubFloat( a, 0 );
    SubFloat( retval, 1 ) = -SubFloat( a, 1 );
    SubFloat( retval, 2 ) = -SubFloat( a, 2 );
    SubFloat( retval, 3 ) = -SubFloat( a, 3 );
    return retval;
}

FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
{
    return ( SubFloat( a, 0 ) == 0.0 ) &&
           ( SubFloat( a, 1 ) == 0.0 ) &&
           ( SubFloat( a, 2 ) == 0.0 ) &&
           ( SubFloat( a, 3 ) == 0.0 );
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    return SubFloat(a,0) > SubFloat(b,0) &&
           SubFloat(a,1) > SubFloat(b,1) &&
           SubFloat(a,2) > SubFloat(b,2) &&
           SubFloat(a,3) > SubFloat(b,3);
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    return SubFloat(a,0) >= SubFloat(b,0) &&
           SubFloat(a,1) >= SubFloat(b,1) &&
           SubFloat(a,2) >= SubFloat(b,2) &&
           SubFloat(a,3) >= SubFloat(b,3);
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
    return SubFloat(a,0) == SubFloat(b,0) &&
           SubFloat(a,1) == SubFloat(b,1) &&
           SubFloat(a,2) == SubFloat(b,2) &&
           SubFloat(a,3) == SubFloat(b,3);
}

FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    int nRet = 0;
    nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3
    return nRet;
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    return (0 != TestSignSIMD( a ));
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0;
    return retVal;
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0;
    SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0;
    SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0;
    SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0;
    return retVal;
}
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return OrSIMD(
        AndSIMD( ReplacementMask, NewValue ),
        AndNotSIMD( ReplacementMask, OldValue ) );
}
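
// Illustrative sketch (editorial addition, names hypothetical): MaskedAssign()
// plus a comparison gives a branchless per-lane select, e.g. capping each lane
// of x at limit:
//
//     fltx4 mask    = CmpGtSIMD( x, limit );           // ~0 in lanes where x > limit
//     fltx4 clamped = MaskedAssign( mask, limit, x );  // limit there, x elsewhere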
FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = flValue;
    SubFloat( retVal, 1 ) = flValue;
    SubFloat( retVal, 2 ) = flValue;
    SubFloat( retVal, 3 ) = flValue;
    return retVal;
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
    fltx4 retVal;
    SubInt( retVal, 0 ) = nValue;
    SubInt( retVal, 1 ) = nValue;
    SubInt( retVal, 2 ) = nValue;
    SubInt( retVal, 3 ) = nValue;
    return retVal;
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
    return retVal;
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
    return retVal;
}
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
    return retVal;
}
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
    SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
    SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
    SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
    SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
    SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
    SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
    return retVal;
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
    SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
    SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
    SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
    return retVal;
}

FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
    SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
    SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
    SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
    return retVal;
}
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
    fltx4 retVal;
    SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
    SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
    SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
    SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
    return retVal;
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
    float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
                  SubFloat( a, 1 ) * SubFloat( b, 1 ) +
                  SubFloat( a, 2 ) * SubFloat( b, 2 );
    return ReplicateX4( flDot );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
    float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
                  SubFloat( a, 1 ) * SubFloat( b, 1 ) +
                  SubFloat( a, 2 ) * SubFloat( b, 2 ) +
                  SubFloat( a, 3 ) * SubFloat( b, 3 );
    return ReplicateX4( flDot );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
    return MaxSIMD( min, MinSIMD( max, in ) );
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    fltx4 retval;
    retval = a;
    SubFloat( retval, 3 ) = 0;
    return retval;
}
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
{
    fltx4 retval = LoadAlignedSIMD( pSIMD.Base() );
    // squelch w
    SubInt( retval, 3 ) = 0;
    return retval;
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    *pSIMD     = SubFloat(a, 0);
    *(pSIMD+1) = SubFloat(a, 1);
    *(pSIMD+2) = SubFloat(a, 2);
}

// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
    StoreAlignedSIMD( pSIMD->Base(), a );
}

FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
#define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
    SWAP_FLOATS( x, 1, y, 0 );
    SWAP_FLOATS( x, 2, z, 0 );
    SWAP_FLOATS( x, 3, w, 0 );
    SWAP_FLOATS( y, 2, z, 1 );
    SWAP_FLOATS( y, 3, w, 1 );
    SWAP_FLOATS( z, 3, w, 2 );
}
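
// Illustrative sketch (editorial addition, names hypothetical): TransposeSIMD()
// turns four xyzw points (array-of-structures) into per-component lane vectors
// (structure-of-arrays), so one operation works on four points at a time:
//
//     fltx4 p0 = LoadAlignedSIMD( pPoints + 0 );
//     fltx4 p1 = LoadAlignedSIMD( pPoints + 4 );
//     fltx4 p2 = LoadAlignedSIMD( pPoints + 8 );
//     fltx4 p3 = LoadAlignedSIMD( pPoints + 12 );
//     TransposeSIMD( p0, p1, p2, p3 ); // p0 = xxxx, p1 = yyyy, p2 = zzzz, p3 = wwww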
// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
    float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2) );
    return ReplicateX4( lowest );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
    float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2) );
    return ReplicateX4( highest );
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
    (*pDest)[0] = SubFloat(vSrc, 0);
    (*pDest)[1] = SubFloat(vSrc, 1);
    (*pDest)[2] = SubFloat(vSrc, 2);
    (*pDest)[3] = SubFloat(vSrc, 3);
}

// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------

// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
{
    fltx4 retval;
    SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3 ) = nValue;
    return retval;
}

// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD )
{
    return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
}

// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD )
{
    return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
}

// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
}

FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
}
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
{
    Assert(0); /* pc has no such operation */
    fltx4 retval;
    SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
    SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
    SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
    SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
    return retval;
}
#if 0 /* pc has no such op */
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
    fltx4 retval;
    SubFloat( retval, 0 ) = ( (float) (int32) vSrcA.m128_u32[0] );
    SubFloat( retval, 1 ) = ( (float) (int32) vSrcA.m128_u32[1] );
    SubFloat( retval, 2 ) = ( (float) (int32) vSrcA.m128_u32[2] );
    SubFloat( retval, 3 ) = ( (float) (int32) vSrcA.m128_u32[3] );
    return retval;
}

/*
   works on fltx4's as if they are four uints.
   the first parameter contains the words to be shifted,
   the second contains the amount to shift by AS INTS

   for i = 0 to 3:
       shift = the shift count held in word i of vSrcB
       vReturned[i] = vSrcA[i] << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD( const i32x4 &vSrcA, const i32x4 &vSrcB )
{
    i32x4 retval;
    SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
    SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
    SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
    SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
    return retval;
}
#endif
#elif ( defined( _X360 ) )

//---------------------------------------------------------------------
// X360 implementation
//---------------------------------------------------------------------

FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_f32[idx];
}

FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
{
    fltx4_union & a_union = (fltx4_union &)a;
    return a_union.m128_u32[idx];
}

FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
{
    return __vaddfp( a, b );
}

FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
{
    return __vsubfp( a, b );
}

FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
{
    return __vmulfp( a, b );
}

FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
{
    return __vmaddfp( a, b, c );
}

FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
{
    return __vnmsubfp( a, b, c );
}

FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
    return __vmsum3fp( a, b );
}

FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
    return __vmsum4fp( a, b );
}

FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
    return XMVectorSin( radians );
}

FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
    XMVectorSinCos( &sine, &cosine, radians );
}

FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
{
    cosine = XMVectorCos( radians );
}

FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
    return XMVectorASin( sine );
}

FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
    return XMVectorACos( cs );
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
    return XMVectorATan2( a, b );
}
// DivSIMD defined further down, since it uses ReciprocalSIMD

FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
{
    return __vmaxfp( a, b );
}

FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
{
    return __vminfp( a, b );
}

FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
{
    return __vand( a, b );
}

FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
{
    // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
    return __vandc( b, a );
}

FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
{
    return __vxor( a, b );
}

FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
{
    return __vor( a, b );
}

FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
    return XMVectorNegate(a);
}

FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
{
    unsigned int equalFlags = 0;
    __vcmpeqfpR( a, Four_Zeros, &equalFlags );
    return XMComparisonAllTrue( equalFlags );
}

FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
{
    unsigned int conditionregister;
    XMVectorEqualR( &conditionregister, a, XMVectorZero() );
    return XMComparisonAnyTrue( conditionregister );
}

FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
{
    // copy a's x component into w, in case w was zero.
    fltx4 temp = __vrlimi(a, a, 1, 1);
    unsigned int conditionregister;
    XMVectorEqualR( &conditionregister, temp, XMVectorZero() );
    return XMComparisonAnyTrue( conditionregister );
}

// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
    unsigned int cr;
    XMVectorGreaterOrEqualR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
    unsigned int cr;
    XMVectorEqualR( &cr, a, b );
    return XMComparisonAllTrue( cr );
}

FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
    // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
    int nRet = 0;
    const fltx4_union & a_union = (const fltx4_union &)a;
    nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
    nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
    nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
    nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3
    return nRet;
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
    return __vrlimi( a, __vzero(), 1, 0 );
}

FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
    // NOTE: this tests the top bits of each vector element using integer math
    // (so it ignores NaNs - it will return true for "-NaN")
    unsigned int equalFlags = 0;
    fltx4 signMask = __vspltisw( -1 );       // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
    signMask = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000
    __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
    return !XMComparisonAllTrue( equalFlags );
}

FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
    return __vcmpeqfp( a, b );
}

FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
    return __vcmpgtfp( a, b );
}

FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
    return __vcmpgefp( a, b );
}

FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
    return __vcmpgtfp( b, a );
}

FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
    return __vcmpgefp( b, a );
}

FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
    return XMVectorInBounds( a, b );
}

// returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
    return __vsel( OldValue, NewValue, ReplacementMask );
}
// AKA "Broadcast", "Splat"
FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
{
    // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
    float * pValue = &flValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0 );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a
{
    Assert( pValue );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int nValue )
{
    // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
    int * pValue = &nValue;
    Assert( pValue );
    Assert( ((unsigned int)pValue & 3) == 0 );
    return __vspltw( __lvlx( pValue, 0 ), 0 );
}

// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
    return __vrfip(a);
}

// Round towards nearest integer
FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
{
    return __vrfin(a);
}

// Round towards negative infinity
FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
{
    return __vrfim(a);
}

FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
{
    // This is emulated from rsqrt
    return XMVectorSqrtEst( a );
}

FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
{
    // This is emulated from rsqrt
    return XMVectorSqrt( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
{
    return __vrsqrtefp( a );
}

FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    return ReciprocalSqrtEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocalSqrt( a );
}

FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
{
    return __vrefp( a );
}

/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
    // This uses Newton-Raphson to improve the HW result
    return XMVectorReciprocal( a );
}

// FIXME: on 360, this is very slow, since it uses ReciprocalSIMD (do we need DivEstSIMD?)
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
{
    return MulSIMD( ReciprocalSIMD( b ), a );
}

/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    return ReciprocalEstSIMD( a_safe );
}

FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
    // Convert zeros to epsilons
    fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
    fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
    return ReciprocalSIMD( a_safe );

    // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
    // fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
    // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
    // return ReciprocalSIMD( a_safe );
}
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
    return XMVectorExp( toPower );
}

// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
    return XMVectorClamp( in, min, max );
}

FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
    return XMLoadVector4( pSIMD );
}

// load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
    return XMLoadVector3( pSIMD );
}

FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
    return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
{
    fltx4 out = XMLoadVector3A( pSIMD.Base() );
    // squelch w
    return __vrlimi( out, __vzero(), 1, 0 );
}

// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
{
    fltx4 out = XMLoadVector3A( pSIMD );
    // squelch w
    return __vrlimi( out, __vzero(), 1, 0 );
}

FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
{
    *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
}

FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
{
    XMStoreVector4( pSIMD, a );
}

FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
    XMStoreVector3( pSIMD, a );
}

// strongly typed -- for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
    XMStoreVector3A( pSIMD->Base(), a );
}

// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
    fltx4 asInt = __vctsxs( vSrc, 0 );
    XMStoreVector4A( pDest->Base(), asInt );
}

FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
    XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
    xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
    x = xyzwMatrix.r[0];
    y = xyzwMatrix.r[1];
    z = xyzwMatrix.r[2];
    w = xyzwMatrix.r[3];
}
// Return zero in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
    return XMVectorZero();
}

// Return one in the fastest way -- faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
    return XMVectorSplatOne();
}
FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
{
    return XMVectorSplatX( a );
}

FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
{
    return XMVectorSplatY( a );
}

FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
{
    return XMVectorSplatZ( a );
}

FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
{
    return XMVectorSplatW( a );
}

FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
    fltx4 result = __vrlimi(a, x, 8, 0);
    return result;
}

FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
    fltx4 result = __vrlimi(a, y, 4, 0);
    return result;
}

FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
    fltx4 result = __vrlimi(a, z, 2, 0);
    return result;
}

FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
    fltx4 result = __vrlimi(a, w, 1, 0);
    return result;
}

FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
    static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
    fltx4 val = ReplicateX4( flValue );
    fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
    return result;
}

FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
}

FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
    fltx4 compareOne = a;
    return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
}

// find the lowest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
    // compareOne is [y,z,G,G]
    fltx4 retval = MinSIMD( a, compareOne );
    // retVal is [min(x,y), min(y,z), G, G]
    compareOne = __vrlimi( compareOne, a, 8, 2 );
    // compareOne is [z, G, G, G]
    retval = MinSIMD( retval, compareOne );
    // retVal = [ min(min(x,y),z), G, G, G ]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}

// find the highest component of a.x, a.y, a.z,
// and replicate it to the whole return value.
// ignores a.w.
// Though this is only five instructions long,
// they are all dependent, making this stall city.
// Forcing this inline should hopefully help with scheduling.
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
{
    // a is [x,y,z,G] (where G is garbage)
    // rotate left by one
    fltx4 compareOne = a;
    compareOne = __vrlimi( compareOne, a, 8 | 4, 1 );
    // compareOne is [y,z,G,G]
    fltx4 retval = MaxSIMD( a, compareOne );
    // retVal is [max(x,y), max(y,z), G, G]
    compareOne = __vrlimi( compareOne, a, 8, 2 );
    // compareOne is [z, G, G, G]
    retval = MaxSIMD( retval, compareOne );
    // retVal = [ max(max(x,y),z), G, G, G ]
    // splat the x component out to the whole vector and return
    return SplatXSIMD( retval );
}

// Transform many (horizontal) points in-place by a 3x4 matrix,
// here already loaded onto three fltx4 registers.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// To spare yourself the annoyance of loading the matrix yourself,
// use one of the overloads below.
void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3);

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix need not be aligned.
FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
{
    return TransformManyPointsBy(pVectors, numVectors,
        LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) );
}

// Transform many (horizontal) points in-place by a 3x4 matrix.
// The points must be stored as 16-byte aligned. They are points
// and not vectors because we assume the w-component to be 1.
// In this function, the matrix must itself be aligned on a 16-byte
// boundary.
FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
{
    return TransformManyPointsBy(pVectors, numVectors,
        LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) );
}
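
// Illustrative sketch (editorial addition, names hypothetical): transforming a
// 16-byte-aligned point array in place by a bone matrix, letting the overload
// above load the matrix rows:
//
//     TransformManyPointsBy( pPoints, nPoints, boneToWorld );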
  1349. // ------------------------------------
  1350. // INTEGER SIMD OPERATIONS.
  1351. // ------------------------------------
  1352. // Load 4 aligned words into a SIMD register
  1353. FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
  1354. {
  1355. return XMLoadVector4A(pSIMD);
  1356. }
  1357. // Load 4 unaligned words into a SIMD register
  1358. FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
  1359. {
  1360. return XMLoadVector4( pSIMD );
  1361. }
  1362. // save into four words, 16-byte aligned
  1363. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  1364. {
  1365. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  1366. }
  1367. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
  1368. {
  1369. *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
  1370. }
  1371. FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  1372. {
  1373. XMStoreVector4(pSIMD, a);
  1374. }
  1375. // Take a fltx4 containing fixed-point uints and
  1376. // return them as single precision floats. No
  1377. // fixed point conversion is done.
  1378. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  1379. {
  1380. return __vcfux( vSrcA, 0 );
  1381. }
  1382. // Take a fltx4 containing fixed-point sints and
  1383. // return them as single precision floats. No
  1384. // fixed point conversion is done.
  1385. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  1386. {
  1387. return __vcfsx( vSrcA, 0 );
  1388. }
  1389. // Take a fltx4 containing fixed-point uints and
  1390. // return them as single precision floats. Each uint
  1391. // will be divided by 2^immed after conversion
  1392. // (eg, this is fixed point math).
/* as if:
FORCEINLINE fltx4 UnsignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
{
	return __vcfux( vSrcA, uImmed );
}
*/
#define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. Each int
// will be divided by 2^immed (eg, this is fixed point
// math).
/* as if:
FORCEINLINE fltx4 SignedFixedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
{
	return __vcfsx( vSrcA, uImmed );
}
*/
#define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
// set all components of a vector to a signed immediate int number.
/* as if:
FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
{
	return __vspltisw( toImmediate );
}
*/
#define IntSetImmediateSIMD(x) (__vspltisw(x))
/*
works on fltx4's as if they are four uints:
the first parameter contains the words to be shifted,
the second contains the amount to shift by AS INTS

	for i = 0 to 3
		shift = low 5 bits of vSrcB[i]
		vReturned[i] = vSrcA[i] << shift
*/
FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
{
	return __vslw(vSrcA, vSrcB);
}
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_f32[ idx ];
}
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_f32[idx];
}
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	fltx4 t = __vctuxs( a, 0 );
	const fltx4_union & a_union = (const fltx4_union &)t;
	return a_union.m128_u32[idx];
}
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
	const fltx4_union & a_union = (const fltx4_union &)a;
	return a_union.m128_u32[idx];
}
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
	fltx4_union & a_union = (fltx4_union &)a;
	return a_union.m128_u32[idx];
}
#else
//---------------------------------------------------------------------
// Intel/SSE implementation
//---------------------------------------------------------------------
FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_store_ps( pSIMD, a );
}
FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_storeu_ps( pSIMD, a );
}
FORCEINLINE fltx4 RotateLeft( const fltx4 & a );
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );
FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
{
	_mm_store_ss(pSIMD, a);
	_mm_store_ss(pSIMD+1, RotateLeft(a));
	_mm_store_ss(pSIMD+2, RotateLeft2(a));
}
// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
{
	StoreAlignedSIMD( pSIMD->Base(), a );
}
FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
{
	return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) );
}
FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
{
	return _mm_and_ps( a, b );
}
FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
{
	return _mm_andnot_ps( a, b );
}
FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
{
	return _mm_xor_ps( a, b );
}
FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
{
	return _mm_or_ps( a, b );
}
// Squelch the w component of a vector to +0.0.
// Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
{
	return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) );
}
// for the transitional class -- load a 3-by VectorAligned and squash its w component
FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
{
	return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) );
}
FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
{
	return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
}
FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
{
	return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
}
/// replicate a single 32 bit integer value to all 4 components of an m128
FORCEINLINE fltx4 ReplicateIX4( int i )
{
	fltx4 value = _mm_set_ss( * ( ( float *) &i ) );
	return _mm_shuffle_ps( value, value, 0 );
}
FORCEINLINE fltx4 ReplicateX4( float flValue )
{
	__m128 value = _mm_set_ss( flValue );
	return _mm_shuffle_ps( value, value, 0 );
}
FORCEINLINE float SubFloat( const fltx4 & a, int idx )
{
	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
#ifndef POSIX
	return a.m128_f32[ idx ];
#else
	return (reinterpret_cast<float const *>(&a))[idx];
#endif
}
FORCEINLINE float & SubFloat( fltx4 & a, int idx )
{
#ifndef POSIX
	return a.m128_f32[ idx ];
#else
	return (reinterpret_cast<float *>(&a))[idx];
#endif
}
FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
{
	return (uint32)SubFloat(a,idx);
}
FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
{
#ifndef POSIX
	return a.m128_u32[idx];
#else
	return (reinterpret_cast<uint32 const *>(&a))[idx];
#endif
}
FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
{
#ifndef POSIX
	return a.m128_u32[idx];
#else
	return (reinterpret_cast<uint32 *>(&a))[idx];
#endif
}
// Return zero in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadZeroSIMD( void )
{
	return Four_Zeros;
}
// Return one in the fastest way -- on the x360, faster even than loading.
FORCEINLINE fltx4 LoadOneSIMD( void )
{
	return Four_Ones;
}
FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
{
	return OrSIMD(
		AndSIMD( ReplacementMask, NewValue ),
		AndNotSIMD( ReplacementMask, OldValue ) );
}
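// MaskedAssign is the standard branchless select idiom: (mask & new) | (~mask & old),
// with the mask usually produced by one of the Cmp*SIMD routines below. A minimal
// usage sketch (illustrative only, not part of this header's API):
/* as if:
	// clamp the negative lanes of v to zero without branching:
	fltx4 mask   = CmpLtSIMD( v, Four_Zeros );          // ~0 in lanes where v < 0
	fltx4 result = MaskedAssign( mask, Four_Zeros, v );
*/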
// remember, SSE numbers its words 3 2 1 0
// The way we want to specify shuffles is backwards from the default
// MM_SHUFFLE_REV is in array index order (default is reversed)
#define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
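// For example, MM_SHUFFLE_REV( 0, 1, 2, 3 ) is the identity shuffle (the same
// as _MM_SHUFFLE( 3, 2, 1, 0 )), so the arguments read left-to-right as the
// source indices for output lanes x, y, z, w:
/* as if:
	fltx4 same = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 1, 2, 3 ) ); // == a
*/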
FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
{
	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
}
FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
{
	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
}
FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
{
	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
}
FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
{
	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 3, 3, 3, 3 ) );
}
FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
{
	fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a );
	return result;
}
FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
{
	fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a );
	return result;
}
FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
{
	fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a );
	return result;
}
FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
{
	fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a );
	return result;
}
FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
{
	fltx4 val = ReplicateX4( flValue );
	fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] ), val, a );
	return result;
}
// a b c d -> b c d a
FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
{
	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
{
	return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
}
// a b c d -> d a b c
FORCEINLINE fltx4 RotateRight( const fltx4 & a )
{
	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 0, 3, 2, 1 ) );
}
// a b c d -> c d a b
FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
{
	return _mm_shuffle_ps( a, a, _MM_SHUFFLE( 1, 0, 3, 2 ) );
}
FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b
{
	return _mm_add_ps( a, b );
}
FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
{
	return _mm_sub_ps( a, b );
}
FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
{
	return _mm_mul_ps( a, b );
}
FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
{
	return _mm_div_ps( a, b );
}
FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
{
	return AddSIMD( MulSIMD(a,b), c );
}
FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
{
	return SubSIMD( c, MulSIMD(a,b) );
}
FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
{
	fltx4 m = MulSIMD( a, b );
	float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 );
	return ReplicateX4( flDot );
}
FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
{
	fltx4 m = MulSIMD( a, b );
	float flDot = SubFloat( m, 0 ) + SubFloat( m, 1 ) + SubFloat( m, 2 ) + SubFloat( m, 3 );
	return ReplicateX4( flDot );
}
//TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
{
	fltx4 result;
	SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
	SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
	SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
	SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
	return result;
}
FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	// FIXME: Make a fast SSE version
	SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
	SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
	SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
}
FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
{
	// FIXME: Make a fast SSE version
	SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
	SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
	SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
	SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
}
//TODO: implement as four-way Taylor series (see xbox implementation)
FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
{
	// FIXME: Make a fast SSE version
	fltx4 result;
	SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
	SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
	SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
	SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
	return result;
}
FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
{
	fltx4 result;
	SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
	SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
	SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
	SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
	return result;
}
// tan^-1(a/b) .. ie, pass sin in as a and cos in as b
FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
{
	fltx4 result;
	SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
	SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
	SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
	SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
	return result;
}
FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
{
	return SubSIMD(LoadZeroSIMD(),a);
}
FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
{
	return _mm_movemask_ps( a );
}
FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
{
	return (0 != TestSignSIMD( a ));
}
FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
{
	return _mm_cmpeq_ps( a, b );
}
FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
{
	return _mm_cmpgt_ps( a, b );
}
FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
{
	return _mm_cmpge_ps( a, b );
}
FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
{
	return _mm_cmplt_ps( a, b );
}
FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
{
	return _mm_cmple_ps( a, b );
}
// for branching when a.xyzw > b.xyzw
FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
{
	return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0;
}
// for branching when a.xyzw >= b.xyzw
FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
{
	return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0;
}
// For branching if all a.xyzw == b.xyzw
FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
{
	return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf;
}
FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
{
	return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) );
}
FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
{
	return _mm_min_ps( a, b );
}
FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
{
	return _mm_max_ps( a, b );
}
// SSE lacks rounding operations.
// Really.
// You can emulate them by setting the rounding mode for the
// whole processor and then converting to int, and then back again.
// But every time you set the rounding mode, you clear out the
// entire pipeline. So, I can't do them per operation. You
// have to do it once, before the loop that would call these.
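// A sketch of doing that once around a loop, using the MXCSR helpers from
// <xmmintrin.h> (set the mode before the loop, restore it afterwards):
/* as if:
	unsigned int oldMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE( _MM_ROUND_DOWN );	// round toward -infinity
	// ... loop over data, calling the conversion-dependent routines ...
	_MM_SET_ROUNDING_MODE( oldMode );
*/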
// Round towards positive infinity
FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
{
	fltx4 retVal;
	SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
	SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
	SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
	SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
	return retVal;
}
fltx4 fabs( const fltx4 & x );
// Round towards negative infinity
// This is the implementation that was here before; it assumes
// you are in round-to-floor mode, which I guess is usually the
// case for us vis-a-vis SSE. It's totally unnecessary on
// VMX, which has a native floor op.
FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
	return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits
}
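// How the 2^23 trick above works, as a scalar sketch (assumes the default
// round-to-nearest mode): floats >= 2^23 have no fraction bits, so adding
// and then subtracting 2^23 snaps |val| to a whole number; the compare-and-fix
// step turns round-to-nearest into a true floor, and the final Xor restores
// the original sign bit.
/* as if, per lane:
	float a  = fabsf( val );
	float iv = ( a + 8388608.0f ) - 8388608.0f;	// e.g. 2.7f snaps to 3.0f
	if ( iv > a )
		iv -= 1.0f;								// 3.0f -> 2.0f == floorf( 2.7f )
*/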
inline bool IsAllZeros( const fltx4 & var )
{
	return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF;
}
FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
{
	return _mm_sqrt_ps( a );
}
FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
{
	return _mm_sqrt_ps( a );
}
FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
{
	return _mm_rsqrt_ps( a );
}
FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
{
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	ret = ReciprocalSqrtEstSIMD( ret );
	return ret;
}
/// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD
FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
{
	fltx4 guess = ReciprocalSqrtEstSIMD( a );
	// newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
	guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess ))));
	guess = MulSIMD( Four_PointFives, guess);
	return guess;
}
FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
{
	return _mm_rcp_ps( a );
}
/// 1/x for all 4 values, more or less
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
{
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	ret = ReciprocalEstSIMD( ret );
	return ret;
}
/// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
/// No error checking!
FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
{
	fltx4 ret = ReciprocalEstSIMD( a );
	// newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
	ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) );
	return ret;
}
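// Derivation sketch: the line above is Newton's method applied to
// f(y) = 1/y - a, which gives y(n+1) = y(n)*(2 - a*y(n)) = 2*y(n) - a*y(n)^2.
// Each step roughly doubles the number of correct bits, so one iteration
// takes rcpps' roughly 12-bit estimate to about 22-23 bits of precision.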
/// 1/x for all 4 values.
/// 1/0 will result in a big but NOT infinite result
FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
{
	fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
	fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
	ret = ReciprocalSIMD( ret );
	return ret;
}
// CHRISG: is it worth doing integer bitfiddling for this?
// 2^x for all values (the antilog)
FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) );
	SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) );
	SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) );
	SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) );
	return retval;
}
// Clamps the components of a vector to a specified minimum and maximum range.
FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
{
	return MaxSIMD( min, MinSIMD( max, in ) );
}
FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
{
	_MM_TRANSPOSE4_PS( x, y, z, w );
}
FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = RotateLeft( a );
	// compareOne is [y,z,G,x]
	fltx4 retval = MinSIMD( a, compareOne );
	// retVal is [min(x,y), ... ]
	compareOne = RotateLeft2( a );
	// compareOne is [z, G, x, y]
	retval = MinSIMD( retval, compareOne );
	// retVal = [ min(min(x,y),z)..]
	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}
FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
{
	// a is [x,y,z,G] (where G is garbage)
	// rotate left by one
	fltx4 compareOne = RotateLeft( a );
	// compareOne is [y,z,G,x]
	fltx4 retval = MaxSIMD( a, compareOne );
	// retVal is [max(x,y), ... ]
	compareOne = RotateLeft2( a );
	// compareOne is [z, G, x, y]
	retval = MaxSIMD( retval, compareOne );
	// retVal = [ max(max(x,y),z)..]
	// splat the x component out to the whole vector and return
	return SplatXSIMD( retval );
}
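// A usage sketch (illustrative only; 'vec' is a hypothetical Vector): find
// the largest coordinate of a Vector without leaving SIMD registers.
/* as if:
	fltx4 v      = LoadUnaligned3SIMD( vec.Base() );	// [x, y, z, ?]
	fltx4 maxAll = FindHighestSIMD3( v );				// max splatted to all lanes
	float flMax  = SubFloat( maxAll, 0 );
*/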
// ------------------------------------
// INTEGER SIMD OPERATIONS.
// ------------------------------------
#if 0 /* pc does not have these ops */
// splat all components of a vector to a signed immediate int number.
FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
{
	//CHRISG: SSE2 has this, but not SSE1. What to do?
	fltx4 retval;
	SubInt( retval, 0 ) = to;
	SubInt( retval, 1 ) = to;
	SubInt( retval, 2 ) = to;
	SubInt( retval, 3 ) = to;
	return retval;
}
#endif
// Load 4 aligned words into a SIMD register
FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
{
	return _mm_load_ps( reinterpret_cast<const float *>(pSIMD) );
}
// Load 4 unaligned words into a SIMD register
FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
{
	return _mm_loadu_ps( reinterpret_cast<const float *>(pSIMD) );
}
// save into four words, 16-byte aligned
FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_store_ps( reinterpret_cast<float *>(pSIMD), a );
}
FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
{
	_mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a );
}
FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
{
	_mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a );
}
// CHRISG: the conversion functions all seem to operate on m64's only...
// how do we make them work here?
// Take a fltx4 containing fixed-point uints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
	SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
	SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
	SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
	return retval;
}
// Take a fltx4 containing fixed-point sints and
// return them as single precision floats. No
// fixed point conversion is done.
FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
{
	fltx4 retval;
	SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[0]));
	SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[1]));
	SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[2]));
	SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[3]));
	return retval;
}
/*
works on fltx4's as if they are four uints:
the first parameter contains the words to be shifted,
the second contains the amount to shift by AS INTS

	for i = 0 to 3
		shift = low 5 bits of vSrcB[i]
		vReturned[i] = vSrcA[i] << shift
*/
FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
{
	i32x4 retval;
	SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
	SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
	SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
	SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
	return retval;
}
// Fixed-point conversion and save as SIGNED INTS.
// pDest->x = Int (vSrc.x)
// note: some architectures have means of doing
// fixed point conversion when the fix depth is
// specified as an immediate.. but there is no way
// to guarantee an immediate as a parameter to function
// like this.
FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
{
#if defined( COMPILER_MSVC64 )
	(*pDest)[0] = SubFloat( vSrc, 0 );
	(*pDest)[1] = SubFloat( vSrc, 1 );
	(*pDest)[2] = SubFloat( vSrc, 2 );
	(*pDest)[3] = SubFloat( vSrc, 3 );
#else
	__m64 bottom = _mm_cvttps_pi32( vSrc );
	__m64 top    = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) );
	*reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
	*reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;
	_mm_empty();
#endif
}
#endif
/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
class ALIGN16 FourVectors
{
public:
	fltx4 x, y, z;
	FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value
	{
		x=ReplicateX4(v.x);
		y=ReplicateX4(v.y);
		z=ReplicateX4(v.z);
	}
	FORCEINLINE fltx4 const & operator[](int idx) const
	{
		return *((&x)+idx);
	}
	FORCEINLINE fltx4 & operator[](int idx)
	{
		return *((&x)+idx);
	}
	FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors
	{
		x=AddSIMD(x,b.x);
		y=AddSIMD(y,b.y);
		z=AddSIMD(z,b.z);
	}
	FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4
	{
		x=SubSIMD(x,b.x);
		y=SubSIMD(y,b.y);
		z=SubSIMD(z,b.z);
	}
	FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale
	{
		x=MulSIMD(x,b.x);
		y=MulSIMD(y,b.y);
		z=MulSIMD(z,b.z);
	}
	FORCEINLINE void operator*=(const fltx4 & scale) //< scale
	{
		x=MulSIMD(x,scale);
		y=MulSIMD(y,scale);
		z=MulSIMD(z,scale);
	}
	FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors
	{
		fltx4 scalepacked = ReplicateX4(scale);
		*this *= scalepacked;
	}
	FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products
	{
		fltx4 dot=MulSIMD(x,b.x);
		dot=MaddSIMD(y,b.y,dot);
		dot=MaddSIMD(z,b.z,dot);
		return dot;
	}
	FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector
	{
		fltx4 dot=MulSIMD(x,ReplicateX4(b.x));
		dot=MaddSIMD(y,ReplicateX4(b.y), dot);
		dot=MaddSIMD(z,ReplicateX4(b.z), dot);
		return dot;
	}
	FORCEINLINE void VProduct(FourVectors const &b) //< component by component mul
	{
		x=MulSIMD(x,b.x);
		y=MulSIMD(y,b.y);
		z=MulSIMD(z,b.z);
	}
	FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z)
	{
		x=ReciprocalSIMD(x);
		y=ReciprocalSIMD(y);
		z=ReciprocalSIMD(z);
	}
	FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23
	{
		x=ReciprocalSaturateSIMD(x);
		y=ReciprocalSaturateSIMD(y);
		z=ReciprocalSaturateSIMD(z);
	}
	// Assume the given matrix is a rotation, and rotate these vectors by it.
	// If you have a long list of FourVectors structures that you all want
	// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
	inline void RotateBy(const matrix3x4_t& matrix);
	/// You can use this to rotate a long array of FourVectors all by the same
	/// matrix. The first parameter is the head of the array. The second is the
	/// number of vectors to rotate. The third is the matrix.
	static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
	/// Assume the vectors are points, and transform them in place by the matrix.
	inline void TransformBy(const matrix3x4_t& matrix);
	/// You can use this to Transform a long array of FourVectors all by the same
	/// matrix. The first parameter is the head of the array. The second is the
	/// number of vectors to rotate. The third is the matrix. The fourth is the
	/// output buffer, which must not overlap the pVectors buffer. This is not
	/// an in-place transformation.
	static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut );
	/// You can use this to Transform a long array of FourVectors all by the same
	/// matrix. The first parameter is the head of the array. The second is the
	/// number of vectors to rotate. The third is the matrix.
	/// This is an in-place transformation.
	static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
	// X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
	FORCEINLINE const float & X(int idx) const
	{
		// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
		return SubFloat( (fltx4 &)x, idx );
	}
	FORCEINLINE const float & Y(int idx) const
	{
		return SubFloat( (fltx4 &)y, idx );
	}
	FORCEINLINE const float & Z(int idx) const
	{
		return SubFloat( (fltx4 &)z, idx );
	}
	FORCEINLINE float & X(int idx)
	{
		return SubFloat( x, idx );
	}
	FORCEINLINE float & Y(int idx)
	{
		return SubFloat( y, idx );
	}
	FORCEINLINE float & Z(int idx)
	{
		return SubFloat( z, idx );
	}
	FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors
	{
		return Vector( X(idx), Y(idx), Z(idx) );
	}
	FourVectors(void)
	{
	}
	FourVectors( FourVectors const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
	}
	FORCEINLINE void operator=( FourVectors const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
	}
	/// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
	FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
	{
		// TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
		// use an unfolded implementation here
#if _X360
		fltx4 tx = LoadUnalignedSIMD( &a.x );
		fltx4 ty = LoadUnalignedSIMD( &b.x );
		fltx4 tz = LoadUnalignedSIMD( &c.x );
		fltx4 tw = LoadUnalignedSIMD( &d.x );
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);
		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
#else
		x = LoadUnalignedSIMD( &( a.x ));
		y = LoadUnalignedSIMD( &( b.x ));
		z = LoadUnalignedSIMD( &( c.x ));
		fltx4 w = LoadUnalignedSIMD( &( d.x ));
		// now, matrix is:
		// x y z ?
		// x y z ?
		// x y z ?
		// x y z ?
		TransposeSIMD(x, y, z, w);
#endif
	}
	/// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
	/// all 4 vectors must be aligned on 16-byte (128-bit) boundaries
	FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
	{
#if _X360
		fltx4 tx = LoadAlignedSIMD(a);
		fltx4 ty = LoadAlignedSIMD(b);
		fltx4 tz = LoadAlignedSIMD(c);
		fltx4 tw = LoadAlignedSIMD(d);
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);
		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
#else
		x = LoadAlignedSIMD( a );
		y = LoadAlignedSIMD( b );
		z = LoadAlignedSIMD( c );
		fltx4 w = LoadAlignedSIMD( d );
		// now, matrix is:
		// x y z ?
		// x y z ?
		// x y z ?
		// x y z ?
		TransposeSIMD( x, y, z, w );
#endif
	}
	FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
	{
		LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x );
	}
	/// return the squared length of all 4 vectors
	FORCEINLINE fltx4 length2(void) const
	{
		return (*this)*(*this);
	}
	/// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
	FORCEINLINE fltx4 length(void) const
	{
		return SqrtEstSIMD(length2());
	}
	/// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
	FORCEINLINE void VectorNormalizeFast(void)
	{
		fltx4 mag_sq=(*this)*(*this); // length^2
		(*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2))
	}
	/// normalize all 4 vectors in place.
	FORCEINLINE void VectorNormalize(void)
	{
		fltx4 mag_sq=(*this)*(*this); // length^2
		(*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2))
	}
	/// construct a FourVectors from 4 separate Vectors
	FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
	{
		LoadAndSwizzle(a,b,c,d);
	}
	/// construct a FourVectors from 4 separate Vectors
	FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d)
	{
		LoadAndSwizzleAligned(a,b,c,d);
	}
	FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt )
	{
		fltx4 fl4dX = SubSIMD( pnt.x, x );
		fltx4 fl4dY = SubSIMD( pnt.y, y );
		fltx4 fl4dZ = SubSIMD( pnt.z, z );
		return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) );
	}
	FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const
	{
		FourVectors lineDelta = p1;
		lineDelta -= p0;
		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
		FourVectors v4OurPnt = *this;
		v4OurPnt -= p0;
		return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
	}
	FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const
	{
		FourVectors lineDelta = p1;
		FourVectors v4OurPnt = *this;
		v4OurPnt -= p0;
		lineDelta -= p0;
		fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
		fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
		fl4T = MinSIMD( fl4T, Four_Ones );
		fl4T = MaxSIMD( fl4T, Four_Zeros );
		lineDelta *= fl4T;
		return v4OurPnt.DistToSqr( lineDelta );
	}
};
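// A minimal usage sketch for FourVectors (illustrative only; vecA..vecD are
// hypothetical Vectors): swizzle four Vectors into SoA form, normalize all
// four at once, and unpack one result.
/* as if:
	FourVectors fv( vecA, vecB, vecC, vecD );	// AoS -> SoA transpose
	fv.VectorNormalizeFast();					// one rsqrt for all four
	Vector firstNormalized = fv.Vec( 0 );
*/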
/// form 4 cross products
inline FourVectors operator ^(const FourVectors &a, const FourVectors &b)
{
	FourVectors ret;
	ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y));
	ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z));
	ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x));
	return ret;
}
/// component-by-component MAX operator
inline FourVectors maximum(const FourVectors &a, const FourVectors &b)
{
	FourVectors ret;
	ret.x=MaxSIMD(a.x,b.x);
	ret.y=MaxSIMD(a.y,b.y);
	ret.z=MaxSIMD(a.z,b.z);
	return ret;
}
/// component-by-component MIN operator
inline FourVectors minimum(const FourVectors &a, const FourVectors &b)
{
	FourVectors ret;
	ret.x=MinSIMD(a.x,b.x);
	ret.y=MinSIMD(a.y,b.y);
	ret.z=MinSIMD(a.z,b.z);
	return ret;
}
/// calculate reflection vector. incident and normal dir assumed normalized
FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal )
{
	FourVectors ret = incident;
	fltx4 iDotNx2 = incident * normal;
	iDotNx2 = AddSIMD( iDotNx2, iDotNx2 );
	FourVectors nPart = normal;
	nPart *= iDotNx2;
	ret -= nPart; // i-2(n*i)n
	return ret;
}
/// calculate slide vector. removes all components of a vector which are parallel to a normal vector.
FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal )
{
	FourVectors ret = incident;
	fltx4 iDotN = incident * normal;
	FourVectors nPart = normal;
	nPart *= iDotN;
	ret -= nPart; // i-(n*i)n
	return ret;
}
// Assume the given matrix is a rotation, and rotate these vectors by it.
// If you have a long list of FourVectors structures that you all want
// to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
void FourVectors::RotateBy(const matrix3x4_t& matrix)
{
	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;
	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are
		// often unaligned. The w components will be the transpose row of
		// the matrix, but we don't really care about that.
		fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
		fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
		fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );
		matSplat00 = SplatXSIMD( matCol0 );
		matSplat01 = SplatYSIMD( matCol0 );
		matSplat02 = SplatZSIMD( matCol0 );
		matSplat10 = SplatXSIMD( matCol1 );
		matSplat11 = SplatYSIMD( matCol1 );
		matSplat12 = SplatZSIMD( matCol1 );
		matSplat20 = SplatXSIMD( matCol2 );
		matSplat21 = SplatYSIMD( matCol2 );
		matSplat22 = SplatZSIMD( matCol2 );
	}
	// Trust in the compiler to schedule these operations correctly:
	fltx4 outX, outY, outZ;
	outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) );
	outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) );
	outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) );
	x = outX;
	y = outY;
	z = outZ;
}
// Assume the vectors are points, and transform them in place by the matrix.
// If you have a long list of FourVectors structures that you all want
// to transform by the same matrix, use FourVectors::TransformManyBy() instead.
void FourVectors::TransformBy(const matrix3x4_t& matrix)
{
	// Splat out each of the entries in the matrix to a fltx4. Do this
	// in the order that we will need them, to hide latency. I'm
	// avoiding making an array of them, so that they'll remain in
	// registers.
	fltx4 matSplat00, matSplat01, matSplat02,
		matSplat10, matSplat11, matSplat12,
		matSplat20, matSplat21, matSplat22;
	{
		// Load the matrix into local vectors. Sadly, matrix3x4_ts are
		// often unaligned. The w components will be the transpose row of
		// the matrix, but we don't really care about that.
		fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
		fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
		fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );
		matSplat00 = SplatXSIMD( matCol0 );
		matSplat01 = SplatYSIMD( matCol0 );
		matSplat02 = SplatZSIMD( matCol0 );
		matSplat10 = SplatXSIMD( matCol1 );
		matSplat11 = SplatYSIMD( matCol1 );
		matSplat12 = SplatZSIMD( matCol1 );
		matSplat20 = SplatXSIMD( matCol2 );
		matSplat21 = SplatYSIMD( matCol2 );
		matSplat22 = SplatZSIMD( matCol2 );
	}
	// Trust in the compiler to schedule these operations correctly:
	fltx4 outX, outY, outZ;
	outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) );
	outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) );
	outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) );
	x = AddSIMD( outX, ReplicateX4( matrix[0][3] ));
	y = AddSIMD( outY, ReplicateX4( matrix[1][3] ));
	z = AddSIMD( outZ, ReplicateX4( matrix[2][3] ));
}
/// quick, low quality perlin-style noise() function suitable for real time use.
/// return value is -1..1. Only reliable around +/- 1 million or so.
fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z );
fltx4 NoiseSIMD( FourVectors const &v );
// vector valued noise direction
FourVectors DNoiseSIMD( FourVectors const &v );
// vector value "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
FourVectors CurlNoiseSIMD( FourVectors const &v );
/// calculate the absolute value of a packed single
inline fltx4 fabs( const fltx4 & x )
{
	return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) );
}
/// negate all four components of a SIMD packed single
inline fltx4 fnegate( const fltx4 & x )
{
	return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) );
}
fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent);
// PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some
// restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
// fractions of 0, .25, .5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25).
// negative and fractional powers are handled by the SIMD reciprocal and square root approximation
// instructions and so are not especially accurate. Note that this routine does not raise
// numeric exceptions because it uses SIMD. This routine is O(log2(exponent)).
inline fltx4 PowSIMD( const fltx4 & x, float exponent )
{
	return Pow_FixedPoint_Exponent_SIMD(x,(int) (4.0*exponent));
}
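// For example, PowSIMD( x, 2.5f ) runs the fixed-point routine with exponent
// 10 (2.5 * 4), i.e. x^2 * sqrt(x), while PowSIMD( x, 0.30f ) quantizes down
// to exponent 1, i.e. x^0.25, per the 2-bit fraction limit described above.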
// random number generation - generate 4 random numbers quickly.
void SeedRandSIMD(uint32 seed); // seed the random # generator
fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range
// for multithreaded, you need to use these and use the argument form of RandSIMD:
int GetSIMDRandContext( void );
void ReleaseSIMDRandContext( int nContext );
FORCEINLINE fltx4 RandSignedSIMD( void ) // -1..1
{
	return SubSIMD( MulSIMD( Four_Twos, RandSIMD() ), Four_Ones );
}
// SIMD versions of mathlib simplespline functions
// hermite basis function for smooth interpolation
// Similar to Gain() above, but very cheap to call
// value should be between 0 & 1 inclusive
inline fltx4 SimpleSpline( const fltx4 & value )
{
	// Arranged to avoid a data dependency between these two MULs:
	fltx4 valueDoubled = MulSIMD( value, Four_Twos );
	fltx4 valueSquared = MulSIMD( value, value );
	// Nice little ease-in, ease-out spline-like curve
	return SubSIMD(
		MulSIMD( Four_Threes, valueSquared ),
		MulSIMD( valueDoubled, valueSquared ) );
}
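// The curve computed above is 3t^2 - 2t^3, the s-shaped Hermite basis with
// zero slope at both endpoints: t=0 gives 0, t=0.5 gives 0.5, t=1 gives 1.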
// remaps a value in [startInterval, startInterval+rangeInterval] from linear to
// spline using SimpleSpline
inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val,
	const fltx4 & A, const fltx4 & BMinusA,
	const fltx4 & OneOverBMinusA, const fltx4 & C,
	const fltx4 & DMinusC )
{
	// if ( A == B )
	//	return val >= B ? D : C;
	fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
	return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
}
inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
	const fltx4 & A, const fltx4 & BMinusA,
	const fltx4 & OneOverBMinusA, const fltx4 & C,
	const fltx4 & DMinusC )
{
	// if ( A == B )
	//	return val >= B ? D : C;
	fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
	cVal = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, cVal ) );
	return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
}
FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits
}
FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival );
	return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits
}
FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
{
	fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s );
	ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival );
	return SubSIMD( val, ival );
}
// approximate sin of an angle, with -1..1 representing the whole sin wave period instead of -pi..pi.
// no range reduction is done - for values outside of 0..1 you won't like the results
FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val )
{
	// really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
	// sufficient for simple oscillation.
	return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
}
FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
{
	// not a bad approximation : parabola always over-estimates. Squared parabola always
	// underestimates. So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin)
	fltx4 fl4BadEst = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
	return AddSIMD( MulSIMD( Four_Point225s, SubSIMD( MulSIMD( fl4BadEst, fl4BadEst ), fl4BadEst ) ), fl4BadEst );
}
// full range useable implementations
FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
	fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
	fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
	fltx4 fl4Sin = _SinEst01SIMD( fl4val );
	fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
	return fl4Sin;
}
FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
{
	fltx4 fl4Abs = fabs( val );
	fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
	fltx4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
	fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
	fltx4 fl4Sin = _Sin01SIMD( fl4val );
	fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
	return fl4Sin;
}
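// Usage sketch (illustrative only): these work in "period units", so to
// approximate sin of an angle in radians, scale by 1/pi first (M_PI from
// <math.h> assumed here):
/* as if:
	fltx4 s = Sin01SIMD( MulSIMD( radians, ReplicateX4( 1.0f / (float)M_PI ) ) );
*/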
// Schlick style Bias approximation see graphics gems 4 : bias(t,a)= t/( (1/a-2)*(1-t)+1)
FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter )
{
	// convert perlin-style-bias parameter to the value right for the approximation
	return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos );
}
FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
{
	// similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.
	//!!speed!! use reciprocal est?
	//!!speed!! could save one op by precalcing _2_ values
	return DivSIMD( val, AddSIMD( MulSIMD( precalc_param, SubSIMD( Four_Ones, val ) ), Four_Ones ) );
}
//-----------------------------------------------------------------------------
// Box/plane test
// NOTE: The w component of emins + emaxs must be 1 for this to work
//-----------------------------------------------------------------------------
FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
{
	fltx4 corners[2];
	fltx4 normal = LoadUnalignedSIMD( p->normal.Base() );
	fltx4 dist = ReplicateX4( -p->dist );
	normal = SetWSIMD( normal, dist );
	fltx4 t4 = ReplicateX4( tolerance );
	fltx4 negt4 = ReplicateX4( -tolerance );
	fltx4 cmp = CmpGeSIMD( normal, Four_Zeros );
	corners[0] = MaskedAssign( cmp, emaxs, emins );
	corners[1] = MaskedAssign( cmp, emins, emaxs );
	fltx4 dot1 = Dot4SIMD( normal, corners[0] );
	fltx4 dot2 = Dot4SIMD( normal, corners[1] );
	cmp = CmpGeSIMD( dot1, t4 );
	fltx4 cmp2 = CmpGtSIMD( negt4, dot2 );
	fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros );
	fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros );
	result = AddSIMD( result, result2 );
	intx4 sides;
	ConvertStoreAsIntsSIMD( &sides, result );
	return sides[0];
}
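// Usage sketch (illustrative only; 'box' and 'plane' are hypothetical locals):
// the return value follows the scalar BoxOnPlaneSide convention -- 1 = box
// entirely in front of the plane, 2 = entirely behind, 3 = straddling. The
// w components are forced to 1 per the NOTE above.
/* as if:
	fltx4 mins = LoadUnaligned3SIMD( box.mins.Base() );
	fltx4 maxs = LoadUnaligned3SIMD( box.maxs.Base() );
	mins = SetWSIMD( mins, Four_Ones );
	maxs = SetWSIMD( maxs, Four_Ones );
	if ( BoxOnPlaneSideSIMD( mins, maxs, &plane ) == 2 )
	{
		// box is fully behind the plane
	}
*/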
#endif // SSEMATH_H