Counter Strike : Global Offensive Source Code

  1. //===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
  2. //
  3. // Purpose: - defines SIMD "structure of arrays" classes and functions.
  4. //
  5. //===========================================================================//
  6. #ifndef SSEMATH_H
  7. #define SSEMATH_H
  8. #if defined( _X360 )
  9. #include <xboxmath.h>
  10. #elif defined ( _PS3 )
  11. #include <vectormath/c/vectormath_aos.h>
  12. #include <vectormath/c/vectormath_aos_v.h>
  13. #else
  14. #include <xmmintrin.h>
  15. #ifndef _LINUX
  16. #include <emmintrin.h>
  17. #endif
  18. #endif
  19. #ifndef SPU
  20. #include "mathlib/vector.h"
  21. #include "mathlib/mathlib.h"
  22. #else
  23. #include "mathlib/math_pfns.h"
  24. #endif
  25. #include "mathlib/fltx4.h"
  26. // The FLTX4 type is a fltx4 used as a parameter to a function.
  27. // On the 360, the best way to do this is pass-by-copy on the registers.
  28. // On the PC, the best way is to pass by const reference.
  29. // The compiler will sometimes, but not always, replace a pass-by-const-ref
  30. // with a pass-in-reg on the 360; to avoid this confusion, you can
  31. // explicitly use a FLTX4 as the parameter type.
  32. #ifdef _X360
  33. typedef __vector4 FLTX4;
  34. #elif defined( _PS3 )
  35. typedef vec_float4 FLTX4;
  36. #else
  37. typedef const fltx4 & FLTX4;
  38. #endif
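// A minimal sketch (hypothetical helper, purely illustrative): declaring the wide
// parameter as FLTX4 yields pass-by-register on the consoles and pass-by-const-ref
// on the PC without any #ifdefs at the call site.
FORCEINLINE fltx4 PassThroughExample( FLTX4 wideValue )
{
	return wideValue;   // an FLTX4 converts back to a plain fltx4 value on every platform
}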
  39. // A 16-byte aligned int32 datastructure
  40. // (for use when writing out fltx4's as SIGNED
  41. // ints).
  42. struct ALIGN16 intx4
  43. {
  44. int32 m_i32[4];
  45. inline int & operator[](int which)
  46. {
  47. return m_i32[which];
  48. }
  49. inline const int & operator[](int which) const
  50. {
  51. return m_i32[which];
  52. }
  53. inline int32 *Base() {
  54. return m_i32;
  55. }
  56. inline const int32 *Base() const
  57. {
  58. return m_i32;
  59. }
  60. inline bool operator==(const intx4 &other) const
  61. {
  62. return m_i32[0] == other.m_i32[0] &&
  63. m_i32[1] == other.m_i32[1] &&
  64. m_i32[2] == other.m_i32[2] &&
  65. m_i32[3] == other.m_i32[3] ;
  66. }
  67. } ALIGN16_POST;
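// A small usage sketch of the accessors above (illustrative values only):
FORCEINLINE bool Intx4AccessorsExample()
{
	intx4 a;
	a[0] = 10; a[1] = 20; a[2] = 30; a[3] = 40;    // non-const operator[]
	intx4 b = a;                                   // plain member-wise copy, stays 16-byte aligned
	return ( a == b ) && ( b.Base()[2] == 30 );    // operator== and Base()
}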
  68. #if defined( _DEBUG ) && defined( _X360 )
  69. FORCEINLINE void TestVPUFlags()
  70. {
  71. // Check that the VPU is in the appropriate (Java-compliant) mode (see 3.2.1 in altivec_pem.pdf on xds.xbox.com)
  72. __vector4 a;
  73. __asm
  74. {
  75. mfvscr a;
  76. }
  77. unsigned int * flags = (unsigned int *)&a;
  78. unsigned int controlWord = flags[3];
  79. Assert(controlWord == 0);
  80. }
  81. #else // _DEBUG
  82. FORCEINLINE void TestVPUFlags() {}
  83. #endif // _DEBUG
  84. // useful constants in SIMD packed float format:
  85. // (note: some of these aren't stored on the 360,
  86. // but are manufactured directly in one or two
  87. // instructions, saving a load and possible L2
  88. // miss.)
  89. #ifdef _X360
  90. // Shouldn't the PS3 have something similar?
  91. #define Four_Zeros XMVectorZero() // 0 0 0 0
  92. #define Four_Ones XMVectorSplatOne() // 1 1 1 1
  93. extern const fltx4 Four_Twos; // 2 2 2 2
  94. extern const fltx4 Four_Threes; // 3 3 3 3
  95. extern const fltx4 Four_Fours; // guess.
  96. extern const fltx4 Four_Point225s; // .225 .225 .225 .225
  97. extern const fltx4 Four_PointFives; // .5 .5 .5 .5
  98. extern const fltx4 Four_Thirds; // 1/3
  99. extern const fltx4 Four_TwoThirds; // 2/3
  100. extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1
  101. extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four
  102. #elif defined(SPU)
  103. #define Four_Zeros spu_splats( 0.0f ) // 0 0 0 0
  104. #define Four_Ones spu_splats( 1.0f ) // 1 1 1 1
  105. #define Four_Twos spu_splats( 2.0f ) // 2 2 2 2
  106. #define Four_Threes spu_splats( 3.0f ) // 3 3 3 3
  107. #define Four_Fours spu_splats( 4.0f ) // guess.
  108. #define Four_Point225s spu_splats( 0.225f ) // .225 .225 .225 .225
  109. #define Four_PointFives spu_splats( 0.5f ) // .5 .5 .5 .5
110. #define Four_Thirds spu_splats( 0.33333333f ) // 1/3
111. #define Four_TwoThirds spu_splats( 0.66666666f ) // 2/3
  112. #define Four_NegativeOnes spu_splats( -1.0f ) // -1 -1 -1 -1
  113. #define Four_DegToRad spu_splats((float)(M_PI_F / 180.f))
  114. #else
  115. extern const fltx4 Four_Zeros; // 0 0 0 0
  116. extern const fltx4 Four_Ones; // 1 1 1 1
  117. extern const fltx4 Four_Twos; // 2 2 2 2
  118. extern const fltx4 Four_Threes; // 3 3 3 3
  119. extern const fltx4 Four_Fours; // guess.
  120. extern const fltx4 Four_Point225s; // .225 .225 .225 .225
  121. extern const fltx4 Four_PointFives; // .5 .5 .5 .5
  122. extern const fltx4 Four_Thirds; // 1/3
  123. extern const fltx4 Four_TwoThirds; // 2/3
  124. extern const fltx4 Four_NegativeOnes; // -1 -1 -1 -1
  125. extern const fltx4 Four_DegToRad; // (float)(M_PI_F / 180.f) times four
  126. #endif
  127. extern const fltx4 Four_Epsilons; // FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
  128. extern const fltx4 Four_2ToThe21s; // (1<<21)..
  129. extern const fltx4 Four_2ToThe22s; // (1<<22)..
  130. extern const fltx4 Four_2ToThe23s; // (1<<23)..
  131. extern const fltx4 Four_2ToThe24s; // (1<<24)..
  132. extern const fltx4 Four_Origin; // 0 0 0 1 (origin point, like vr0 on the PS2)
  133. extern const fltx4 Four_FLT_MAX; // FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
  134. extern const fltx4 Four_Negative_FLT_MAX; // -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
  135. extern const fltx4 g_SIMD_0123; // 0 1 2 3 as float
  136. // coefficients for polynomial approximation of srgb conversions
  137. // 4th order polynomial for x^(1/2.2), x in 0..1
  138. extern const fltx4 Four_LinearToGammaCoefficients_A; // *x^4
  139. extern const fltx4 Four_LinearToGammaCoefficients_B; // *x^3
  140. extern const fltx4 Four_LinearToGammaCoefficients_C; // *x^2
  141. extern const fltx4 Four_LinearToGammaCoefficients_D; // *x^1
  142. extern const fltx4 Four_LinearToGammaCoefficients_E; // *x^0
143. // 3rd order polynomial for x^2.2, x in 0..1
  144. extern const fltx4 Four_GammaToLinearCoefficients_A; // *x^3
  145. extern const fltx4 Four_GammaToLinearCoefficients_B; // *x^2
  146. extern const fltx4 Four_GammaToLinearCoefficients_C; // *x^1
  147. extern const fltx4 Four_GammaToLinearCoefficients_D; // *x^0
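// A sketch of how the linear-to-gamma coefficients are meant to be combined
// (Horner's rule, using the MaddSIMD op declared later in this header), for a
// lane-wise x in 0..1:
//   fltx4 y = Four_LinearToGammaCoefficients_A;
//   y = MaddSIMD( y, x, Four_LinearToGammaCoefficients_B );
//   y = MaddSIMD( y, x, Four_LinearToGammaCoefficients_C );
//   y = MaddSIMD( y, x, Four_LinearToGammaCoefficients_D );
//   y = MaddSIMD( y, x, Four_LinearToGammaCoefficients_E );   // ~ x^(1/2.2)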
  148. // external aligned integer constants
  149. #ifndef ALIGN16_POST
  150. #define ALIGN16_POST
  151. #endif
  152. extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST; // 0x7fffffff x 4
  153. extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST; // 0x80000000 x 4
  154. extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST; // 0xfffffffe x 4
  155. extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST; // -1 -1 -1 0
  156. extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST; // [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
  157. extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST; // ~0,~0,~0,~0
  158. extern const fltx4 g_SIMD_Identity[4]; // [1 0 0 0], [0 1 0 0], [0 0 1 0], [0 0 0 1]
  159. extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST; // 0xffff x 4
  160. // this mask is used for skipping the tail of things. If you have N elements in an array, and wish
161. to mask out the tail, g_SIMD_SkipTailMask[N & 3] is what you want to use for the last iteration.
  162. extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;
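// Sketch of the tail-mask idiom (written in terms of the load/and/add helpers declared
// later in this header, and assuming pData is padded out to a multiple of four floats):
//   fltx4 accum = Four_Zeros;
//   for ( int i = 0; i < nCount; i += 4 )
//   {
//       fltx4 v = LoadUnalignedSIMD( pData + i );
//       if ( i + 4 > nCount )                   // final, partial group of elements
//           v = AndSIMD( v, LoadAlignedSIMD( g_SIMD_SkipTailMask[ nCount & 3 ] ) );
//       accum = AddSIMD( accum, v );
//   }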
  163. extern const int32 ALIGN16 g_SIMD_EveryOtherMask[]; // 0, ~0, 0, ~0
  164. // Define prefetch macros.
  165. // The characteristics of cache and prefetch are completely
  166. // different between the different platforms, so you DO NOT
  167. // want to just define one macro that maps to every platform
  168. // intrinsic under the hood -- you need to prefetch at different
  169. // intervals between x86 and PPC, for example, and that is
  170. // a higher level code change.
  171. // On the other hand, I'm tired of typing #ifdef _X360
  172. // all over the place, so this is just a nop on Intel, PS3.
  173. #ifdef PLATFORM_PPC
  174. #if defined(_X360)
  175. #define PREFETCH360(address, offset) __dcbt(offset,address)
  176. #elif defined(_PS3)
  177. #define PREFETCH360(address, offset) __dcbt( reinterpret_cast< const char * >(address) + offset )
  178. #else
  179. #error Prefetch not defined for this platform!
  180. #endif
  181. #else
  182. #define PREFETCH360(x,y) // nothing
  183. #endif
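// Sketch of the intended pattern on PPC (and a harmless no-op elsewhere): touch a
// cache line a fixed distance ahead of where the loop is currently reading.
FORCEINLINE float SumWithPrefetchExample( const float *pData, int nCount )
{
	float flSum = 0.0f;
	for ( int i = 0; i < nCount; ++i )
	{
		PREFETCH360( pData, i * sizeof( float ) + 128 );  // pull in data ~128 bytes ahead
		flSum += pData[ i ];
	}
	return flSum;
}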
184. // Here's a handy function to align a pointer up to the size of T --
185. // for a 16-byte SIMD type such as fltx4 it rounds the address up to
186. // the next multiple of 16. This is useful if you're subdividing
187. // big swaths of allocated memory, but in that case, remember
188. // to leave yourself the necessary slack in the allocation.
  189. template<class T>
  190. inline T *AlignPointer(void * ptr)
  191. {
  192. #if defined( __clang__ )
  193. uintp temp = (uintp)ptr;
  194. #else
195. unsigned temp = (unsigned)ptr;
  196. #endif
  197. temp = ALIGN_VALUE(temp, sizeof(T));
  198. return (T *)temp;
  199. }
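// Sketch: carving a 16-byte-aligned fltx4 slot out of a raw byte block. The caller
// must have allocated at least sizeof( fltx4 ) + 15 bytes so that rounding the
// pointer up cannot run past the end of the block.
FORCEINLINE fltx4 *PlaceFltx4Example( unsigned char *pRawBlock )
{
	return AlignPointer<fltx4>( pRawBlock );
}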
  200. #ifdef _PS3
  201. // Note that similar defines exist in math_pfns.h
  202. // Maybe we should consolidate in one place for all platforms.
  203. #define _VEC_CLEAR_SIGNMASK (__vector unsigned int) {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff}
  204. #define _VEC_SIGNMASK (__vector unsigned int) { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
  205. #define _VEC_LSBMASK (__vector unsigned int) { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe }
  206. #define _VEC_CLEAR_WMASK (__vector unsigned int) {0xffffffff, 0xffffffff, 0xffffffff, 0}
  207. #define _VEC_COMPONENT_MASK_0 (__vector unsigned int) {0xffffffff, 0, 0, 0}
  208. #define _VEC_COMPONENT_MASK_1 (__vector unsigned int) {0, 0xffffffff, 0, 0}
  209. #define _VEC_COMPONENT_MASK_2 (__vector unsigned int) {0, 0, 0xffffffff, 0}
  210. #define _VEC_COMPONENT_MASK_3 (__vector unsigned int) {0, 0, 0, 0xffffffff}
  211. #define _VEC_SWIZZLE_WZYX (__vector unsigned char) { 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 }
  212. #define _VEC_SWIZZLE_ZWXY (__vector unsigned char) { 0x08,0x09,0x0a,0x0b, 0x0c,0x0d,0x0e,0x0f, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 }
  213. #define _VEC_SWIZZLE_YXWZ (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0c,0x0d,0x0e,0x0f, 0x08,0x09,0x0a,0x0b }
  214. #define _VEC_ZERO (__vector unsigned int) {0,0,0,0}
  215. #define _VEC_FLTMAX (__vector float) {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}
  216. #define _VEC_FLTMIN (__vector float) {FLT_MIN,FLT_MIN,FLT_MIN,FLT_MIN}
  217. #define _VEC_ORIGIN (__vector unsigned int) { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }
  218. #endif
  219. #if USE_STDC_FOR_SIMD
  220. //---------------------------------------------------------------------
  221. // Standard C (fallback/Linux) implementation (only there for compat - slow)
  222. //---------------------------------------------------------------------
  223. FORCEINLINE float SubFloat( const fltx4 & a, int idx )
  224. {
  225. return a.m128_f32[ idx ];
  226. }
  227. FORCEINLINE float & SubFloat( fltx4 & a, int idx )
  228. {
  229. return a.m128_f32[idx];
  230. }
  231. FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
  232. {
  233. return a.m128_u32[idx];
  234. }
  235. FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
  236. {
  237. return a.m128_u32[idx];
  238. }
239. // Return zero in the fastest way -- on the x360, faster even than loading.
  240. FORCEINLINE fltx4 LoadZeroSIMD( void )
  241. {
  242. return Four_Zeros;
  243. }
  244. // Return one in the fastest way -- on the x360, faster even than loading.
  245. FORCEINLINE fltx4 LoadOneSIMD( void )
  246. {
  247. return Four_Ones;
  248. }
  249. FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
  250. {
  251. fltx4 retVal;
  252. SubFloat( retVal, 0 ) = SubFloat( a, 0 );
  253. SubFloat( retVal, 1 ) = SubFloat( a, 0 );
  254. SubFloat( retVal, 2 ) = SubFloat( a, 0 );
  255. SubFloat( retVal, 3 ) = SubFloat( a, 0 );
  256. return retVal;
  257. }
  258. FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
  259. {
  260. fltx4 retVal;
  261. SubFloat( retVal, 0 ) = SubFloat( a, 1 );
  262. SubFloat( retVal, 1 ) = SubFloat( a, 1 );
  263. SubFloat( retVal, 2 ) = SubFloat( a, 1 );
  264. SubFloat( retVal, 3 ) = SubFloat( a, 1 );
  265. return retVal;
  266. }
  267. FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
  268. {
  269. fltx4 retVal;
  270. SubFloat( retVal, 0 ) = SubFloat( a, 2 );
  271. SubFloat( retVal, 1 ) = SubFloat( a, 2 );
  272. SubFloat( retVal, 2 ) = SubFloat( a, 2 );
  273. SubFloat( retVal, 3 ) = SubFloat( a, 2 );
  274. return retVal;
  275. }
  276. FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
  277. {
  278. fltx4 retVal;
  279. SubFloat( retVal, 0 ) = SubFloat( a, 3 );
  280. SubFloat( retVal, 1 ) = SubFloat( a, 3 );
  281. SubFloat( retVal, 2 ) = SubFloat( a, 3 );
  282. SubFloat( retVal, 3 ) = SubFloat( a, 3 );
  283. return retVal;
  284. }
  285. FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
  286. {
  287. fltx4 result = a;
  288. SubFloat( result, 0 ) = SubFloat( x, 0 );
  289. return result;
  290. }
  291. FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
  292. {
  293. fltx4 result = a;
  294. SubFloat( result, 1 ) = SubFloat( y, 1 );
  295. return result;
  296. }
  297. FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
  298. {
  299. fltx4 result = a;
  300. SubFloat( result, 2 ) = SubFloat( z, 2 );
  301. return result;
  302. }
  303. FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
  304. {
  305. fltx4 result = a;
  306. SubFloat( result, 3 ) = SubFloat( w, 3 );
  307. return result;
  308. }
  309. FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
  310. {
  311. fltx4 result = a;
  312. SubFloat( result, nComponent ) = flValue;
  313. return result;
  314. }
  315. // a b c d -> b c d a
  316. FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
  317. {
  318. fltx4 retVal;
  319. SubFloat( retVal, 0 ) = SubFloat( a, 1 );
  320. SubFloat( retVal, 1 ) = SubFloat( a, 2 );
  321. SubFloat( retVal, 2 ) = SubFloat( a, 3 );
  322. SubFloat( retVal, 3 ) = SubFloat( a, 0 );
  323. return retVal;
  324. }
  325. // a b c d -> c d a b
  326. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
  327. {
  328. fltx4 retVal;
  329. SubFloat( retVal, 0 ) = SubFloat( a, 2 );
  330. SubFloat( retVal, 1 ) = SubFloat( a, 3 );
  331. SubFloat( retVal, 2 ) = SubFloat( a, 0 );
  332. SubFloat( retVal, 3 ) = SubFloat( a, 1 );
  333. return retVal;
  334. }
  335. #define BINOP(op) \
  336. fltx4 retVal; \
  337. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) op SubFloat( b, 0 ) ); \
  338. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) op SubFloat( b, 1 ) ); \
  339. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) op SubFloat( b, 2 ) ); \
  340. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) op SubFloat( b, 3 ) ); \
  341. return retVal;
  342. #define IBINOP(op) \
  343. fltx4 retVal; \
  344. SubInt( retVal, 0 ) = ( SubInt( a, 0 ) op SubInt ( b, 0 ) ); \
  345. SubInt( retVal, 1 ) = ( SubInt( a, 1 ) op SubInt ( b, 1 ) ); \
  346. SubInt( retVal, 2 ) = ( SubInt( a, 2 ) op SubInt ( b, 2 ) ); \
  347. SubInt( retVal, 3 ) = ( SubInt( a, 3 ) op SubInt ( b, 3 ) ); \
  348. return retVal;
  349. FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
  350. {
  351. BINOP(+);
  352. }
  353. FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
  354. {
  355. BINOP(-);
  356. };
  357. FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
  358. {
  359. BINOP(*);
  360. }
  361. FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  362. {
  363. BINOP(/);
  364. }
  365. FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  366. {
  367. BINOP(/);
  368. }
  369. FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
  370. {
  371. return AddSIMD( MulSIMD(a,b), c );
  372. }
  373. FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
  374. {
  375. return SubSIMD( c, MulSIMD(a,b) );
  376. };
  377. FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
  378. {
  379. fltx4 result;
  380. SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
  381. SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
  382. SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
  383. SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
  384. return result;
  385. }
  386. FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  387. {
  388. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  389. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  390. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  391. }
  392. FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  393. {
  394. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  395. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  396. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  397. SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
  398. }
  399. FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
  400. {
  401. fltx4 result;
  402. SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
  403. SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
  404. SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
  405. SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
  406. return result;
  407. }
  408. FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
  409. {
  410. fltx4 result;
  411. SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
  412. SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
  413. SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
  414. SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
  415. return result;
  416. }
417. // tan^-1(a/b) .. i.e., pass sin in as a and cos in as b
  418. FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
  419. {
  420. fltx4 result;
  421. SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  422. SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  423. SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  424. SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  425. return result;
  426. }
  427. FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
  428. {
  429. fltx4 retVal;
  430. SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  431. SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  432. SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  433. SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  434. return retVal;
  435. }
  436. FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
  437. {
  438. fltx4 retVal;
  439. SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  440. SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  441. SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  442. SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  443. return retVal;
  444. }
  445. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
  446. {
  447. IBINOP(&);
  448. }
  449. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
  450. {
  451. fltx4 retVal;
  452. SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 );
  453. SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 );
  454. SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 );
  455. SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 );
  456. return retVal;
  457. }
  458. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
  459. {
  460. IBINOP(^);
  461. }
  462. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
  463. {
  464. IBINOP(|);
  465. }
  466. FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
  467. {
  468. fltx4 retval;
  469. SubFloat( retval, 0 ) = -SubFloat( a, 0 );
  470. SubFloat( retval, 1 ) = -SubFloat( a, 1 );
  471. SubFloat( retval, 2 ) = -SubFloat( a, 2 );
  472. SubFloat( retval, 3 ) = -SubFloat( a, 3 );
  473. return retval;
  474. }
  475. FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
  476. {
  477. return ( SubFloat( a, 0 ) == 0.0 ) &&
  478. ( SubFloat( a, 1 ) == 0.0 ) &&
  479. ( SubFloat( a, 2 ) == 0.0 ) &&
  480. ( SubFloat( a, 3 ) == 0.0 ) ;
  481. }
  482. // for branching when a.xyzw > b.xyzw
  483. FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
  484. {
  485. return SubFloat(a,0) > SubFloat(b,0) &&
  486. SubFloat(a,1) > SubFloat(b,1) &&
  487. SubFloat(a,2) > SubFloat(b,2) &&
  488. SubFloat(a,3) > SubFloat(b,3);
  489. }
  490. // for branching when a.xyzw >= b.xyzw
  491. FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  492. {
  493. return SubFloat(a,0) >= SubFloat(b,0) &&
  494. SubFloat(a,1) >= SubFloat(b,1) &&
  495. SubFloat(a,2) >= SubFloat(b,2) &&
  496. SubFloat(a,3) >= SubFloat(b,3);
  497. }
  498. // For branching if all a.xyzw == b.xyzw
  499. FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
  500. {
  501. return SubFloat(a,0) == SubFloat(b,0) &&
  502. SubFloat(a,1) == SubFloat(b,1) &&
  503. SubFloat(a,2) == SubFloat(b,2) &&
  504. SubFloat(a,3) == SubFloat(b,3);
  505. }
  506. // For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w
  507. FORCEINLINE bool IsAnyEqual( const fltx4 & a, const fltx4 & b )
  508. {
  509. return SubFloat(a,0) == SubFloat(b,0) ||
  510. SubFloat(a,1) == SubFloat(b,1) ||
  511. SubFloat(a,2) == SubFloat(b,2) ||
  512. SubFloat(a,3) == SubFloat(b,3);
  513. }
  514. FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
  515. {
  516. int nRet = 0;
  517. nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0
  518. nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1
  519. nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2
  520. nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3
  521. return nRet;
  522. }
  523. FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  524. {
  525. return (0 != TestSignSIMD( a ));
  526. }
527. FORCEINLINE bool IsAnyTrue( const fltx4 & a ) // is the sign (mask) bit set in any component?
  528. {
  529. return (0 != TestSignSIMD( a ));
  530. }
  531. FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
  532. {
  533. fltx4 retVal;
  534. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0;
  535. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0;
  536. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0;
  537. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0;
  538. return retVal;
  539. }
  540. FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
  541. {
  542. fltx4 retVal;
  543. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0;
  544. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0;
  545. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0;
  546. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0;
  547. return retVal;
  548. }
  549. FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
  550. {
  551. fltx4 retVal;
  552. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0;
  553. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0;
  554. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0;
  555. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0;
  556. return retVal;
  557. }
  558. FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
  559. {
  560. fltx4 retVal;
  561. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0;
  562. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0;
  563. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0;
  564. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0;
  565. return retVal;
  566. }
  567. FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
  568. {
  569. fltx4 retVal;
  570. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? ~0 : 0;
  571. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0;
  572. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0;
  573. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0;
  574. return retVal;
  575. }
  576. FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
  577. {
  578. fltx4 retVal;
  579. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0;
  580. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0;
  581. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0;
  582. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0;
  583. return retVal;
  584. }
  585. FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  586. {
  587. return OrSIMD(
  588. AndSIMD( ReplacementMask, NewValue ),
  589. AndNotSIMD( ReplacementMask, OldValue ) );
  590. }
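// Sketch built on the compare + select idiom above: force every negative lane to zero.
FORCEINLINE fltx4 ZeroNegativeLanesExample( const fltx4 &v )
{
	fltx4 mask = CmpLtSIMD( v, Four_Zeros );     // ~0 in each lane where v < 0
	return MaskedAssign( mask, Four_Zeros, v );  // take 0 where the mask is set, keep v elsewhere
}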
  591. FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
  592. {
  593. fltx4 retVal;
  594. SubFloat( retVal, 0 ) = flValue;
  595. SubFloat( retVal, 1 ) = flValue;
  596. SubFloat( retVal, 2 ) = flValue;
  597. SubFloat( retVal, 3 ) = flValue;
  598. return retVal;
  599. }
  600. /// replicate a single 32 bit integer value to all 4 components of an m128
  601. FORCEINLINE fltx4 ReplicateIX4( int nValue )
  602. {
  603. fltx4 retVal;
  604. SubInt( retVal, 0 ) = nValue;
  605. SubInt( retVal, 1 ) = nValue;
  606. SubInt( retVal, 2 ) = nValue;
  607. SubInt( retVal, 3 ) = nValue;
  608. return retVal;
  609. }
  610. // Round towards positive infinity
  611. FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
  612. {
  613. fltx4 retVal;
  614. SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
  615. SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
  616. SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
  617. SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
  618. return retVal;
  619. }
  620. // Round towards negative infinity
  621. FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
  622. {
  623. fltx4 retVal;
  624. SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
  625. SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
  626. SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
  627. SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
  628. return retVal;
  629. }
  630. FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
  631. {
  632. fltx4 retVal;
  633. SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
  634. SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
  635. SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
  636. SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
  637. return retVal;
  638. }
  639. FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
  640. {
  641. fltx4 retVal;
  642. SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
  643. SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
  644. SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
  645. SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
  646. return retVal;
  647. }
  648. FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
  649. {
  650. fltx4 retVal;
  651. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
  652. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
  653. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
  654. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
  655. return retVal;
  656. }
  657. FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
  658. {
  659. fltx4 retVal;
  660. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
  661. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
  662. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
  663. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
  664. return retVal;
  665. }
  666. FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
  667. {
  668. fltx4 retVal;
  669. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
  670. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
  671. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
  672. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
  673. return retVal;
  674. }
  675. FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
  676. {
  677. fltx4 retVal;
  678. SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
  679. SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
  680. SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
  681. SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
  682. return retVal;
  683. }
  684. FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
  685. {
  686. fltx4 retVal;
  687. SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
  688. SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
  689. SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
  690. SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
  691. return retVal;
  692. }
  693. /// 1/x for all 4 values.
  694. /// 1/0 will result in a big but NOT infinite result
  695. FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
  696. {
  697. fltx4 retVal;
  698. SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
  699. SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
  700. SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
  701. SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
  702. return retVal;
  703. }
  704. FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
  705. {
  706. fltx4 retVal;
  707. SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
  708. SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
  709. SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
  710. SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
  711. return retVal;
  712. }
  713. // 2^x for all values (the antilog)
  714. FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
  715. {
  716. fltx4 retVal;
  717. SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
  718. SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
  719. SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
  720. SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
  721. return retVal;
  722. }
  723. FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
  724. {
  725. float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
  726. SubFloat( a, 1 ) * SubFloat( b, 1 ) +
  727. SubFloat( a, 2 ) * SubFloat( b, 2 );
  728. return ReplicateX4( flDot );
  729. }
  730. FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
  731. {
  732. float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
  733. SubFloat( a, 1 ) * SubFloat( b, 1 ) +
  734. SubFloat( a, 2 ) * SubFloat( b, 2 ) +
  735. SubFloat( a, 3 ) * SubFloat( b, 3 );
  736. return ReplicateX4( flDot );
  737. }
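// Sketch: the length of the xyz part of a vector, replicated across all four lanes
// of the result (a common pattern before normalizing).
FORCEINLINE fltx4 Length3Example( const fltx4 &v )
{
	return SqrtSIMD( Dot3SIMD( v, v ) );
}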
  738. // Clamps the components of a vector to a specified minimum and maximum range.
  739. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
  740. {
  741. return MaxSIMD( min, MinSIMD( max, in ) );
  742. }
  743. // Squelch the w component of a vector to +0.0.
  744. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
  745. FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
  746. {
  747. fltx4 retval;
  748. retval = a;
749. SubFloat( retval, 3 ) = 0;
  750. return retval;
  751. }
  752. FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
  753. {
  754. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  755. }
  756. FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
  757. {
  758. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  759. }
  760. // load a single unaligned float into the x component of a SIMD word
  761. FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
  762. {
  763. fltx4 retval;
  764. SubFloat( retval, 0 ) = *pFlt;
  765. return retval;
  766. }
  767. FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
  768. {
  769. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  770. }
  771. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  772. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
  773. {
  774. fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
  775. // squelch w
  776. SubInt( retval, 3 ) = 0;
  777. return retval;
  778. }
  779. // construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
  780. FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
  781. {
  782. fltx4 retval = { x, y, z, w };
  783. return retval;
  784. }
  785. FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
  786. {
  787. *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
  788. }
  789. FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
  790. {
  791. *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
  792. }
  793. FORCEINLINE void StoreUnalignedFloat( float *pSingleFloat, const fltx4 & a )
  794. {
  795. *pSingleFloat = SubFloat( a, 0 );
  796. }
  797. FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
  798. {
  799. *pSIMD = SubFloat(a, 0);
  800. *(pSIMD+1) = SubFloat(a, 1);
  801. *(pSIMD+2) = SubFloat(a, 2);
  802. }
  803. // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
  804. FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
  805. {
  806. StoreAlignedSIMD(pSIMD->Base(),a);
  807. }
  808. // Store the x,y,z components of the four FLTX4 parameters
  809. // into the four consecutive Vectors:
  810. // pDestination[0], pDestination[1], pDestination[2], pDestination[3]
  811. // The Vectors are assumed to be unaligned.
  812. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  813. Vector * const pDestination )
  814. {
  815. StoreUnaligned3SIMD( pDestination->Base(), a );
  816. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  817. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  818. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  819. }
  820. // Store the x,y,z components of the four FLTX4 parameters
  821. // into the four consecutive Vectors:
  822. // pDestination , pDestination + 1, pDestination + 2, pDestination + 3
  823. // The Vectors are assumed to start on an ALIGNED address, that is,
824. // pDestination is 16-byte aligned (though obviously pDestination+1 is not).
  825. FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  826. Vector * const pDestination )
  827. {
  828. StoreUnaligned3SIMD( pDestination->Base(), a );
  829. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  830. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  831. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  832. }
  833. FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
  834. {
  835. #define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) { float tmp = SubFloat( _a_, _ia_ ); SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); SubFloat( _b_, _ib_ ) = tmp; }
  836. SWAP_FLOATS( x, 1, y, 0 );
  837. SWAP_FLOATS( x, 2, z, 0 );
  838. SWAP_FLOATS( x, 3, w, 0 );
  839. SWAP_FLOATS( y, 2, z, 1 );
  840. SWAP_FLOATS( y, 3, w, 1 );
  841. SWAP_FLOATS( z, 3, w, 2 );
  842. }
  843. // find the lowest component of a.x, a.y, a.z,
  844. // and replicate it to the whole return value.
  845. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
  846. {
  847. float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
  848. return ReplicateX4(lowest);
  849. }
  850. // find the highest component of a.x, a.y, a.z,
  851. // and replicate it to the whole return value.
  852. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
  853. {
  854. float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
  855. return ReplicateX4(highest);
  856. }
  857. // Fixed-point conversion and save as SIGNED INTS.
  858. // pDest->x = Int (vSrc.x)
  859. // note: some architectures have means of doing
  860. // fixed point conversion when the fix depth is
  861. // specified as an immediate.. but there is no way
862. to guarantee an immediate as a parameter to a function
  863. // like this.
  864. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
  865. {
  866. (*pDest)[0] = SubFloat(vSrc, 0);
  867. (*pDest)[1] = SubFloat(vSrc, 1);
  868. (*pDest)[2] = SubFloat(vSrc, 2);
  869. (*pDest)[3] = SubFloat(vSrc, 3);
  870. }
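// Sketch: snapshotting a SIMD register into plain signed ints. Note this is a
// straight float -> int truncation; round first if round-to-nearest is wanted.
FORCEINLINE int TruncateXToIntExample( const fltx4 &v )
{
	intx4 asInts;
	ConvertStoreAsIntsSIMD( &asInts, v );
	return asInts[ 0 ];
}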
  871. // ------------------------------------
  872. // INTEGER SIMD OPERATIONS.
  873. // ------------------------------------
874. // splat a signed immediate int value into all components of a vector.
  875. FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
  876. {
  877. fltx4 retval;
  878. SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue;
  879. return retval;
  880. }
  881. // Load 4 aligned words into a SIMD register
  882. FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
  883. {
  884. return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
  885. }
  886. // Load 4 unaligned words into a SIMD register
  887. FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
  888. {
  889. return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
  890. }
  891. // save into four words, 16-byte aligned
  892. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  893. {
  894. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  895. }
  896. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
  897. {
  898. *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
  899. }
  900. FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  901. {
  902. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  903. }
  904. // Load four consecutive uint16's, and turn them into floating point numbers.
  905. // This function isn't especially fast and could be made faster if anyone is
  906. // using it heavily.
  907. FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
  908. {
  909. fltx4 retval;
  910. SubFloat( retval, 0 ) = pInts[0];
  911. SubFloat( retval, 1 ) = pInts[1];
  912. SubFloat( retval, 2 ) = pInts[2];
  913. SubFloat( retval, 3 ) = pInts[3];
return retval;
914. }
  915. // Take a fltx4 containing fixed-point uints and
  916. // return them as single precision floats. No
  917. // fixed point conversion is done.
  918. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
  919. {
  920. Assert(0); /* pc has no such operation */
  921. fltx4 retval;
  922. SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
  923. SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
  924. SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
  925. SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
  926. return retval;
  927. }
  928. #if 0 /* pc has no such op */
  929. // Take a fltx4 containing fixed-point sints and
  930. // return them as single precision floats. No
  931. // fixed point conversion is done.
  932. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  933. {
  934. fltx4 retval;
935. SubFloat( retval, 0 ) = ( (float) vSrcA.m128_s32[0] );
936. SubFloat( retval, 1 ) = ( (float) vSrcA.m128_s32[1] );
937. SubFloat( retval, 2 ) = ( (float) vSrcA.m128_s32[2] );
938. SubFloat( retval, 3 ) = ( (float) vSrcA.m128_s32[3] );
  939. return retval;
  940. }
  941. /*
  942. works on fltx4's as if they are four uints.
  943. the first parameter contains the words to be shifted,
  944. the second contains the amount to shift by AS INTS
945. for i = 0 to 3
946. shift = vSrcB[ bits (i*32) .. (i*32)+4 ]
947. vReturned[ bits (i*32) .. (i*32)+31 ] = vSrcA[ bits (i*32) .. (i*32)+31 ] << shift
  948. */
  949. FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
  950. {
  951. i32x4 retval;
  952. SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
  953. SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
  954. SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
  955. SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
  956. return retval;
  957. }
  958. #endif
  959. #elif ( defined( _PS3 ) )
  960. #define SN_IMPROVED_INTRINSICS ( (( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )) ||\
  961. (defined(__SN_VER__) && (__SN_VER__ > 25002)) )
  962. //---------------------------------------------------------------------
  963. // PS3 implementation
  964. //---------------------------------------------------------------------
  965. FORCEINLINE float FloatSIMD( fltx4 & a, int idx )
  966. {
  967. #if SN_IMPROVED_INTRINSICS
  968. return vec_extract(a,idx);
  969. #else
  970. fltx4_union a_union;
  971. vec_st(a, 0, &a_union.vmxf);
  972. return a_union.m128_f32[idx];
  973. #endif
  974. }
  975. FORCEINLINE unsigned int UIntSIMD( u32x4 & a, int idx )
  976. {
  977. #if SN_IMPROVED_INTRINSICS
  978. return vec_extract(a,idx);
  979. #else
  980. fltx4_union a_union;
  981. vec_st(a, 0, &a_union.vmxui);
  982. return a_union.m128_u32[idx];
  983. #endif
  984. }
  985. FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
  986. {
  987. return vec_add( a, b );
  988. }
  989. FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
  990. {
  991. return vec_sub( a, b );
  992. }
  993. FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
  994. {
  995. return vec_madd( a, b, _VEC_ZEROF );
  996. }
  997. FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
  998. {
  999. return vec_madd( a, b, c );
  1000. }
  1001. FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
  1002. {
  1003. return vec_nmsub( a, b, c );
  1004. };
  1005. FORCEINLINE fltx4 Dot3SIMD( const fltx4& a, const fltx4& b)
  1006. {
  1007. // oliviern: it seems that this code could be optimized
  1008. // (or maybe the latency will slow down if there is nothing to put in between)
  1009. // Something like that (to verify on PS3 and SPU):
  1010. // result2 = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3
  1011. // result = vec_add(vec_sld(result2, result2, 4), result2); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
  1012. // result = vec_add(vec_sld(result2, result2, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2), (a1 * b1) + (a2 * b2) + (a3 * b3), (a2 * b2) + (a3 * b3) + (a0 * b0), (a3 * b3) + (a0 * b0) + ...
  1013. // result = vec_splat(result, 0); // DotProduct3...
  1014. // 6 SIMD instructions instead of 8 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
  1015. // It may still be a bit faster in the worst case.
  1016. fltx4 result;
  1017. result = vec_madd( a, b, _VEC_ZEROF );
  1018. result = vec_madd( vec_sld(a,a,4), vec_sld(b,b,4), result );
  1019. result = vec_madd( vec_sld(a,a,8), vec_sld(b,b,8), result );
  1020. // replicate across all
  1021. result = vec_splat(result,0);
  1022. return result;
  1023. }
  1024. FORCEINLINE fltx4 Dot4SIMD( const fltx4& a, const fltx4& b)
  1025. {
  1026. // See comment in Dot3SIMD, we could reduce to 6 SIMD instructions instead of 7 (but again with potentially one more latency).
  1027. // result = vec_madd(a, b, _VEC_ZEROF); // a0 * b0, a1 * b1, a2 * b2, a3 * b3
  1028. // result = vec_add(vec_sld(result, result, 4), result); // (a0 * b0) + (a1 * b1), (a1 * b1) + (a2 * b2), (a2 * b2) + (a3 * b3), (a3 * b3) + (a0 * b0)
  1029. // result = vec_add(vec_sld(result, result, 8), result); // (a0 * b0) + (a1 * b1) + (a2 * b2) + (a3 * b3), ...
  1030. // result = vec_splat(result, 0); // DotProduct3...
  1031. // 6 SIMD instructions instead of 7 (but again with potentially one more latency - it depends if other stuff can be interleaved in between).
  1032. // It may be a wash in the worst case.
  1033. fltx4 result;
  1034. result = vec_madd( a, b, _VEC_ZEROF );
  1035. result = vec_madd( vec_sld(a,a,4), vec_sld(b,b,4), result );
  1036. result = vec_add( vec_sld(result,result,8), result );
  1037. // replicate across all
  1038. result = vec_splat(result,0);
  1039. return result;
  1040. }
  1041. FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
  1042. {
  1043. return sinf4( radians );
  1044. }
  1045. FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  1046. {
  1047. sincosf4( radians, &sine, &cosine );
  1048. }
1049. FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  1050. {
  1051. sincosf4( radians, &sine, &cosine );
  1052. }
  1053. FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
  1054. {
  1055. return acosf4( cs );
  1056. }
  1057. FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
  1058. {
  1059. return atan2f4( a, b );
  1060. }
  1061. FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
  1062. {
  1063. return asinf4( sine );
  1064. }
  1065. // DivSIMD defined further down, since it uses ReciprocalSIMD
  1066. FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
  1067. {
  1068. return vec_max( a, b );
  1069. }
  1070. FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
  1071. {
  1072. return vec_min( a, b );
  1073. }
  1074. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
  1075. {
  1076. return vec_and( a, b );
  1077. }
  1078. FORCEINLINE fltx4 AndSIMD( const bi32x4 & a, const fltx4 & b ) // a & b
  1079. {
  1080. return vec_and( (fltx4)a, b );
  1081. }
  1082. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const bi32x4 & b ) // a & b
  1083. {
  1084. return vec_and( a, (fltx4)b );
  1085. }
  1086. FORCEINLINE bi32x4 AndSIMD( const bi32x4 & a, const bi32x4 & b ) // a & b
  1087. {
  1088. return vec_and( a, b );
  1089. }
  1090. #if 0
  1091. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
  1092. {
  1093. // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
  1094. return vec_andc( b, a);
  1095. }
  1096. FORCEINLINE fltx4 AndNotSIMD( const bi32x4 & a, const fltx4 & b ) // ~a & b
  1097. {
  1098. // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
  1099. return vec_andc( b, (fltx4)a);
  1100. }
  1101. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const bi32x4 & b ) // ~a & b
  1102. {
  1103. // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
  1104. return (fltx4)vec_andc( b, (bi32x4)a);
  1105. }
  1106. FORCEINLINE bi32x4 AndNotSIMD( const bi32x4 & a, const bi32x4 & b ) // ~a & b
  1107. {
  1108. // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
  1109. return vec_andc( b, a);
  1110. }
  1111. #else
  1112. template< typename T, typename U >
  1113. FORCEINLINE T AndNotSIMD( const T &a, const U &b ) // ~a & b
  1114. {
  1115. return vec_andc( b, (T)a );
  1116. }
  1117. // specialize for the case of bi, flt
  1118. FORCEINLINE fltx4 AndNotSIMD( const bi32x4 &a, const fltx4 &b ) // ~a & b
  1119. {
  1120. return vec_andc( b, (fltx4)a );
  1121. }
  1122. #endif
  1123. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
  1124. {
  1125. return vec_xor( a, b );
  1126. }
  1127. FORCEINLINE fltx4 XorSIMD( const bi32x4 & a, const fltx4 & b ) // a ^ b
  1128. {
  1129. return vec_xor( (fltx4)a, b );
  1130. }
  1131. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const bi32x4 & b ) // a ^ b
  1132. {
  1133. return vec_xor( a, (fltx4)b );
  1134. }
  1135. FORCEINLINE bi32x4 XorSIMD( const bi32x4 & a, const bi32x4 & b ) // a ^ b
  1136. {
  1137. return vec_xor( a, b );
  1138. }
  1139. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
  1140. {
  1141. return vec_or( a, b );
  1142. }
  1143. FORCEINLINE fltx4 OrSIMD( const bi32x4 & a, const fltx4 & b ) // a | b
  1144. {
  1145. return vec_or( (fltx4)a, b );
  1146. }
  1147. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const bi32x4 & b ) // a | b
  1148. {
  1149. return vec_or( a, (fltx4)b );
  1150. }
  1151. FORCEINLINE i32x4 OrSIMD( const i32x4 & a, const i32x4 & b ) // a | b
  1152. {
  1153. return vec_or( a, b );
  1154. }
  1155. FORCEINLINE u32x4 OrSIMD( const u32x4 & a, const u32x4 & b ) // a | b
  1156. {
  1157. return vec_or( a, b );
  1158. }
  1159. #if !defined(__SPU__) // bi32x4 typedef to same as u32x4 on SPU
  1160. FORCEINLINE bi32x4 OrSIMD( const bi32x4 & a, const bi32x4 & b ) // a | b
  1161. {
  1162. return vec_or( a, b );
  1163. }
  1164. #endif
  1165. FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
  1166. {
  1167. return( SubSIMD( _VEC_ZEROF, a ) );
  1168. // untested
  1169. // vec_float4 signMask;
  1170. // vec_float4 result;
  1171. // signMask = vec_splat_s32(-1);
  1172. // signMask = vec_sll(signMask, signMask);
  1173. // result = vec_xor(a, signMask);
  1174. // return result;
  1175. }
  1176. FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
  1177. {
  1178. return vec_any_eq( a, _VEC_ZEROF );
  1179. }
  1180. FORCEINLINE bool IsAnyZeros( const bi32x4 & a ) // any floats are zero?
  1181. {
  1182. return vec_any_eq( (u32x4)a, _VEC_ZERO );
  1183. }
  1184. FORCEINLINE bool IsAllZeros( const bi32x4 & a ) // all floats of a zero?
  1185. {
  1186. return vec_all_eq( (u32x4)a, _VEC_ZERO );
  1187. }
  1188. FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
  1189. {
  1190. #if SN_IMPROVED_INTRINSICS
  1191. // push 1.0 into w (NON-ZERO)
  1192. fltx4 b = vec_insert(1.0f,a,3);
  1193. return vec_any_eq( b, _VEC_ZEROF );
  1194. #else
  1195. fltx4 b = vec_perm(a,_VEC_ONEF,_VEC_PERMUTE_XYZ0W1);
  1196. return vec_any_eq( b, _VEC_ZEROF );
  1197. #endif
  1198. }
  1199. // for branching when a.xyzw > b.xyzw
  1200. FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
  1201. {
  1202. return vec_all_gt( a, b );
  1203. }
  1204. // for branching when a.xyzw >= b.xyzw
  1205. FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  1206. {
  1207. return vec_all_ge(a,b);
  1208. }
  1209. FORCEINLINE bool IsAllEqual( const fltx4 &a, const fltx4 &b )
  1210. {
  1211. return vec_all_eq(a,b);
  1212. }
  1213. FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
  1214. {
  1215. // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
  1216. int nRet = 0;
  1217. fltx4_union a_union;
  1218. vec_st(a,0,&a_union.vmxf);
  1219. nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
  1220. nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
  1221. nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
  1222. nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3
  1223. return nRet;
  1224. }
  1225. FORCEINLINE int TestSignSIMD( const bi32x4 & a ) // mask of which floats have the high bit set
  1226. {
  1227. // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
  1228. int nRet = 0;
  1229. fltx4_union a_union;
  1230. vec_st(a,0,&a_union.vmxbi);
  1231. nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
  1232. nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
  1233. nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
  1234. nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3
  1235. return nRet;
  1236. }
  1237. FORCEINLINE bool IsAnyNegative( const bi32x4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  1238. {
  1239. return (0 != TestSignSIMD( a ));
  1240. }
  1241. // Squelch the w component of a vector to +0.0.
  1242. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
  1243. FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
  1244. {
  1245. return (fltx4)vec_and( (u32x4)a, _VEC_CLEAR_WMASK );
  1246. }
  1247. FORCEINLINE bi32x4 SetWToZeroSIMD( const bi32x4 & a )
  1248. {
  1249. return (bi32x4)vec_and( (u32x4)a, _VEC_CLEAR_WMASK );
  1250. }
  1251. FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  1252. {
  1253. // NOTE: this tests the top bits of each vector element using integer math
  1254. // (so it ignores NaNs - it will return true for "-NaN")
  1255. return vec_any_lt( a, _VEC_ZEROF );
  1256. }
  1257. FORCEINLINE bool IsAnyTrue( const fltx4 & a )
  1258. {
  1259. return vec_any_ne( a, _VEC_ZEROF );
  1260. }
  1261. #ifdef DIFFERENT_NATIVE_VECTOR_TYPES
  1262. FORCEINLINE bool IsAnyTrue( const bi32x4 & a )
  1263. {
  1264. return vec_any_ne( (vector unsigned int) a, _VEC_0L );
  1265. }
  1266. #endif
  1267. FORCEINLINE bi32x4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
  1268. {
  1269. return (bi32x4)vec_cmpeq( a, b );
  1270. }
  1271. FORCEINLINE bi32x4 CmpEqSIMD( const i32x4 & a, const i32x4 & b ) // (a==b) ? ~0:0
  1272. {
  1273. return (bi32x4)vec_cmpeq( a, b );
  1274. }
  1275. FORCEINLINE bi32x4 CmpEqSIMD( const u32x4 & a, const u32x4 & b ) // (a==b) ? ~0:0
  1276. {
  1277. return (bi32x4)vec_cmpeq( a, b );
  1278. }
  1279. FORCEINLINE bi32x4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
  1280. {
  1281. return (bi32x4)vec_cmpgt( a, b );
  1282. }
  1283. FORCEINLINE bi32x4 CmpGtSIMD( const i32x4 & a, const i32x4 & b ) // (a>b) ? ~0:0
  1284. {
  1285. return (bi32x4)vec_cmpgt( a, b );
  1286. }
  1287. FORCEINLINE bi32x4 CmpGtSIMD( const u32x4 & a, const u32x4 & b ) // (a>b) ? ~0:0
  1288. {
  1289. return (bi32x4)vec_cmpgt( a, b );
  1290. }
  1291. FORCEINLINE bi32x4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
  1292. {
  1293. return (bi32x4)vec_cmpge( a, b );
  1294. }
  1295. FORCEINLINE bi32x4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
  1296. {
  1297. return (bi32x4)vec_cmplt( a, b );
  1298. }
  1299. FORCEINLINE bi32x4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
  1300. {
  1301. return (bi32x4)vec_cmple( a, b );
  1302. }
  1303. FORCEINLINE bi32x4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
  1304. {
  1305. i32x4 control;
  1306. control = vec_cmpb(a,b);
  1307. return (bi32x4)vec_cmpeq( (u32x4)control, _VEC_ZERO );
  1308. }
  1309. FORCEINLINE int CmpAnyLeSIMD( const fltx4 & a, const fltx4 & b )
  1310. {
  1311. return vec_any_le( a, b );
  1312. }
  1313. FORCEINLINE int CmpAnyGeSIMD( const fltx4 & a, const fltx4 & b )
  1314. {
  1315. return vec_any_ge( a, b );
  1316. }
  1317. FORCEINLINE int CmpAnyLtSIMD( const fltx4 & a, const fltx4 & b )
  1318. {
  1319. return vec_any_lt( a, b );
  1320. }
  1321. FORCEINLINE int CmpAnyLtSIMD( const bi32x4 & a, const i32x4 & b )
  1322. {
  1323. return vec_any_lt( (i32x4)a, b );
  1324. }
  1325. FORCEINLINE int CmpAnyGtSIMD( const fltx4 & a, const fltx4 & b )
  1326. {
  1327. return vec_any_gt( a, b );
  1328. }
  1329. FORCEINLINE int CmpAnyNeSIMD( const fltx4 & a, const fltx4 & b )
  1330. {
  1331. return vec_any_ne( a, b );
  1332. }
  1333. FORCEINLINE int CmpAnyNeSIMD( const bi32x4 & a, const bi32x4 & b )
  1334. {
  1335. return vec_any_ne( a, b );
  1336. }
  1337. FORCEINLINE int CmpAnyNeSIMD( const bi32x4 & a, const i32x4 & b )
  1338. {
  1339. return vec_any_ne( a, (bi32x4)b );
  1340. }
  1341. FORCEINLINE int CmpAllLeSIMD( const fltx4 & a, const fltx4 & b )
  1342. {
  1343. return vec_all_le( a, b );
  1344. }
  1345. FORCEINLINE fltx4 MaskedAssign( const bi32x4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  1346. {
  1347. return vec_sel( OldValue, NewValue, ReplacementMask );
  1348. }
  1349. FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  1350. {
  1351. return vec_sel( OldValue, NewValue, (const bi32x4) ReplacementMask );
  1352. }
  1353. FORCEINLINE vector signed short MaskedAssign( const vector unsigned short & ReplacementMask, const vector signed short & NewValue, const vector signed short & OldValue )
  1354. {
  1355. return vec_sel( OldValue, NewValue, ReplacementMask );
  1356. }
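/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
MaskedAssign is the branch-free way to do a per-lane "if": build a mask with one
of the compare ops above, then select. For example, picking between two values
wherever v exceeds a threshold:
fltx4 SelectAboveThreshold( const fltx4 & v, const fltx4 & vThreshold, const fltx4 & vIfAbove, const fltx4 & vIfBelow )
{
	bi32x4 mask = CmpGtSIMD( v, vThreshold );		// ~0 where v > vThreshold
	return MaskedAssign( mask, vIfAbove, vIfBelow );
}
*****/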
  1357. // AKA "Broadcast", "Splat"
  1358. FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
  1359. {
  1360. #if SN_IMPROVED_INTRINSICS
  1361. return vec_splats(flValue);
  1362. #else
  1363. // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  1364. float * pValue = &flValue;
  1365. Assert( pValue );
  1366. Assert( ((unsigned int)pValue & 3) == 0);
  1367. fltx4 result;
  1368. result = vec_ld(0, pValue);
  1369. result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );
  1370. return result;
  1371. #endif
  1372. }
  1373. FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a
  1374. {
  1375. #if SN_IMPROVED_INTRINSICS
  1376. return vec_splats(*pValue);
  1377. #else
  1378. Assert( pValue );
  1379. fltx4 result;
  1380. result = vec_ld(0, pValue);
  1381. result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );
  1382. return result;
  1383. #endif
  1384. }
  1385. /// replicate a single 32 bit integer value to all 4 components of an m128
  1386. FORCEINLINE i32x4 ReplicateIX4( int nValue )
  1387. {
  1388. #if SN_IMPROVED_INTRINSICS
  1389. return vec_splats(nValue);
  1390. #else
  1391. // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
  1392. int * pValue = &nValue;
  1393. Assert( pValue );
  1394. Assert( ((unsigned int)pValue & 3) == 0);
  1395. i32x4 result;
  1396. result = vec_ld(0, pValue);
  1397. result = vec_splat( vec_perm( result, result, vec_lvsl(0, pValue) ), 0 );
  1398. return result;
  1399. #endif
  1400. }
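/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
broadcast the scalar once and stay in vector registers afterwards -- this is how
you avoid the fpu<->vpu load-hit-store traffic the notes above warn about:
fltx4 ScaleByFloat( const fltx4 & v, float flScale )
{
	fltx4 vScale = ReplicateX4( flScale );	// flScale in all four lanes
	return MulSIMD( v, vScale );
}
*****/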
  1401. FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
  1402. {
  1403. return sqrtf4(a);
  1404. }
  1405. FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
  1406. {
  1407. #if defined( _PS3 ) && !defined( SPU )
  1408. // This is exactly what the Xbox 360 does in XMVectorSqrtEst
  1409. fltx4 vRecipSquareRoot = vec_rsqrte( a );
  1410. i32x4 vOne = vec_splat_s32( 1 );
  1411. i32x4 vAllOnes = vec_splat_s32( -1 );
  1412. i32x4 vShiftLeft24 = vec_splat_s32( -8 ); // -8 is the same bit pattern as 24 with a 5-bit mask
  1413. fltx4 vZero = (fltx4)vec_splat_s32( 0 );
  1414. u32x4 vInputShifted = vec_sl( (u32x4)a, (u32x4)vOne );
  1415. u32x4 vInfinityShifted = vec_sl( (u32x4)vAllOnes, (u32x4)vShiftLeft24 );
  1416. bi32x4 vEqualsZero = vec_vcmpeqfp( a, vZero );
  1417. bi32x4 vEqualsInfinity = vec_vcmpequw( vInputShifted, vInfinityShifted );
  1418. fltx4 vSquareRoot = vec_madd( a, vRecipSquareRoot, _VEC_ZEROF );
  1419. bi32x4 vResultMask = vec_vcmpequw( (u32x4)vEqualsInfinity, (u32x4)vEqualsZero ); // mask has 1s wherever the square root is valid
  1420. fltx4 vCorrectedSquareRoot = vec_sel( a, vSquareRoot, vResultMask );
  1421. return vCorrectedSquareRoot;
  1422. #else
  1423. return SqrtSIMD( a );
  1424. #endif
  1425. }
  1426. FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
  1427. {
  1428. return vec_rsqrte( a );
  1429. }
  1430. FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
  1431. {
  1432. // This matches standard library function rsqrtf4
  1433. fltx4 result;
  1434. vmathV4RsqrtPerElem( (VmathVector4 *)&result, (const VmathVector4 *)&a );
  1435. return result;
  1436. }
  1437. FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
  1438. {
  1439. return vec_re( a );
  1440. }
  1441. /// 1/x for all 4 values, more or less
  1442. /// 1/0 will result in a big but NOT infinite result
  1443. FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
  1444. {
  1445. bi32x4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  1446. fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  1447. ret = ReciprocalEstSIMD( ret );
  1448. return ret;
  1449. }
  1450. /// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
  1451. /// No error checking!
  1452. FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
  1453. {
  1454. // This matches standard library function recipf4
  1455. fltx4 result;
  1456. vmathV4RecipPerElem ( (VmathVector4 *)&result, (const VmathVector4 *)&a );
  1457. return result;
  1458. }
  1459. FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  1460. {
  1461. return MulSIMD( ReciprocalSIMD( b ), a );
  1462. }
  1463. FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // Est(a/b)
  1464. {
  1465. return MulSIMD( ReciprocalEstSIMD( b ), a );
  1466. }
  1467. /// 1/x for all 4 values.
  1468. /// 1/0 will result in a big but NOT infinite result
  1469. FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
  1470. {
  1471. // Convert zeros to epsilons
  1472. bi32x4 zero_mask = CmpEqSIMD( a, _VEC_ZEROF );
  1473. fltx4 a_safe = OrSIMD( a, AndSIMD( _VEC_EPSILONF, zero_mask ) );
  1474. return ReciprocalSIMD( a_safe );
  1475. // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
  1476. // fltx4 zeroMask = CmpEqSIMD( gFour_Zeros, a );
  1477. // fltx4 a_safe = XMVectorSelect( a, gFour_Epsilons, zeroMask );
  1478. // return ReciprocalSIMD( a_safe );
  1479. }
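/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
the "reciprocal approximation plus newton iteration" mentioned above looks like
this when written out by hand (Four_Twos is the 2,2,2,2 constant declared with
the other Four_* constants); ReciprocalSIMD already does the equivalent for you:
fltx4 ReciprocalNewtonRaphson1( const fltx4 & a )
{
	fltx4 x0 = ReciprocalEstSIMD( a );				// rough 1/a estimate
	fltx4 e  = SubSIMD( Four_Twos, MulSIMD( a, x0 ) );	// 2 - a*x0
	return MulSIMD( x0, e );					// x1 = x0*(2 - a*x0), one refinement step
}
*****/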
  1480. // CHRISG: is it worth doing integer bitfiddling for this?
  1481. // 2^x for all values (the antilog)
  1482. FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
  1483. {
  1484. return exp2f4(toPower);
  1485. }
  1486. // a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point",
  1487. // which is accurate to four bits of mantissa.
  1488. FORCEINLINE fltx4 Exp2EstSIMD( const fltx4 &f )
  1489. {
  1490. return exp2f4fast( f );
  1491. }
  1492. // Clamps the components of a vector to a specified minimum and maximum range.
  1493. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
  1494. {
  1495. fltx4 result = vec_max(min, in);
  1496. return vec_min(max, result);
  1497. }
  1498. FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
  1499. {
  1500. #if SN_IMPROVED_INTRINSICS
  1501. fltx4 v0, v1;
  1502. Assert( pSIMD );
  1503. v0 = (fltx4)vec_lvlx( 0, (float*)pSIMD );
  1504. v1 = (fltx4)vec_lvrx( 16, (float*)pSIMD );
  1505. return vec_or(v0, v1);
  1506. #else
  1507. fltx4 v0, v1;
  1508. vector unsigned char permMask;
  1509. Assert( pSIMD );
  1510. v0 = vec_ld( 0, pSIMD );
  1511. permMask = vec_lvsl( 0, pSIMD );
  1512. v1 = vec_ld( 15, pSIMD );
  1513. return vec_perm(v0, v1, permMask);
  1514. #endif
  1515. }
  1516. FORCEINLINE fltx4 LoadUnsignedByte4SIMD( unsigned char *pBytes ) // unpack contiguous 4 bytes into vec float 4
  1517. {
  1518. #if SN_IMPROVED_INTRINSICS
  1519. __vector unsigned char res_uc;
  1520. __vector unsigned short res_us;
  1521. __vector unsigned char vZero8 = (__vector unsigned char)vec_splat_u8(0);
  1522. __vector unsigned short vZero16 = (__vector unsigned short)vec_splat_u16(0);
  1523. res_uc = (__vector unsigned char)vec_lvlx(0, pBytes);
  1524. res_uc = vec_mergeh( vZero8, res_uc );
  1525. res_us = vec_mergeh( vZero16, (__vector unsigned short)res_uc );
  1526. return vec_ctf( (__vector unsigned int)res_us, 0);
  1527. #else
  1528. vector unsigned char v0, v1;
  1529. vector bool char res_uc;
  1530. vector unsigned char permMask;
  1531. vector bool short res_us;
  1532. vector bool char vZero8 = (vector bool char)vec_splat_u8(0);
  1533. vector bool short vZero16 = (vector bool short)vec_splat_u16(0);
  1534. v0 = vec_ld(0, pBytes);
  1535. permMask = vec_lvsl(0, pBytes);
  1536. v1 = vec_ld(3, pBytes);
  1537. res_uc = (vector bool char)vec_perm(v0, v1, permMask);
  1538. res_uc = vec_mergeh( vZero8, res_uc );
  1539. res_us = vec_mergeh( vZero16, (vector bool short)res_uc );
  1540. return vec_ctf( (vector unsigned int)res_us, 0);
  1541. #endif
  1542. }
  1543. FORCEINLINE fltx4 LoadSignedByte4SIMD( signed char *pBytes ) // unpack contiguous 4 bytes into vec float 4
  1544. {
  1545. #if SN_IMPROVED_INTRINSICS
  1546. vector signed char res_uc;
  1547. vector signed short res_us;
  1548. vector signed int res_ui;
  1549. res_uc = (vector signed char)vec_lvlx(0, pBytes);
  1550. res_us = vec_unpackh( res_uc );
  1551. res_ui = vec_unpackh( res_us );
  1552. return vec_ctf( res_ui, 0);
  1553. #else
  1554. vector signed char v0, v1, res_uc;
  1555. vector unsigned char permMask;
  1556. vector signed short res_us;
  1557. vector signed int res_ui;
  1558. v0 = vec_ld(0, pBytes);
  1559. permMask = vec_lvsl(0, pBytes);
  1560. v1 = vec_ld(3, pBytes);
  1561. res_uc = vec_perm(v0, v1, permMask);
  1562. res_us = vec_unpackh( res_uc );
  1563. res_ui = vec_unpackh( res_us );
  1564. return vec_ctf( res_ui, 0);
  1565. #endif
  1566. }
  1567. // load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
  1568. FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
  1569. {
  1570. Assert( pSIMD );
  1571. fltx4 v0 = vec_ld( 0, ( float * )( pSIMD ) );
  1572. vector unsigned char permMask = vec_lvsl( 0, ( float * ) ( pSIMD ) );
  1573. fltx4 v1 = vec_ld( 11, ( float * )( pSIMD ) );
  1574. return vec_perm( v0, v1, permMask );
  1575. }
  1576. // load a single unaligned float into the x component of a SIMD word
  1577. FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
  1578. {
  1579. fltx4 v0 = vec_lde( 0, const_cast<float *>(pFlt) );
  1580. vector unsigned char permMask = vec_lvsl( 0, const_cast<float *>(pFlt) );
  1581. return vec_perm( v0, v0, permMask );
  1582. }
  1583. FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
  1584. {
  1585. return vec_ld( 0, ( float * )pSIMD );
  1586. }
  1587. #ifndef SPU
  1588. // No reason to support VectorAligned on SPU.
  1589. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  1590. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned &pSIMD )
  1591. {
  1592. fltx4 out;
  1593. out = vec_ld( 0, pSIMD.Base() );
  1594. // squelch w
  1595. return (fltx4)vec_and( (u32x4)out, _VEC_CLEAR_WMASK );
  1596. }
  1597. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  1598. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
  1599. {
  1600. fltx4 out;
  1601. out = vec_ld( 0, pSIMD->Base() );
  1602. // squelch w
  1603. return (fltx4)vec_and( (u32x4)out, _VEC_CLEAR_WMASK );
  1604. }
  1605. // strongly typed -- for typechecking as we transition to SIMD
  1606. FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
  1607. {
  1608. vec_st(a, 0, pSIMD->Base());
  1609. }
  1610. #endif
  1611. FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
  1612. {
  1613. vec_st(a, 0, pSIMD);
  1614. }
  1615. FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
  1616. {
  1617. #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
  1618. vec_stvlx( a, 0, pSIMD);
  1619. vec_stvrx( a, 16, pSIMD);
  1620. #else
  1621. fltx4_union a_union;
  1622. vec_st(a, 0, &a_union.vmxf);
  1623. pSIMD[0] = a_union.m128_f32[0];
  1624. pSIMD[1] = a_union.m128_f32[1];
  1625. pSIMD[2] = a_union.m128_f32[2];
  1626. pSIMD[3] = a_union.m128_f32[3];
  1627. #endif
  1628. }
  1629. FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
  1630. {
  1631. fltx4_union a_union;
  1632. vec_st(a, 0, &a_union.vmxf);
  1633. pSIMD[0] = a_union.m128_f32[0];
  1634. pSIMD[1] = a_union.m128_f32[1];
  1635. pSIMD[2] = a_union.m128_f32[2];
  1636. };
  1637. #ifndef SPU
  1638. // No reason to support unaligned Vectors on SPU
  1639. FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d );
  1640. // construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
  1641. FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
  1642. {
  1643. #if USING_POINTLESSLY_SLOW_SONY_CODE
  1644. return vmathV4MakeFromElems_V( x,y,z,w ).vec128;
  1645. #else
  1646. // load the float into the low word of each vector register (this exploits the unaligned load op)
  1647. fltx4 vx = vec_lvlx( 0, &x );
  1648. fltx4 vy = vec_lvlx( 0, &y );
  1649. fltx4 vz = vec_lvlx( 0, &z );
  1650. fltx4 vw = vec_lvlx( 0, &w );
  1651. return Compress4SIMD( vx, vy, vz, vw );
  1652. #endif
  1653. }
  1654. // Store the x,y,z components of the four FLTX4 parameters
  1655. // into the four consecutive Vectors:
  1656. // pDestination[0], pDestination[1], pDestination[2], pDestination[3]
  1657. // The Vectors are assumed to be unaligned.
  1658. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  1659. Vector * const pDestination )
  1660. {
  1661. StoreUnaligned3SIMD( pDestination->Base(), a );
  1662. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  1663. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  1664. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  1665. }
  1666. // Store the x,y,z components of the four FLTX4 parameters
  1667. // into the four consecutive Vectors:
  1668. // pDestination , pDestination + 1, pDestination + 2, pDestination + 3
  1669. // The Vectors are assumed to start on an ALIGNED address, that is,
1670. // pDestination is 16-byte aligned (though obviously pDestination+1 is not).
  1671. FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  1672. Vector * const pDestination )
  1673. {
  1674. StoreUnaligned3SIMD( pDestination->Base(), a );
  1675. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  1676. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  1677. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  1678. }
  1679. #endif
  1680. // Fixed-point conversion and save as SIGNED INTS.
  1681. // pDest->x = Int (vSrc.x)
  1682. // note: some architectures have means of doing
  1683. // fixed point conversion when the fix depth is
1684. // specified as an immediate, but there is no way
1685. // to guarantee an immediate as a parameter to a
1686. // function like this.
  1687. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
  1688. {
  1689. i32x4 asInt = vec_cts( vSrc, 0 );
  1690. vec_st(asInt, 0, pDest->Base());
  1691. }
  1692. FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
  1693. {
  1694. fltx4 p0, p1, p2, p3;
  1695. p0 = vec_mergeh(x, z);
  1696. p1 = vec_mergeh(y, w);
  1697. p2 = vec_mergel(x, z);
  1698. p3 = vec_mergel(y, w);
  1699. x = vec_mergeh(p0, p1);
  1700. y = vec_mergel(p0, p1);
  1701. z = vec_mergeh(p2, p3);
  1702. w = vec_mergel(p2, p3);
  1703. }
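/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
TransposeSIMD is the usual way to turn four (x,y,z,w) registers into
structure-of-arrays form so the same math can run on four points at once:
void LoadFourPointsSoA( const float * RESTRICT pFourPoints,	// 16 aligned floats: x0 y0 z0 w0 x1 y1 z1 w1 ...
						fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
{
	x = LoadAlignedSIMD( pFourPoints + 0 );
	y = LoadAlignedSIMD( pFourPoints + 4 );
	z = LoadAlignedSIMD( pFourPoints + 8 );
	w = LoadAlignedSIMD( pFourPoints + 12 );
	TransposeSIMD( x, y, z, w );	// now x = {x0,x1,x2,x3}, y = {y0,y1,y2,y3}, etc.
}
*****/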
1704. // Return zero in the fastest way -- faster even than loading.
  1705. FORCEINLINE fltx4 LoadZeroSIMD( void )
  1706. {
  1707. return _VEC_ZEROF;
  1708. }
  1709. FORCEINLINE i32x4 LoadZeroISIMD( void )
  1710. {
  1711. return vec_splat_s32(0);
  1712. }
  1713. // Return one in the fastest way -- faster even than loading.
  1714. FORCEINLINE fltx4 LoadOneSIMD( void )
  1715. {
  1716. return _VEC_ONEF;
  1717. }
  1718. FORCEINLINE i32x4 LoadOneISIMD( void )
  1719. {
  1720. return vec_splat_s32(1);
  1721. }
  1722. FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
  1723. {
  1724. return vec_splat(a,0);
  1725. }
  1726. FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
  1727. {
  1728. return vec_splat(a,1);
  1729. }
  1730. FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
  1731. {
  1732. return vec_splat(a,2);
  1733. }
  1734. FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
  1735. {
  1736. return vec_splat(a,3);
  1737. }
  1738. FORCEINLINE bi32x4 SplatXSIMD( bi32x4 a )
  1739. {
  1740. return vec_splat(a,0);
  1741. }
  1742. FORCEINLINE bi32x4 SplatYSIMD( bi32x4 a )
  1743. {
  1744. return vec_splat(a,1);
  1745. }
  1746. FORCEINLINE bi32x4 SplatZSIMD( bi32x4 a )
  1747. {
  1748. return vec_splat(a,2);
  1749. }
  1750. FORCEINLINE bi32x4 SplatWSIMD( bi32x4 a )
  1751. {
  1752. return vec_splat(a,3);
  1753. }
  1754. FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
  1755. {
  1756. return vec_sel(a,x, _VEC_COMPONENT_MASK_0);
  1757. }
  1758. FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
  1759. {
  1760. return vec_sel(a,y, _VEC_COMPONENT_MASK_1);
  1761. }
  1762. FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
  1763. {
  1764. return vec_sel(a,z, _VEC_COMPONENT_MASK_2);
  1765. }
  1766. FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
  1767. {
  1768. return vec_sel(a,w, _VEC_COMPONENT_MASK_3);
  1769. }
  1770. FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
  1771. {
  1772. #if SN_IMPROVED_INTRINSICS
  1773. return vec_insert( flValue, a, nComponent );
  1774. #else
  1775. fltx4_union a_union;
  1776. a_union.vmxf = vec_ld(0,&a);
  1777. a_union.m128_f32[nComponent] = flValue;
  1778. return a_union.vmxf;
  1779. #endif
  1780. }
  1781. FORCEINLINE float GetComponentSIMD( const fltx4& a, int nComponent )
  1782. {
  1783. #if SN_IMPROVED_INTRINSICS
  1784. return vec_extract( a, nComponent );
  1785. #else
  1786. fltx4_union a_union;
  1787. a_union.vmxf = vec_ld(0,&a);
  1788. return a_union.m128_f32[nComponent];
  1789. #endif
  1790. }
  1791. FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
  1792. {
  1793. return vec_sld(a,a,4);
  1794. }
  1795. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
  1796. {
  1797. return vec_sld(a,a,8);
  1798. }
  1799. FORCEINLINE fltx4 RotateRight( const fltx4 & a )
  1800. {
  1801. return vec_sld(a,a,12);
  1802. }
  1803. FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
  1804. {
  1805. return vec_sld(a,a,8);
  1806. }
1807. // shift a vector left by an arbitrary number of
1808. // bits known at compile time. The bit count is a
1809. // template parameter because it's actually used as an
1810. // immediate field in an instruction, i.e. it absolutely
1811. // must be known at compile time. nBits >= 128 simply
1812. // returns zero.
  1813. // zeroes are shifted in from the right
  1814. template < uint nBits, typename T >
  1815. FORCEINLINE T ShiftLeftByBits(const T &a)
  1816. {
  1817. // hopefully the compiler, seeing nBits as a const immediate, elides these ifs
  1818. if ( nBits >= 128 ) // WTF are you doing?!
  1819. {
  1820. return (T) LoadZeroSIMD();
  1821. }
  1822. else if ( nBits == 0 )
  1823. {
  1824. return a;
  1825. }
  1826. else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first
  1827. {
  1828. T t = vec_sld( a, ((T)LoadZeroSIMD()), (nBits >> 3) ); // rotated left by octets
  1829. return ShiftLeftByBits< (nBits & 0x7) >( t );
  1830. }
  1831. else // we need to rotate by <= 7 bits
  1832. {
  1833. // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
  1834. // the splat, however, does require an immediate. Go IBM!
  1835. vector unsigned int shifter = (vector unsigned int) (vec_splat_s8( ((signed char)(nBits & 0x7)) ));
  1836. return (T) vec_sll( (vector signed int) a, shifter );
  1837. }
  1838. }
  1839. // as above, but shift right
  1840. template < uint nBits, typename T >
  1841. FORCEINLINE T ShiftRightByBits(const T &a)
  1842. {
  1843. // hopefully the compiler, seeing nBits as a const immediate, elides these ifs
  1844. if ( nBits >= 128 ) // WTF are you doing?!
  1845. {
  1846. return (T) LoadZeroSIMD();
  1847. }
  1848. else if ( nBits == 0 )
  1849. {
  1850. return a;
  1851. }
  1852. else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first
  1853. {
  1854. T t = vec_sld( ((T)LoadZeroSIMD()), a, 16 - (nBits >> 3) ); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen.
  1855. return ShiftRightByBits< (nBits & 0x7) >( t );
  1856. }
  1857. else // we need to rotate by <= 7 bits
  1858. {
  1859. // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
  1860. // the splat, however, does require an immediate. Go IBM!
  1861. vector unsigned int shifter = (vector unsigned int) (vec_splat_s8( ((signed char)(nBits & 0x7)) ));
  1862. return (T) vec_srl( (vector unsigned int) a, shifter );
  1863. }
  1864. }
  1865. /**** an example of ShiftLeftByBits:
  1866. fltx4 ShiftByTwentyOne( fltx4 foo )
  1867. {
  1868. return ShiftLeftByBits<21>(foo);
  1869. }
  1870. compiles to:
  1871. ShiftByTwentyOne(float __vector):
  1872. 0x000059FC: 0x1060038C vspltisw v3,0 PIPE
  1873. 0x00005A00: 0x1085030C vspltisb v4,5
  1874. 0x00005A04: 0x104218AC vsldoi v2,v2,v3,2 02 (000059FC) REG PIPE
  1875. 0x00005A08: 0x104221C4 vsl v2,v2,v4 03 (00005A04) REG
  1876. 0x00005A0C: 0x4E800020 blr
  1877. *****/
  1878. // find the lowest component of a.x, a.y, a.z,
  1879. // and replicate it to the whole return value.
  1880. // ignores a.w.
  1881. // Forcing this inline should hopefully help with scheduling.
  1882. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
  1883. {
  1884. fltx4 result;
  1885. fltx4 x = vec_splat( a, 0 );
  1886. fltx4 y = vec_splat( a, 1 );
  1887. fltx4 z = vec_splat( a, 2 );
  1888. if ( vec_any_nan( a ) )
  1889. {
  1890. x = vec_all_nan( x ) ? _VEC_FLTMAX : x;
  1891. y = vec_all_nan( y ) ? _VEC_FLTMAX : y;
  1892. z = vec_all_nan( z ) ? _VEC_FLTMAX : z;
  1893. }
  1894. result = vec_min( y, x );
  1895. result = vec_min( z, result );
  1896. return result;
  1897. }
  1898. // find the highest component of a.x, a.y, a.z,
  1899. // and replicate it to the whole return value.
  1900. // ignores a.w.
1901. // The splats and max ops are all dependent on one
1902. // another, making this stall city.
  1903. // Forcing this inline should hopefully help with scheduling.
  1904. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
  1905. {
  1906. fltx4 result;
  1907. fltx4 x = vec_splat( a, 0 );
  1908. fltx4 y = vec_splat( a, 1 );
  1909. fltx4 z = vec_splat( a, 2 );
  1910. if ( vec_any_nan( a ) )
  1911. {
  1912. x = vec_all_nan( x ) ? _VEC_FLTMIN : x;
  1913. y = vec_all_nan( y ) ? _VEC_FLTMIN : y;
  1914. z = vec_all_nan( z ) ? _VEC_FLTMIN : z;
  1915. }
  1916. result = vec_max( y, x );
  1917. result = vec_max( z, result );
  1918. return result;
  1919. }
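/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
because the result is splatted to all four lanes it can be used directly in
further vector math, e.g. rescaling so the largest of x,y,z becomes 1:
fltx4 NormalizeToLargestComponent( const fltx4 & v )
{
	fltx4 vMax = FindHighestSIMD3( v );			// max(x,y,z) in every lane
	return MulSIMD( v, ReciprocalSIMD( vMax ) );
}
*****/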
  1920. // ------------------------------------
  1921. // INTEGER SIMD OPERATIONS.
  1922. // ------------------------------------
  1923. // Load 4 aligned words into a SIMD register
  1924. FORCEINLINE i32x4 LoadAlignedIntSIMD(const int32 * RESTRICT pSIMD)
  1925. {
  1926. return vec_ld(0, const_cast<int32 *>(pSIMD));
  1927. }
  1928. // Load 4 unaligned words into a SIMD register
  1929. FORCEINLINE i32x4 LoadUnalignedIntSIMD(const int32 * RESTRICT pSIMD)
  1930. {
  1931. i32x4 v0, v1;
  1932. vector unsigned char permMask;
  1933. Assert( pSIMD );
  1934. v0 = vec_ld( 0, const_cast<int32 *>(pSIMD) );
  1935. permMask = vec_lvsl( 0, const_cast<int32 *>(pSIMD) );
  1936. v1 = vec_ld( 15, const_cast<int32 *>(pSIMD) );
  1937. return vec_perm(v0, v1, permMask);
  1938. }
  1939. // save into four words, 16-byte aligned
  1940. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const i32x4 & a )
  1941. {
  1942. vec_st(a,0,pSIMD);
  1943. }
  1944. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  1945. {
  1946. vec_st((i32x4)a,0,pSIMD);
  1947. }
  1948. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const i32x4 & a )
  1949. {
  1950. vec_st(a,0,pSIMD.Base());
  1951. }
  1952. FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const i32x4 & a )
  1953. {
  1954. #if SN_IMPROVED_INTRINSICS
  1955. // NOTE : NOT TESTED
  1956. vec_stvlx(a,0,pSIMD);
  1957. vec_stvrx(a,16,pSIMD);
  1958. #else
  1959. fltx4_union tmp;
  1960. vec_st(a,0,&tmp.vmxi);
  1961. pSIMD[0] = tmp.m128_u32[0];
  1962. pSIMD[1] = tmp.m128_u32[1];
  1963. pSIMD[2] = tmp.m128_u32[2];
  1964. pSIMD[3] = tmp.m128_u32[3];
  1965. #endif
  1966. }
  1967. // a={ a.x, a.z, b.x, b.z }
  1968. // combine two fltx4s by throwing away every other field.
  1969. FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
  1970. {
  1971. const int32 ALIGN16 n4shuffleACXZ[4] ALIGN16_POST = { 0x00010203, 0x08090A0B, 0x10111213, 0x18191A1B };
  1972. return vec_perm( a, b, (vec_uchar16)LoadAlignedIntSIMD( n4shuffleACXZ ) );
  1973. }
  1974. // a={ a.x, b.x, c.x, d.x }
  1975. // combine 4 fltx4s by throwing away 3/4s of the fields
  1976. // TODO: make more efficient by doing this in a parallel way at the caller
  1977. // Compress4SIMD(FourVectors.. )
  1978. FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
  1979. {
  1980. fltx4 ab = vec_mergeh( a, b ); // a.x, b.x, a.y, b.y
  1981. fltx4 cd = vec_mergeh( c, d ); // c.x, d.x...
  1982. static const int32 ALIGN16 shuffleABXY[4] ALIGN16_POST = { 0x00010203, 0x04050607, 0x10111213, 0x14151617 };
  1983. return vec_perm( ab, cd, (vec_uchar16)LoadAlignedIntSIMD( shuffleABXY ) );
  1984. }
  1985. // Take a fltx4 containing fixed-point uints and
  1986. // return them as single precision floats. No
  1987. // fixed point conversion is done.
  1988. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  1989. {
  1990. return vec_ctf(vSrcA,0);
  1991. }
  1992. // Take a fltx4 containing fixed-point sints and
  1993. // return them as single precision floats. No
  1994. // fixed point conversion is done.
  1995. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  1996. {
  1997. return vec_ctf(vSrcA,0);
  1998. }
  1999. // Take a fltx4 containing fixed-point uints and
  2000. // return them as single precision floats. Each uint
  2001. // will be divided by 2^immed after conversion
2002. // (i.e., this is fixed point math).
  2003. /* as if:
  2004. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
  2005. {
  2006. return vec_ctf(vSrcA,uImmed);
  2007. }
  2008. */
  2009. #define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) ))
  2010. // Take a fltx4 containing fixed-point sints and
  2011. // return them as single precision floats. Each int
2012. // will be divided by 2^immed (i.e., this is fixed point
  2013. // math).
  2014. /* as if:
  2015. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
  2016. {
  2017. return vec_ctf(vSrcA,uImmed);
  2018. }
  2019. */
  2020. #define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (vec_ctf( (vSrcA), (uImmed) ))
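/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
uImmed is the fixed-point shift, so four byte values already widened to uints
convert to [0, 255/256] floats in one op (the divide by 2^8 happens during the convert):
fltx4 ByteLanesToUnitFloats( const u32x4 & vBytes )	// each lane 0..255
{
	return UnsignedFixedIntConvertToFltSIMD( vBytes, 8 );
}
*****/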
  2021. // set all components of a vector to a signed immediate int number.
  2022. /* as if:
  2023. FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
  2024. {
  2025. return vec_splat_s32( toImmediate );
  2026. }
  2027. */
  2028. #define IntSetImmediateSIMD(x) (vec_splat_s32(x))
  2029. /*
  2030. works on fltx4's as if they are four uints.
  2031. the first parameter contains the words to be shifted,
  2032. the second contains the amount to shift by AS INTS
  2033. for i = 0 to 3
2034. shift = vSrcB[i] & 31 // low 5 bits of each word
2035. vReturned[i] = vSrcA[i] << shift
  2036. */
  2037. FORCEINLINE u32x4 IntShiftLeftWordSIMD(u32x4 vSrcA, u32x4 vSrcB)
  2038. {
  2039. return vec_sl(vSrcA, vSrcB);
  2040. }
  2041. FORCEINLINE float SubFloat( const fltx4 & a, int idx )
  2042. {
  2043. #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
  2044. return( vec_extract( a, idx ) );
  2045. #else // GCC 4.1.1
  2046. // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  2047. fltx4_union a_union;
  2048. vec_st(a, 0, &a_union.vmxf);
  2049. return a_union.m128_f32[idx];
  2050. #endif // GCC 4.1.1
  2051. }
  2052. FORCEINLINE float & SubFloat( fltx4 & a, int idx )
  2053. {
  2054. fltx4_union & a_union = (fltx4_union &)a;
  2055. return a_union.m128_f32[idx];
  2056. }
  2057. FORCEINLINE uint32 SubInt( const u32x4 & a, int idx )
  2058. {
  2059. #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
  2060. return( vec_extract( a, idx ) );
  2061. #else // GCC 4.1.1
  2062. fltx4_union a_union;
  2063. vec_st(a, 0, &a_union.vmxui);
  2064. return a_union.m128_u32[idx];
  2065. #endif // GCC 4.1.1
  2066. }
  2067. FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
  2068. {
  2069. #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
  2070. return( vec_extract( (u32x4)a, idx ) );
  2071. #else
  2072. fltx4_union a_union;
  2073. vec_st(a, 0, &a_union.vmxf);
  2074. return a_union.m128_u32[idx];
  2075. #endif
  2076. }
  2077. FORCEINLINE uint32 & SubInt( u32x4 & a, int idx )
  2078. {
  2079. fltx4_union & a_union = (fltx4_union &)a;
  2080. return a_union.m128_u32[idx];
  2081. }
  2082. FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
  2083. {
  2084. #if ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ == 1 ) && ( __GNUC_PATCHLEVEL__ == 1 )
  2085. return( vec_extract( vec_ctu( a, 0 ), idx ) );
  2086. #else
  2087. u32x4 t = vec_ctu( a, 0 );
  2088. return SubInt(t,idx);
  2089. #endif
  2090. }
  2091. // perform an Altivec permute op. There is no corresponding SSE op, so
  2092. // this function is missing from that fork. This is deliberate, because
  2093. // permute-based algorithms simply need to be abandoned and rebuilt
2094. // in a different way for SSE.
  2095. // (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm )
  2096. template< typename T, typename U >
  2097. FORCEINLINE T PermuteVMX( T a, T b, U swizzleMask )
  2098. {
  2099. return vec_perm( a, b, (vec_uchar16) swizzleMask );
  2100. }
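/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
the swizzle mask is a vector of byte indices into the 32-byte concatenation of
a and b, so reversing the component order of one register looks like this:
fltx4 ReverseComponents( const fltx4 & a )
{
	static const unsigned char ALIGN16 reverseMask[16] ALIGN16_POST =
		{ 12,13,14,15,  8,9,10,11,  4,5,6,7,  0,1,2,3 };	// w,z,y,x as byte indices
	return PermuteVMX( a, a, LoadAlignedSIMD( reverseMask ) );
}
*****/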
  2101. // __fsel(double fComparand, double fValGE, double fLT) == fComparand >= 0 ? fValGE : fLT
2102. // this is much faster than if ( aFloat >= 0 ) { x = .. }
  2103. #if !defined(__SPU__)
  2104. #define fsel __fsel
  2105. #endif
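/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header;
applies where the __fsel intrinsic is available, i.e. not on SPU):
fsel is the scalar (fpu) cousin of MaskedAssign -- it selects by the sign of the
comparand with no branch at all, e.g. a branch-free absolute value:
inline double FabsNoBranch( double x )
{
	return fsel( x, x, -x );	// x >= 0 ? x : -x
}
*****/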
  2106. inline bool IsVector3LessThan(const fltx4 &v1, const fltx4 &v2 )
  2107. {
  2108. return vec_any_lt( v1, v2 );
  2109. }
  2110. inline bool IsVector3GreaterOrEqual(const fltx4 &v1, const fltx4 &v2 )
  2111. {
  2112. return !IsVector3LessThan( v1, v2 );
  2113. }
  2114. FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
  2115. {
  2116. fltx4 retVal;
  2117. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
  2118. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
  2119. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
  2120. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
  2121. return retVal;
  2122. }
  2123. // Round towards negative infinity
  2124. FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
  2125. {
  2126. fltx4 retVal;
  2127. SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
  2128. SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
  2129. SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
  2130. SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
  2131. return retVal;
  2132. }
  2133. #elif ( defined( _X360 ) )
  2134. //---------------------------------------------------------------------
  2135. // X360 implementation
  2136. //---------------------------------------------------------------------
  2137. inline bool IsVector3LessThan(const fltx4 &v1, const fltx4 &v2 )
  2138. {
  2139. return !XMVector3GreaterOrEqual( v1, v2 );
  2140. }
  2141. inline BOOL IsVector3GreaterOrEqual(const fltx4 &v1, const fltx4 &v2 )
  2142. {
  2143. return XMVector3GreaterOrEqual( v1, v2 );
  2144. }
  2145. FORCEINLINE float & FloatSIMD( fltx4 & a, int idx )
  2146. {
  2147. fltx4_union & a_union = (fltx4_union &)a;
  2148. return a_union.m128_f32[idx];
  2149. }
  2150. FORCEINLINE unsigned int & UIntSIMD( fltx4 & a, int idx )
  2151. {
  2152. fltx4_union & a_union = (fltx4_union &)a;
  2153. return a_union.m128_u32[idx];
  2154. }
  2155. FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
  2156. {
  2157. return __vaddfp( a, b );
  2158. }
  2159. FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
  2160. {
  2161. return __vsubfp( a, b );
  2162. }
  2163. FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
  2164. {
  2165. return __vmulfp( a, b );
  2166. }
  2167. FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
  2168. {
  2169. return __vmaddfp( a, b, c );
  2170. }
  2171. FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
  2172. {
  2173. return __vnmsubfp( a, b, c );
  2174. };
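/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
the fused multiply-add maps naturally onto blends, e.g. a per-lane lerp:
fltx4 LerpSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & t )
{
	return MaddSIMD( t, SubSIMD( b, a ), a );	// a + t*(b - a)
}
*****/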
  2175. FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
  2176. {
  2177. return __vmsum3fp( a, b );
  2178. }
  2179. FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
  2180. {
  2181. return __vmsum4fp( a, b );
  2182. }
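/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
__vmsum3fp/__vmsum4fp splat the dot product to all four lanes, so a 3-component
length (using SqrtSIMD, defined further down) is just:
fltx4 Length3SIMD( const fltx4 & v )
{
	return SqrtSIMD( Dot3SIMD( v, v ) );	// |v| in every lane
}
*****/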
  2183. FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
  2184. {
  2185. return XMVectorSin( radians );
  2186. }
  2187. FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  2188. {
  2189. XMVectorSinCos( &sine, &cosine, radians );
  2190. }
  2191. FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  2192. {
  2193. XMVectorSinCos( &sine, &cosine, radians );
  2194. }
  2195. FORCEINLINE void CosSIMD( fltx4 &cosine, const fltx4 &radians )
  2196. {
  2197. cosine = XMVectorCos( radians );
  2198. }
  2199. FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
  2200. {
  2201. return XMVectorASin( sine );
  2202. }
  2203. FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
  2204. {
  2205. return XMVectorACos( cs );
  2206. }
2207. // tan^-1(a/b), i.e. pass sin in as a and cos in as b
  2208. FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
  2209. {
  2210. return XMVectorATan2( a, b );
  2211. }
  2212. // DivSIMD defined further down, since it uses ReciprocalSIMD
  2213. FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
  2214. {
  2215. return __vmaxfp( a, b );
  2216. }
  2217. FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
  2218. {
  2219. return __vminfp( a, b );
  2220. }
  2221. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
  2222. {
  2223. return __vand( a, b );
  2224. }
  2225. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
  2226. {
  2227. // NOTE: a and b are swapped in the call: SSE complements the first argument, VMX the second
  2228. return __vandc( b, a );
  2229. }
  2230. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
  2231. {
  2232. return __vxor( a, b );
  2233. }
  2234. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
  2235. {
  2236. return __vor( a, b );
  2237. }
  2238. FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
  2239. {
  2240. return XMVectorNegate(a);
  2241. }
  2242. FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
  2243. {
  2244. unsigned int equalFlags = 0;
  2245. __vcmpeqfpR( a, Four_Zeros, &equalFlags );
  2246. return XMComparisonAllTrue( equalFlags );
  2247. }
  2248. FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
  2249. {
  2250. unsigned int conditionregister;
  2251. XMVectorEqualR(&conditionregister, a, XMVectorZero());
  2252. return XMComparisonAnyTrue(conditionregister);
  2253. }
  2254. FORCEINLINE bool IsAnyXYZZero( const fltx4 &a ) // are any of x,y,z zero?
  2255. {
  2256. // copy a's x component into w, in case w was zero.
  2257. fltx4 temp = __vrlimi(a, a, 1, 1);
  2258. unsigned int conditionregister;
  2259. XMVectorEqualR(&conditionregister, temp, XMVectorZero());
  2260. return XMComparisonAnyTrue(conditionregister);
  2261. }
  2262. // for branching when a.xyzw > b.xyzw
  2263. FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
  2264. {
  2265. unsigned int cr;
  2266. XMVectorGreaterR(&cr,a,b);
  2267. return XMComparisonAllTrue(cr);
  2268. }
  2269. // for branching when a.xyzw >= b.xyzw
  2270. FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  2271. {
  2272. unsigned int cr;
  2273. XMVectorGreaterOrEqualR(&cr,a,b);
  2274. return XMComparisonAllTrue(cr);
  2275. }
  2276. // for branching when a.xyzw > b.xyzw
  2277. FORCEINLINE bool IsAnyGreaterThan( const fltx4 &a, const fltx4 &b )
  2278. {
  2279. unsigned int cr;
  2280. XMVectorGreaterR(&cr,a,b);
  2281. return XMComparisonAnyTrue(cr);
  2282. }
  2283. // for branching when a.xyzw >= b.xyzw
  2284. FORCEINLINE bool IsAnyGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  2285. {
  2286. unsigned int cr;
  2287. XMVectorGreaterOrEqualR(&cr,a,b);
  2288. return XMComparisonAnyTrue(cr);
  2289. }
  2290. // For branching if all a.xyzw == b.xyzw
  2291. FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
  2292. {
  2293. unsigned int cr;
  2294. XMVectorEqualR(&cr,a,b);
  2295. return XMComparisonAllTrue(cr);
  2296. }
  2297. FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
  2298. {
  2299. // NOTE: this maps to SSE way better than it does to VMX (most code uses IsAnyNegative(), though)
  2300. int nRet = 0;
  2301. const fltx4_union & a_union = (const fltx4_union &)a;
  2302. nRet |= ( a_union.m128_u32[0] & 0x80000000 ) >> 31; // sign(x) -> bit 0
  2303. nRet |= ( a_union.m128_u32[1] & 0x80000000 ) >> 30; // sign(y) -> bit 1
  2304. nRet |= ( a_union.m128_u32[2] & 0x80000000 ) >> 29; // sign(z) -> bit 2
  2305. nRet |= ( a_union.m128_u32[3] & 0x80000000 ) >> 28; // sign(w) -> bit 3
  2306. return nRet;
  2307. }
  2308. // Squelch the w component of a vector to +0.0.
  2309. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
  2310. FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
  2311. {
  2312. return __vrlimi( a, __vzero(), 1, 0 );
  2313. }
  2314. FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  2315. {
  2316. // NOTE: this tests the top bits of each vector element using integer math
  2317. // (so it ignores NaNs - it will return true for "-NaN")
  2318. unsigned int equalFlags = 0;
  2319. fltx4 signMask = __vspltisw( -1 ); // 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF 0xFFFFFFFF (low order 5 bits of each element = 31)
  2320. signMask = __vslw( signMask, signMask ); // 0x80000000 0x80000000 0x80000000 0x80000000
  2321. __vcmpequwR( Four_Zeros, __vand( signMask, a ), &equalFlags );
  2322. return !XMComparisonAllTrue( equalFlags );
  2323. }
  2324. FORCEINLINE bool IsAnyTrue( const fltx4 & a )
  2325. {
  2326. unsigned int equalFlags = 0;
  2327. __vcmpequwR( Four_Zeros, a, &equalFlags ); // compare to zero
  2328. return XMComparisonAnyFalse( equalFlags ); // at least one element was not zero, eg was true
  2329. }
  2330. FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
  2331. {
  2332. return __vcmpeqfp( a, b );
  2333. }
  2334. FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
  2335. {
  2336. return __vcmpgtfp( a, b );
  2337. }
  2338. FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
  2339. {
  2340. return __vcmpgefp( a, b );
  2341. }
  2342. FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
  2343. {
  2344. return __vcmpgtfp( b, a );
  2345. }
  2346. FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
  2347. {
  2348. return __vcmpgefp( b, a );
  2349. }
  2350. FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
  2351. {
  2352. return XMVectorInBounds( a, b );
  2353. }
  2354. // returned[i] = ReplacementMask[i] == 0 ? OldValue : NewValue
  2355. FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  2356. {
  2357. return __vsel( OldValue, NewValue, ReplacementMask );
  2358. }
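/**** usage sketch (illustrative only; the helper below is hypothetical, not part of this header):
a typical branch-free use -- replace every negative lane with +0.0:
fltx4 ZeroNegativeLanes( const fltx4 & v )
{
	return MaskedAssign( CmpLtSIMD( v, Four_Zeros ), Four_Zeros, v );
}
*****/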
  2359. // perform an Altivec permute op. There is no corresponding SSE op, so
  2360. // this function is missing from that fork. This is deliberate, because
  2361. // permute-based algorithms simply need to be abandoned and rebuilt
2362. // in a different way for SSE.
  2363. // (see http://developer.apple.com/hardwaredrivers/ve/sse.html#Translation_Perm )
  2364. template< typename T, typename U >
  2365. FORCEINLINE T PermuteVMX( T a, T b, U swizzleMask )
  2366. {
  2367. return __vperm( a, b, swizzleMask );
  2368. }
  2369. // AKA "Broadcast", "Splat"
  2370. FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
  2371. {
  2372. // NOTE: if flValue comes from a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  2373. float * pValue = &flValue;
  2374. Assert( pValue );
  2375. Assert( ((unsigned int)pValue & 3) == 0);
  2376. return __vspltw( __lvlx( pValue, 0 ), 0 );
  2377. }
  2378. FORCEINLINE fltx4 ReplicateX4( const float *pValue ) // a,a,a,a
  2379. {
  2380. Assert( pValue );
  2381. return __vspltw( __lvlx( pValue, 0 ), 0 );
  2382. }
  2383. /// replicate a single 32 bit integer value to all 4 components of an m128
  2384. FORCEINLINE fltx4 ReplicateIX4( int nValue )
  2385. {
  2386. // NOTE: if nValue comes from a register, this causes a Load-Hit-Store stall (should not mix ints with fltx4s!)
  2387. int * pValue = &nValue;
  2388. Assert( pValue );
  2389. Assert( ((unsigned int)pValue & 3) == 0);
  2390. return __vspltw( __lvlx( pValue, 0 ), 0 );
  2391. }
  2392. // Round towards positive infinity
  2393. FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
  2394. {
  2395. return __vrfip(a);
  2396. }
  2397. // Round towards nearest integer
  2398. FORCEINLINE fltx4 RoundSIMD( const fltx4 &a )
  2399. {
  2400. return __vrfin(a);
  2401. }
  2402. // Round towards negative infinity
  2403. FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
  2404. {
  2405. return __vrfim(a);
  2406. }
  2407. FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
  2408. {
  2409. // This is emulated from rsqrt
  2410. return XMVectorSqrtEst( a );
  2411. }
  2412. FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
  2413. {
  2414. // This is emulated from rsqrt
  2415. return XMVectorSqrt( a );
  2416. }
  2417. FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
  2418. {
  2419. return __vrsqrtefp( a );
  2420. }
  2421. FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
  2422. {
  2423. // Convert zeros to epsilons
  2424. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  2425. fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  2426. return ReciprocalSqrtEstSIMD( a_safe );
  2427. }
  2428. FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
  2429. {
  2430. // This uses Newton-Raphson to improve the HW result
  2431. return XMVectorReciprocalSqrt( a );
  2432. }
  2433. FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
  2434. {
  2435. return __vrefp( a );
  2436. }
  2437. /// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
  2438. /// No error checking!
  2439. FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
  2440. {
  2441. // This uses Newton-Raphson to improve the HW result
  2442. return XMVectorReciprocal( a );
  2443. }
  2444. FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  2445. {
  2446. return MulSIMD( ReciprocalSIMD( b ), a );
  2447. }
  2448. FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // Est(a/b)
  2449. {
  2450. return MulSIMD( ReciprocalEstSIMD( b ), a );
  2451. }
  2452. /// 1/x for all 4 values.
  2453. /// 1/0 will result in a big but NOT infinite result
  2454. FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
  2455. {
  2456. // Convert zeros to epsilons
  2457. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  2458. fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  2459. return ReciprocalEstSIMD( a_safe );
  2460. }
  2461. FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
  2462. {
  2463. // Convert zeros to epsilons
  2464. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  2465. fltx4 a_safe = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  2466. return ReciprocalSIMD( a_safe );
  2467. // FIXME: This could be faster (BUT: it doesn't preserve the sign of -0.0, whereas the above does)
  2468. // fltx4 zeroMask = CmpEqSIMD( Four_Zeros, a );
  2469. // fltx4 a_safe = XMVectorSelect( a, Four_Epsilons, zeroMask );
  2470. // return ReciprocalSIMD( a_safe );
  2471. }
  2472. // CHRISG: is it worth doing integer bitfiddling for this?
  2473. // 2^x for all values (the antilog)
  2474. FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
  2475. {
  2476. return XMVectorExp(toPower);
  2477. }
  2478. // a unique Altivec concept, the "Vector 2 Raised to the Exponent Estimate Floating Point",
  2479. // which is accurate to four bits of mantissa.
  2480. FORCEINLINE fltx4 Exp2EstSIMD( const fltx4 &f )
  2481. {
  2482. return XMVectorExpEst( f );
  2483. }
  2484. // Clamps the components of a vector to a specified minimum and maximum range.
  2485. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
  2486. {
  2487. return XMVectorClamp(in, min, max);
  2488. }
  2489. FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
  2490. {
  2491. return XMLoadVector4( pSIMD );
  2492. }
  2493. // load a 3-vector (as opposed to LoadUnalignedSIMD, which loads a 4-vec).
  2494. FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
  2495. {
  2496. return XMLoadVector3( pSIMD );
  2497. }
  2498. // load a single unaligned float into the x component of a SIMD word
  2499. FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
  2500. {
  2501. return __lvlx( pFlt, 0 );
  2502. }
  2503. FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
  2504. {
  2505. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  2506. }
  2507. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  2508. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
  2509. {
  2510. fltx4 out = XMLoadVector3A(pSIMD.Base());
  2511. // squelch w
  2512. return __vrlimi( out, __vzero(), 1, 0 );
  2513. }
  2514. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  2515. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned * RESTRICT pSIMD )
  2516. {
  2517. fltx4 out = XMLoadVector3A(pSIMD);
  2518. // squelch w
  2519. return __vrlimi( out, __vzero(), 1, 0 );
  2520. }
  2521. FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
  2522. {
  2523. *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
  2524. }
  2525. FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
  2526. {
  2527. XMStoreVector4( pSIMD, a );
  2528. }
  2529. FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
  2530. {
  2531. XMStoreVector3( pSIMD, a );
  2532. }
  2533. // strongly typed -- for typechecking as we transition to SIMD
  2534. FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
  2535. {
  2536. XMStoreVector3A(pSIMD->Base(),a);
  2537. }
  2538. // Store the x,y,z components of the four FLTX4 parameters
  2539. // into the four consecutive Vectors:
  2540. // pDestination[0], pDestination[1], pDestination[2], pDestination[3]
  2541. // The Vectors are assumed to be unaligned.
  2542. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  2543. Vector * const pDestination )
  2544. {
  2545. // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as
  2546. // we arrange the data properly first.
  2547. // The vrlimi ops trash the destination param which is why we require
  2548. // pass-by-copy. I'm counting on the compiler to schedule these properly.
  2549. b = __vrlimi( b, b, 15, 1 ); // b = y1z1__x1
  2550. c = __vrlimi( c, c, 15, 2 ); // c = z2__x2y2
  2551. a = __vrlimi( a, b, 1, 0 ); // a = x0y0z0x1
  2552. b = __vrlimi( b, c, 2|1, 0 ); // b = y1z1x2y2
  2553. c = __vrlimi( c, d, 4|2|1, 3 ); // c = z2x3y3z3
  2554. float *RESTRICT pOut = pDestination->Base();
  2555. StoreUnalignedSIMD( pOut + 0, a );
  2556. StoreUnalignedSIMD( pOut + 4, b );
  2557. StoreUnalignedSIMD( pOut + 8, c );
  2558. }
  2559. // Store the x,y,z components of the four FLTX4 parameters
  2560. // into the four consecutive Vectors:
  2561. // pDestination , pDestination + 1, pDestination + 2, pDestination + 3
  2562. // The Vectors are assumed to start on an ALIGNED address, that is,
2563. // pDestination is 16-byte aligned (though obviously pDestination+1 is not).
  2564. FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  2565. Vector * const pDestination )
  2566. {
  2567. // since four Vec3s == 48 bytes, we can use full-vector stores here, so long as
  2568. // we arrange the data properly first.
  2569. // The vrlimi ops trash the destination param which is why we require
  2570. // pass-by-copy. I'm counting on the compiler to schedule these properly.
  2571. b = __vrlimi( b, b, 15, 1 ); // b = y1z1__x1
  2572. c = __vrlimi( c, c, 15, 2 ); // c = z2__x2y2
  2573. a = __vrlimi( a, b, 1, 0 ); // a = x0y0z0x1
  2574. b = __vrlimi( b, c, 2|1, 0 ); // b = y1z1x2y2
  2575. c = __vrlimi( c, d, 4|2|1, 3 ); // c = z2x3y3z3
  2576. float *RESTRICT pOut = pDestination->Base();
  2577. StoreAlignedSIMD( pOut + 0, a );
  2578. StoreAlignedSIMD( pOut + 4, b );
  2579. StoreAlignedSIMD( pOut + 8, c );
  2580. }
  2581. // Fixed-point conversion and save as SIGNED INTS.
  2582. // pDest->x = Int (vSrc.x)
  2583. // note: some architectures have means of doing
  2584. // fixed point conversion when the fix depth is
2585. // specified as an immediate, but there is no way
2586. // to guarantee an immediate as a parameter to a
2587. // function like this.
  2588. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
  2589. {
  2590. fltx4 asInt = __vctsxs( vSrc, 0 );
  2591. XMStoreVector4A(pDest->Base(), asInt);
  2592. }
  2593. FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
  2594. {
  2595. XMMATRIX xyzwMatrix = _XMMATRIX( x, y, z, w );
  2596. xyzwMatrix = XMMatrixTranspose( xyzwMatrix );
  2597. x = xyzwMatrix.r[0];
  2598. y = xyzwMatrix.r[1];
  2599. z = xyzwMatrix.r[2];
  2600. w = xyzwMatrix.r[3];
  2601. }
2602. // Return zero in the fastest way -- faster even than loading.
  2603. FORCEINLINE fltx4 LoadZeroSIMD( void )
  2604. {
  2605. return XMVectorZero();
  2606. }
  2607. // Return one in the fastest way -- faster even than loading.
  2608. FORCEINLINE fltx4 LoadOneSIMD( void )
  2609. {
  2610. return XMVectorSplatOne();
  2611. }
  2612. FORCEINLINE fltx4 SplatXSIMD( fltx4 a )
  2613. {
  2614. return XMVectorSplatX( a );
  2615. }
  2616. FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
  2617. {
  2618. return XMVectorSplatY( a );
  2619. }
  2620. FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
  2621. {
  2622. return XMVectorSplatZ( a );
  2623. }
  2624. FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
  2625. {
  2626. return XMVectorSplatW( a );
  2627. }
  2628. FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
  2629. {
  2630. fltx4 result = __vrlimi(a, x, 8, 0);
  2631. return result;
  2632. }
  2633. FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
  2634. {
  2635. fltx4 result = __vrlimi(a, y, 4, 0);
  2636. return result;
  2637. }
  2638. FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
  2639. {
  2640. fltx4 result = __vrlimi(a, z, 2, 0);
  2641. return result;
  2642. }
  2643. FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
  2644. {
  2645. fltx4 result = __vrlimi(a, w, 1, 0);
  2646. return result;
  2647. }
  2648. FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
  2649. {
  2650. static int s_nVrlimiMask[4] = { 8, 4, 2, 1 };
  2651. fltx4 val = ReplicateX4( flValue );
  2652. fltx4 result = __vrlimi(a, val, s_nVrlimiMask[nComponent], 0);
  2653. return result;
  2654. }
  2655. FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
  2656. {
  2657. fltx4 compareOne = a;
  2658. return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 1 );
  2659. }
  2660. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
  2661. {
  2662. fltx4 compareOne = a;
  2663. return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
  2664. }
  2665. FORCEINLINE fltx4 RotateRight( const fltx4 & a )
  2666. {
  2667. fltx4 compareOne = a;
  2668. return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 3 );
  2669. }
  2670. FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
  2671. {
  2672. fltx4 compareOne = a;
  2673. return __vrlimi( compareOne, a, 8 | 4 | 2 | 1, 2 );
  2674. }
2675. // shift a vector left by an arbitrary number of
2676. // bits known at compile time. The bit count is a
2677. // template parameter because it's actually used as an
2678. // immediate field in an instruction, i.e. it absolutely
2679. // must be known at compile time. nBits >= 128 simply
2680. // returns zero.
  2681. // zeroes are shifted in from the right
  2682. template < uint nBits >
  2683. FORCEINLINE fltx4 ShiftLeftByBits(const fltx4 &a)
  2684. {
  2685. // hopefully the compiler, seeing nBits as a const immediate, elides these ifs
  2686. if ( nBits >= 128 ) // WTF are you doing?!
  2687. {
  2688. return LoadZeroSIMD();
  2689. }
  2690. else if ( nBits == 0 )
  2691. {
  2692. return a;
  2693. }
  2694. else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first
  2695. {
  2696. fltx4 t = __vsldoi( a, (LoadZeroSIMD()), (nBits >> 3) ); // rotated left by octets
  2697. return ShiftLeftByBits< (nBits & 0x7) >( t );
  2698. }
  2699. else // we need to rotate by <= 7 bits
  2700. {
  2701. // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
  2702. // the splat, however, does require an immediate. Go IBM!
  2703. u32x4 shifter = u32x4 (__vspltisb( ((signed char)(nBits & 0x7)) ));
  2704. return __vsl( a, shifter );
  2705. }
  2706. }
  2707. // as above, but shift right
  2708. template < uint nBits >
  2709. FORCEINLINE fltx4 ShiftRightByBits(const fltx4 &a)
  2710. {
  2711. // hopefully the compiler, seeing nBits as a const immediate, elides these ifs
  2712. if ( nBits >= 128 ) // WTF are you doing?!
  2713. {
  2714. return LoadZeroSIMD();
  2715. }
  2716. else if ( nBits == 0 )
  2717. {
  2718. return a;
  2719. }
  2720. else if ( (nBits > 7) ) // if we have to rotate by at least one byte, do the by-octet rotation first
  2721. {
  2722. fltx4 t = __vsldoi( (LoadZeroSIMD()), a, 16 - (nBits >> 3) ); // rotated right by octets -- a rotate right of one is like a rotate left of fifteen.
  2723. return ShiftRightByBits< (nBits & 0x7) >( t );
  2724. }
  2725. else // we need to rotate by <= 7 bits
  2726. {
  2727. // on AltiVec there's no immediate shift left by bits; we need to splat the bits onto a vector and runtime shift.
  2728. // the splat, however, does require an immediate. Go IBM!
  2729. u32x4 shifter = u32x4 (__vspltisb( ((signed char)(nBits & 0x7)) ));
  2730. return __vsr( a, shifter );
  2731. }
  2732. }
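// A minimal usage sketch for the two shift templates above (the data pointer
// and the shift counts are made up for illustration):
/* as if:
	fltx4 bits = LoadAlignedSIMD( pSomePackedFlags );	// 128 bits of packed flags
	fltx4 left = ShiftLeftByBits<36>( bits );			// 4 whole octets, then 4 more bits
	fltx4 right = ShiftRightByBits<4>( bits );			// zeroes shift in from the left
*/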
  2733. // find the lowest component of a.x, a.y, a.z,
  2734. // and replicate it to the whole return value.
  2735. // ignores a.w.
  2736. // Though this is only five instructions long,
  2737. // they are all dependent, making this stall city.
  2738. // Forcing this inline should hopefully help with scheduling.
  2739. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
  2740. {
  2741. // a is [x,y,z,G] (where G is garbage)
  2742. // rotate left by one
  2743. fltx4 compareOne = a ;
  2744. compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
  2745. // compareOne is [y,z,G,G]
  2746. fltx4 retval = MinSIMD( a, compareOne );
  2747. // retVal is [min(x,y), min(y,z), G, G]
  2748. compareOne = __vrlimi( compareOne, a, 8 , 2);
  2749. // compareOne is [z, G, G, G]
  2750. retval = MinSIMD( retval, compareOne );
  2751. // retVal = [ min(min(x,y),z), G, G, G ]
  2752. // splat the x component out to the whole vector and return
  2753. return SplatXSIMD( retval );
  2754. }
  2755. // find the highest component of a.x, a.y, a.z,
  2756. // and replicate it to the whole return value.
  2757. // ignores a.w.
  2758. // Though this is only five instructions long,
  2759. // they are all dependent, making this stall city.
  2760. // Forcing this inline should hopefully help with scheduling.
  2761. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
  2762. {
  2763. // a is [x,y,z,G] (where G is garbage)
  2764. // rotate left by one
  2765. fltx4 compareOne = a ;
  2766. compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
  2767. // compareOne is [y,z,G,G]
  2768. fltx4 retval = MaxSIMD( a, compareOne );
  2769. // retVal is [max(x,y), max(y,z), G, G]
  2770. compareOne = __vrlimi( compareOne, a, 8 , 2);
  2771. // compareOne is [z, G, G, G]
  2772. retval = MaxSIMD( retval, compareOne );
  2773. // retVal = [ max(max(x,y),z), G, G, G ]
  2774. // splat the x component out to the whole vector and return
  2775. return SplatXSIMD( retval );
  2776. }
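// Usage sketch for FindLowestSIMD3 / FindHighestSIMD3 (the input vector is hypothetical):
/* as if:
	fltx4 fl4Point = LoadUnaligned3SIMD( vecPoint.Base() );	// [x,y,z,G]
	fltx4 fl4Lo = FindLowestSIMD3( fl4Point );					// [min,min,min,min]
	fltx4 fl4Hi = FindHighestSIMD3( fl4Point );					// [max,max,max,max]
	float flSmallestCoord = SubFloat( fl4Lo, 0 );
*/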
  2777. // Transform many (horizontal) points in-place by a 3x4 matrix,
  2778. // here already loaded onto three fltx4 registers.
  2779. // The points must be stored as 16-byte aligned. They are points
  2780. // and not vectors because we assume the w-component to be 1.
  2781. // To spare yourself the annoyance of loading the matrix yourself,
  2782. // use one of the overloads below.
  2783. void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow1, FLTX4 mRow2, FLTX4 mRow3);
  2784. // Transform many (horizontal) points in-place by a 3x4 matrix.
  2785. // The points must be stored as 16-byte aligned. They are points
  2786. // and not vectors because we assume the w-component to be 1.
  2787. // In this function, the matrix need not be aligned.
  2788. FORCEINLINE void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
  2789. {
  2790. return TransformManyPointsBy(pVectors, numVectors,
  2791. LoadUnalignedSIMD( pMatrix[0] ), LoadUnalignedSIMD( pMatrix[1] ), LoadUnalignedSIMD( pMatrix[2] ) );
  2792. }
  2793. // Transform many (horizontal) points in-place by a 3x4 matrix.
  2794. // The points must be stored as 16-byte aligned. They are points
  2795. // and not vectors because we assume the w-component to be 1.
  2796. // In this function, the matrix must itself be aligned on a 16-byte
  2797. // boundary.
  2798. FORCEINLINE void TransformManyPointsByA(VectorAligned * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t &pMatrix)
  2799. {
  2800. return TransformManyPointsBy(pVectors, numVectors,
  2801. LoadAlignedSIMD( pMatrix[0] ), LoadAlignedSIMD( pMatrix[1] ), LoadAlignedSIMD( pMatrix[2] ) );
  2802. }
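// Usage sketch (array and matrix names are hypothetical): transform a batch of
// aligned points in place by one bone matrix. Prefer the ...ByA variant only
// when the matrix itself is known to be 16-byte aligned.
/* as if:
	VectorAligned transformedPoints[64];				// filled in elsewhere
	matrix3x4_t boneToWorld;							// filled in elsewhere
	TransformManyPointsBy( transformedPoints, 64, boneToWorld );
*/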
  2803. // ------------------------------------
  2804. // INTEGER SIMD OPERATIONS.
  2805. // ------------------------------------
  2806. // Load 4 aligned words into a SIMD register
  2807. FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
  2808. {
  2809. return XMLoadVector4A(pSIMD);
  2810. }
  2811. // Load 4 unaligned words into a SIMD register
  2812. FORCEINLINE i32x4 LoadUnalignedIntSIMD(const void * RESTRICT pSIMD)
  2813. {
  2814. return XMLoadVector4( pSIMD );
  2815. }
  2816. // save into four words, 16-byte aligned
  2817. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  2818. {
  2819. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  2820. }
  2821. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
  2822. {
  2823. *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
  2824. }
  2825. FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  2826. {
  2827. XMStoreVector4(pSIMD, a);
  2828. }
  2829. // Load four consecutive uint16's, and turn them into floating point numbers.
  2830. // This function isn't especially fast and could be made faster if anyone is
  2831. // using it heavily.
  2832. FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
  2833. {
  2834. return XMLoadUShort4(reinterpret_cast<const XMUSHORT4 *>(pInts));
  2835. }
  2836. // a={ a.x, a.z, b.x, b.z }
  2837. // combine two fltx4s by throwing away every other field.
  2838. FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
  2839. {
  2840. return XMVectorPermute( a, b, XMVectorPermuteControl( 0, 2, 4, 6 ) );
  2841. }
  2842. // a={ a.x, b.x, c.x, d.x }
  2843. // combine 4 fltx4s by throwing away 3/4s of the fields
  2844. // TODO: make more efficient by doing this in a parallel way at the caller
  2845. // Compress4SIMD(FourVectors.. )
  2846. FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
  2847. {
  2848. fltx4 abcd = __vrlimi( a, b, 4, 3 ); // a.x, b.x, a.z, a.w
  2849. abcd = __vrlimi( abcd, c, 2, 2 ); // ax, bx, cx, aw
  2850. abcd = __vrlimi( abcd, d, 1, 1 ); // ax, bx, cx, dx
  2851. return abcd;
  2852. }
  2853. // construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
  2854. FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
  2855. {
  2856. // load the float into the low word of each vector register (this exploits the unaligned load op)
  2857. fltx4 vx = __lvlx( &x, 0 );
  2858. fltx4 vy = __lvlx( &y, 0 );
  2859. fltx4 vz = __lvlx( &z, 0 );
  2860. fltx4 vw = __lvlx( &w, 0 );
  2861. return Compress4SIMD( vx, vy, vz, vw );
  2862. }
  2863. // Take a fltx4 containing fixed-point uints and
  2864. // return them as single precision floats. No
  2865. // fixed point conversion is done.
  2866. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  2867. {
  2868. return __vcfux( vSrcA, 0 );
  2869. }
  2870. // Take a fltx4 containing fixed-point sints and
  2871. // return them as single precision floats. No
  2872. // fixed point conversion is done.
  2873. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  2874. {
  2875. return __vcfsx( vSrcA, 0 );
  2876. }
  2877. // Take a fltx4 containing fixed-point uints and
  2878. // return them as single precision floats. Each uint
  2879. // will be divided by 2^immed after conversion
  2880. // (eg, this is fixed point math).
  2881. /* as if:
  2882. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
  2883. {
  2884. return __vcfux( vSrcA, uImmed );
  2885. }
  2886. */
  2887. #define UnsignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfux( (vSrcA), (uImmed) ))
  2888. // Take a fltx4 containing fixed-point sints and
  2889. // return them as single precision floats. Each int
  2890. // will be divided by 2^immed (eg, this is fixed point
  2891. // math).
  2892. /* as if:
  2893. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA, unsigned int uImmed )
  2894. {
  2895. return __vcfsx( vSrcA, uImmed );
  2896. }
  2897. */
  2898. #define SignedFixedIntConvertToFltSIMD(vSrcA, uImmed) (__vcfsx( (vSrcA), (uImmed) ))
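// Worked example of the fixed-point macro above (values are made up): with
// uImmed == 16 each lane is divided by 2^16 after the int-to-float convert,
// so a 16.16 fixed-point lane holding 0x00018000 (98304) becomes 1.5f.
/* as if:
	fltx4 fl4Floats = SignedFixedIntConvertToFltSIMD( i32x4FixedPointValues, 16 );
*/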
  2899. // set all components of a vector to a signed immediate int number.
  2900. /* as if:
  2901. FORCEINLINE fltx4 IntSetImmediateSIMD(int toImmediate)
  2902. {
  2903. return __vspltisw( toImmediate );
  2904. }
  2905. */
  2906. #define IntSetImmediateSIMD(x) (__vspltisw(x))
  2907. /*
  2908. works on fltx4's as if they are four uints.
  2909. the first parameter contains the words to be shifted,
  2910. the second contains the amount to shift by AS INTS
  2911. for i = 0 to 3
2912. shift = vSrcB[ (i*32) : (i*32)+4 ]
2913. vReturned[ (i*32) : (i*32)+31 ] = vSrcA[ (i*32) : (i*32)+31 ] << shift
  2914. */
  2915. FORCEINLINE fltx4 IntShiftLeftWordSIMD(fltx4 vSrcA, fltx4 vSrcB)
  2916. {
  2917. return __vslw(vSrcA, vSrcB);
  2918. }
  2919. FORCEINLINE float SubFloat( const fltx4 & a, int idx )
  2920. {
  2921. // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  2922. const fltx4_union & a_union = (const fltx4_union &)a;
  2923. return a_union.m128_f32[ idx ];
  2924. }
  2925. FORCEINLINE float & SubFloat( fltx4 & a, int idx )
  2926. {
  2927. fltx4_union & a_union = (fltx4_union &)a;
  2928. return a_union.m128_f32[idx];
  2929. }
  2930. FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
  2931. {
  2932. fltx4 t = __vctuxs( a, 0 );
  2933. const fltx4_union & a_union = (const fltx4_union &)t;
  2934. return a_union.m128_u32[idx];
  2935. }
  2936. FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
  2937. {
  2938. const fltx4_union & a_union = (const fltx4_union &)a;
  2939. return a_union.m128_u32[idx];
  2940. }
  2941. FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
  2942. {
  2943. fltx4_union & a_union = (fltx4_union &)a;
  2944. return a_union.m128_u32[idx];
  2945. }
  2946. #else
  2947. //---------------------------------------------------------------------
  2948. // Intel/SSE implementation
  2949. //---------------------------------------------------------------------
  2950. FORCEINLINE void StoreAlignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
  2951. {
  2952. _mm_store_ps( pSIMD, a );
  2953. }
  2954. FORCEINLINE void StoreAlignedSIMD( short * RESTRICT pSIMD, const shortx8 & a )
  2955. {
  2956. _mm_store_si128( (shortx8 *)pSIMD, a );
  2957. }
  2958. FORCEINLINE void StoreUnalignedSIMD( float * RESTRICT pSIMD, const fltx4 & a )
  2959. {
  2960. _mm_storeu_ps( pSIMD, a );
  2961. }
  2962. FORCEINLINE void StoreUnalignedSIMD(short* RESTRICT pSIMD, const shortx8& a)
  2963. {
  2964. _mm_storeu_si128((shortx8*)pSIMD, a);
  2965. }
  2966. FORCEINLINE void StoreUnalignedFloat( float *pSingleFloat, const fltx4 & a )
  2967. {
  2968. _mm_store_ss( pSingleFloat, a );
  2969. }
  2970. FORCEINLINE fltx4 RotateLeft( const fltx4 & a );
  2971. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a );
  2972. FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
  2973. {
  2974. _mm_store_ss(pSIMD, a);
  2975. _mm_store_ss(pSIMD+1, RotateLeft(a));
  2976. _mm_store_ss(pSIMD+2, RotateLeft2(a));
  2977. }
  2978. // strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
  2979. FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
  2980. {
  2981. StoreAlignedSIMD( pSIMD->Base(),a );
  2982. }
  2983. // Store the x,y,z components of the four FLTX4 parameters
  2984. // into the four consecutive Vectors:
  2985. // pDestination[0], pDestination[1], pDestination[2], pDestination[3]
  2986. // The Vectors are assumed to be unaligned.
  2987. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  2988. Vector * const pDestination )
  2989. {
  2990. StoreUnaligned3SIMD( pDestination->Base(), a );
  2991. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  2992. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  2993. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  2994. }
  2995. // Store the x,y,z components of the four FLTX4 parameters
  2996. // into the four consecutive Vectors:
  2997. // pDestination , pDestination + 1, pDestination + 2, pDestination + 3
  2998. // The Vectors are assumed to start on an ALIGNED address, that is,
2999. // pDestination is 16-byte aligned (though obviously pDestination+1 is not).
  3000. FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  3001. Vector * const pDestination )
  3002. {
  3003. StoreUnaligned3SIMD( pDestination->Base(), a );
  3004. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  3005. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  3006. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  3007. }
  3008. FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
  3009. {
  3010. return _mm_load_ps( reinterpret_cast< const float *> ( pSIMD ) );
  3011. }
  3012. FORCEINLINE shortx8 LoadAlignedShortSIMD( const void *pSIMD )
  3013. {
  3014. return _mm_load_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) );
  3015. }
  3016. FORCEINLINE shortx8 LoadUnalignedShortSIMD( const void *pSIMD )
  3017. {
  3018. return _mm_loadu_si128( reinterpret_cast< const shortx8 *> ( pSIMD ) );
  3019. }
  3020. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
  3021. {
  3022. return _mm_and_ps( a, b );
  3023. }
  3024. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // a & ~b
  3025. {
  3026. return _mm_andnot_ps( a, b );
  3027. }
  3028. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
  3029. {
  3030. return _mm_xor_ps( a, b );
  3031. }
  3032. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
  3033. {
  3034. return _mm_or_ps( a, b );
  3035. }
  3036. // Squelch the w component of a vector to +0.0.
  3037. // Most efficient when you say a = SetWToZeroSIMD(a) (avoids a copy)
  3038. FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
  3039. {
  3040. return AndSIMD( a, LoadAlignedSIMD( g_SIMD_clear_wmask ) );
  3041. }
  3042. // for the transitional class -- load a 3-by VectorAligned and squash its w component
  3043. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
  3044. {
  3045. return SetWToZeroSIMD( LoadAlignedSIMD(pSIMD.Base()) );
  3046. }
  3047. FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
  3048. {
  3049. return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
  3050. }
  3051. FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
  3052. {
  3053. return _mm_loadu_ps( reinterpret_cast<const float *>( pSIMD ) );
  3054. }
  3055. // load a single unaligned float into the x component of a SIMD word
  3056. FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
  3057. {
  3058. return _mm_load_ss(pFlt);
  3059. }
  3060. /// replicate a single 32 bit integer value to all 4 components of an m128
  3061. FORCEINLINE fltx4 ReplicateIX4( int i )
  3062. {
3063. fltx4 value = _mm_set_ss( * ( ( float *) &i ) );
  3064. return _mm_shuffle_ps( value, value, 0);
  3065. }
  3066. FORCEINLINE fltx4 ReplicateX4( float flValue )
  3067. {
  3068. __m128 value = _mm_set_ss( flValue );
  3069. return _mm_shuffle_ps( value, value, 0 );
  3070. }
  3071. FORCEINLINE fltx4 ReplicateX4( const float * flValue )
  3072. {
  3073. __m128 value = _mm_set_ss( *flValue );
  3074. return _mm_shuffle_ps( value, value, 0 );
  3075. }
  3076. FORCEINLINE float SubFloat( const fltx4 & a, int idx )
  3077. {
  3078. // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  3079. #ifndef POSIX
  3080. return a.m128_f32[ idx ];
  3081. #else
  3082. return (reinterpret_cast<float const *>(&a))[idx];
  3083. #endif
  3084. }
  3085. FORCEINLINE float & SubFloat( fltx4 & a, int idx )
  3086. {
  3087. #ifndef POSIX
  3088. return a.m128_f32[ idx ];
  3089. #else
  3090. return (reinterpret_cast<float *>(&a))[idx];
  3091. #endif
  3092. }
  3093. FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )
  3094. {
  3095. return (uint32)SubFloat(a,idx);
  3096. }
  3097. FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
  3098. {
  3099. #ifndef POSIX
  3100. return a.m128_u32[idx];
  3101. #else
  3102. return (reinterpret_cast<uint32 const *>(&a))[idx];
  3103. #endif
  3104. }
  3105. FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
  3106. {
  3107. #ifndef POSIX
  3108. return a.m128_u32[idx];
  3109. #else
  3110. return (reinterpret_cast<uint32 *>(&a))[idx];
  3111. #endif
  3112. }
3113. // Return zero in the fastest way -- on the x360, faster even than loading.
  3114. FORCEINLINE fltx4 LoadZeroSIMD( void )
  3115. {
  3116. return Four_Zeros;
  3117. }
  3118. // Return one in the fastest way -- on the x360, faster even than loading.
  3119. FORCEINLINE fltx4 LoadOneSIMD( void )
  3120. {
  3121. return Four_Ones;
  3122. }
  3123. FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  3124. {
  3125. return OrSIMD(
  3126. AndSIMD( ReplacementMask, NewValue ),
  3127. AndNotSIMD( ReplacementMask, OldValue ) );
  3128. }
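// MaskedAssign is the SSE1 stand-in for a per-lane select: lanes where the mask
// is all-ones take NewValue, lanes where it is all-zeros keep OldValue.
// Minimal sketch (fl4Val is hypothetical): clamp negative lanes to zero.
/* as if:
	fltx4 fl4NegMask = CmpLtSIMD( fl4Val, Four_Zeros );					// ~0 where fl4Val < 0
	fltx4 fl4Clamped = MaskedAssign( fl4NegMask, Four_Zeros, fl4Val );	// 0 there, fl4Val elsewhere
*/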
  3129. // remember, the SSE numbers its words 3 2 1 0
  3130. // The way we want to specify shuffles is backwards from the default
  3131. // MM_SHUFFLE_REV is in array index order (default is reversed)
  3132. #define MM_SHUFFLE_REV(a,b,c,d) _MM_SHUFFLE(d,c,b,a)
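// Quick illustration: with MM_SHUFFLE_REV the selector reads in array order,
// naming the output lanes left to right (fl4In is hypothetical):
/* as if:
	// fl4In = [x,y,z,w]
	fltx4 fl4XYXY = _mm_shuffle_ps( fl4In, fl4In, MM_SHUFFLE_REV( 0, 1, 0, 1 ) );	// [x,y,x,y]
	// the same shuffle spelled with the stock macro would be _MM_SHUFFLE( 1, 0, 1, 0 )
*/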
  3133. FORCEINLINE fltx4 SplatXSIMD( fltx4 const & a )
  3134. {
  3135. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
  3136. }
  3137. FORCEINLINE fltx4 SplatYSIMD( fltx4 const &a )
  3138. {
  3139. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 1, 1, 1 ) );
  3140. }
  3141. FORCEINLINE fltx4 SplatZSIMD( fltx4 const &a )
  3142. {
  3143. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 2, 2 ) );
  3144. }
  3145. FORCEINLINE fltx4 SplatWSIMD( fltx4 const &a )
  3146. {
  3147. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 3, 3, 3 ) );
  3148. }
  3149. FORCEINLINE fltx4 ShuffleXXYY( const fltx4 &a )
  3150. {
  3151. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) );
  3152. }
  3153. FORCEINLINE fltx4 ShuffleXYXY( const fltx4 &a )
  3154. {
  3155. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 1, 0, 1 ) );
  3156. }
  3157. FORCEINLINE fltx4 ShuffleZZWW( const fltx4 &a )
  3158. {
  3159. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) );
  3160. }
  3161. FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
  3162. {
  3163. fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[0] ), x, a );
  3164. return result;
  3165. }
  3166. FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
  3167. {
  3168. fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[1] ), y, a );
  3169. return result;
  3170. }
  3171. FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
  3172. {
  3173. fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[2] ), z, a );
  3174. return result;
  3175. }
  3176. FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
  3177. {
  3178. fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[3] ), w, a );
  3179. return result;
  3180. }
  3181. FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, int nComponent, float flValue )
  3182. {
  3183. fltx4 val = ReplicateX4( flValue );
  3184. fltx4 result = MaskedAssign( LoadAlignedSIMD( g_SIMD_ComponentMask[nComponent] ), val, a );
  3185. return result;
  3186. }
  3187. // a b c d -> b c d a
  3188. FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
  3189. {
  3190. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 1, 2, 3, 0 ) );
  3191. }
  3192. // a b c d -> c d a b
  3193. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
  3194. {
  3195. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
  3196. }
  3197. // a b c d -> d a b c
  3198. FORCEINLINE fltx4 RotateRight( const fltx4 & a )
  3199. {
  3200. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 3, 0, 1, 2 ) );
  3201. }
  3202. // a b c d -> c d a b
  3203. FORCEINLINE fltx4 RotateRight2( const fltx4 & a )
  3204. {
  3205. return _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 3, 0, 1 ) );
  3206. }
  3207. FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b ) // a+b
  3208. {
  3209. return _mm_add_ps( a, b );
  3210. }
  3211. FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
  3212. {
  3213. return _mm_sub_ps( a, b );
  3214. };
  3215. FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
  3216. {
  3217. return _mm_mul_ps( a, b );
  3218. };
  3219. FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  3220. {
  3221. return _mm_div_ps( a, b );
  3222. };
  3223. fltx4 ReciprocalEstSIMD( const fltx4 & a );
  3224. FORCEINLINE fltx4 DivEstSIMD( const fltx4 & a, const fltx4 & b ) // Est(a/b)
  3225. {
  3226. return MulSIMD( ReciprocalEstSIMD( b ), a );
  3227. };
  3228. FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
  3229. {
  3230. return AddSIMD( MulSIMD(a,b), c );
  3231. }
  3232. FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
  3233. {
  3234. return SubSIMD( c, MulSIMD(a,b) );
  3235. };
  3236. FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
  3237. {
  3238. fltx4 m = MulSIMD( a, b );
  3239. return AddSIMD( AddSIMD( SplatXSIMD(m), SplatYSIMD(m) ), SplatZSIMD(m) );
  3240. }
  3241. FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
  3242. {
3243. // 4 instructions, serial, order of addition varies so individual elements may differ in the LSB on some CPUs
  3244. fltx4 fl4Product = MulSIMD( a, b );
  3245. fltx4 fl4YXWZ = _mm_shuffle_ps( fl4Product, fl4Product, MM_SHUFFLE_REV(1,0,3,2) );
  3246. fltx4 fl4UUVV = AddSIMD( fl4Product, fl4YXWZ ); // U = X+Y; V = Z+W
  3247. fltx4 fl4VVUU = RotateLeft2( fl4UUVV );
  3248. return AddSIMD( fl4UUVV, fl4VVUU );
  3249. }
  3250. //TODO: implement as four-way Taylor series (see xbox implementation)
  3251. FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
  3252. {
  3253. fltx4 result;
  3254. SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
  3255. SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
  3256. SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
  3257. SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
  3258. return result;
  3259. }
  3260. FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  3261. {
  3262. // FIXME: Make a fast SSE version
  3263. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  3264. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  3265. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  3266. }
  3267. FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians ) // a*b + c
  3268. {
  3269. // FIXME: Make a fast SSE version
  3270. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  3271. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  3272. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  3273. SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
  3274. }
  3275. //TODO: implement as four-way Taylor series (see xbox implementation)
  3276. FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
  3277. {
  3278. // FIXME: Make a fast SSE version
  3279. fltx4 result;
  3280. SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
  3281. SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
  3282. SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
  3283. SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
  3284. return result;
  3285. }
  3286. FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
  3287. {
  3288. fltx4 result;
  3289. SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
  3290. SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
  3291. SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
  3292. SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
  3293. return result;
  3294. }
3295. // tan^-1(a/b), i.e., pass sin in as a and cos in as b
  3296. FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
  3297. {
  3298. fltx4 result;
  3299. SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  3300. SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  3301. SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  3302. SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  3303. return result;
  3304. }
  3305. FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
  3306. {
  3307. return SubSIMD(LoadZeroSIMD(),a);
  3308. }
  3309. FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
  3310. {
  3311. return _mm_movemask_ps( a );
  3312. }
  3313. FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  3314. {
  3315. return (0 != TestSignSIMD( a ));
  3316. }
  3317. FORCEINLINE bool IsAnyTrue( const fltx4 & a )
  3318. {
  3319. return (0 != TestSignSIMD( a ));
  3320. }
  3321. FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
  3322. {
  3323. return _mm_cmpeq_ps( a, b );
  3324. }
  3325. FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
  3326. {
  3327. return _mm_cmpgt_ps( a, b );
  3328. }
  3329. FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
  3330. {
  3331. return _mm_cmpge_ps( a, b );
  3332. }
  3333. FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
  3334. {
  3335. return _mm_cmplt_ps( a, b );
  3336. }
  3337. FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
  3338. {
  3339. return _mm_cmple_ps( a, b );
  3340. }
  3341. // for branching when a.xyzw > b.xyzw
  3342. FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
  3343. {
  3344. return TestSignSIMD( CmpLeSIMD( a, b ) ) == 0;
  3345. }
  3346. // for branching when a.xyzw >= b.xyzw
  3347. FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  3348. {
  3349. return TestSignSIMD( CmpLtSIMD( a, b ) ) == 0;
  3350. }
  3351. // For branching if all a.xyzw == b.xyzw
  3352. FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
  3353. {
  3354. return TestSignSIMD( CmpEqSIMD( a, b ) ) == 0xf;
  3355. }
  3356. FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
  3357. {
  3358. return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) );
  3359. }
  3360. FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
  3361. {
  3362. return _mm_min_ps( a, b );
  3363. }
  3364. FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
  3365. {
  3366. return _mm_max_ps( a, b );
  3367. }
  3368. // SSE lacks rounding operations.
  3369. // Really.
  3370. // You can emulate them by setting the rounding mode for the
  3371. // whole processor and then converting to int, and then back again.
  3372. // But every time you set the rounding mode, you clear out the
  3373. // entire pipeline. So, I can't do them per operation. You
  3374. // have to do it once, before the loop that would call these.
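// A minimal sketch of that rounding-mode dance, using the standard xmmintrin
// control-register macros (set once around the loop, then restore):
/* as if:
	unsigned int nOldMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE( _MM_ROUND_DOWN );
	// ... loop that relies on round-toward-negative-infinity conversions ...
	_MM_SET_ROUNDING_MODE( nOldMode );
*/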
  3375. // Round towards positive infinity
  3376. FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
  3377. {
  3378. fltx4 retVal;
  3379. SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
  3380. SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
  3381. SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
  3382. SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
  3383. return retVal;
  3384. }
  3385. fltx4 AbsSIMD( const fltx4 & x ); // To make it more coherent with the whole API (the whole SIMD API is postfixed with SIMD except a couple of methods. Well...)
  3386. fltx4 fabs( const fltx4 & x );
  3387. // Round towards negative infinity
  3388. // This is the implementation that was here before; it assumes
  3389. // you are in round-to-floor mode, which I guess is usually the
  3390. // case for us vis-a-vis SSE. It's totally unnecessary on
  3391. // VMX, which has a native floor op.
  3392. FORCEINLINE fltx4 FloorSIMD( const fltx4 &val )
  3393. {
  3394. fltx4 fl4Abs = fabs( val );
  3395. fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
  3396. ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
  3397. return XorSIMD( ival, XorSIMD( val, fl4Abs ) ); // restore sign bits
  3398. }
  3399. FORCEINLINE bool IsAnyZeros( const fltx4 & a ) // any floats are zero?
  3400. {
  3401. return TestSignSIMD( CmpEqSIMD( a, Four_Zeros ) ) != 0;
  3402. }
  3403. inline bool IsAllZeros( const fltx4 & var )
  3404. {
  3405. return TestSignSIMD( CmpEqSIMD( var, Four_Zeros ) ) == 0xF;
  3406. }
  3407. FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
  3408. {
  3409. return _mm_sqrt_ps( a );
  3410. }
  3411. FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
  3412. {
  3413. return _mm_sqrt_ps( a );
  3414. }
  3415. FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
  3416. {
  3417. return _mm_rsqrt_ps( a );
  3418. }
  3419. FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
  3420. {
  3421. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  3422. fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  3423. ret = ReciprocalSqrtEstSIMD( ret );
  3424. return ret;
  3425. }
  3426. /// uses newton iteration for higher precision results than ReciprocalSqrtEstSIMD
  3427. FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
  3428. {
  3429. fltx4 guess = ReciprocalSqrtEstSIMD( a );
  3430. // newton iteration for 1/sqrt(a) : y(n+1) = 1/2 (y(n)*(3-a*y(n)^2));
  3431. guess = MulSIMD( guess, SubSIMD( Four_Threes, MulSIMD( a, MulSIMD( guess, guess ))));
  3432. guess = MulSIMD( Four_PointFives, guess);
  3433. return guess;
  3434. }
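// Rough numeric check of the iteration above: for a = 4 the hardware estimate is
// about 0.49987; one step gives 0.5 * 0.49987 * (3 - 4 * 0.49987^2) ~= 0.4999999,
// so a single Newton step recovers nearly full single precision.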
  3435. FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
  3436. {
  3437. return _mm_rcp_ps( a );
  3438. }
  3439. /// 1/x for all 4 values, more or less
  3440. /// 1/0 will result in a big but NOT infinite result
  3441. FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
  3442. {
  3443. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  3444. fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  3445. ret = ReciprocalEstSIMD( ret );
  3446. return ret;
  3447. }
  3448. /// 1/x for all 4 values. uses reciprocal approximation instruction plus newton iteration.
  3449. /// No error checking!
  3450. FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
  3451. {
  3452. fltx4 ret = ReciprocalEstSIMD( a );
  3453. // newton iteration is: Y(n+1) = 2*Y(n)-a*Y(n)^2
  3454. ret = SubSIMD( AddSIMD( ret, ret ), MulSIMD( a, MulSIMD( ret, ret ) ) );
  3455. return ret;
  3456. }
  3457. /// 1/x for all 4 values.
  3458. /// 1/0 will result in a big but NOT infinite result
  3459. FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
  3460. {
  3461. fltx4 zero_mask = CmpEqSIMD( a, Four_Zeros );
  3462. fltx4 ret = OrSIMD( a, AndSIMD( Four_Epsilons, zero_mask ) );
  3463. ret = ReciprocalSIMD( ret );
  3464. return ret;
  3465. }
  3466. // CHRISG: is it worth doing integer bitfiddling for this?
  3467. // 2^x for all values (the antilog)
  3468. FORCEINLINE fltx4 ExpSIMD( const fltx4 &toPower )
  3469. {
  3470. fltx4 retval;
  3471. SubFloat( retval, 0 ) = powf( 2, SubFloat(toPower, 0) );
  3472. SubFloat( retval, 1 ) = powf( 2, SubFloat(toPower, 1) );
  3473. SubFloat( retval, 2 ) = powf( 2, SubFloat(toPower, 2) );
  3474. SubFloat( retval, 3 ) = powf( 2, SubFloat(toPower, 3) );
  3475. return retval;
  3476. }
  3477. // Clamps the components of a vector to a specified minimum and maximum range.
  3478. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
  3479. {
  3480. return MaxSIMD( min, MinSIMD( max, in ) );
  3481. }
  3482. FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w)
  3483. {
  3484. _MM_TRANSPOSE4_PS( x, y, z, w );
  3485. }
  3486. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
  3487. {
  3488. // a is [x,y,z,G] (where G is garbage)
  3489. // rotate left by one
  3490. fltx4 compareOne = RotateLeft( a );
  3491. // compareOne is [y,z,G,x]
  3492. fltx4 retval = MinSIMD( a, compareOne );
  3493. // retVal is [min(x,y), ... ]
  3494. compareOne = RotateLeft2( a );
  3495. // compareOne is [z, G, x, y]
  3496. retval = MinSIMD( retval, compareOne );
  3497. // retVal = [ min(min(x,y),z)..]
  3498. // splat the x component out to the whole vector and return
  3499. return SplatXSIMD( retval );
  3500. }
  3501. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
  3502. {
  3503. // a is [x,y,z,G] (where G is garbage)
  3504. // rotate left by one
  3505. fltx4 compareOne = RotateLeft( a );
  3506. // compareOne is [y,z,G,x]
  3507. fltx4 retval = MaxSIMD( a, compareOne );
  3508. // retVal is [max(x,y), ... ]
  3509. compareOne = RotateLeft2( a );
  3510. // compareOne is [z, G, x, y]
  3511. retval = MaxSIMD( retval, compareOne );
  3512. // retVal = [ max(max(x,y),z)..]
  3513. // splat the x component out to the whole vector and return
  3514. return SplatXSIMD( retval );
  3515. }
  3516. inline bool IsVector3LessThan(const fltx4 &v1, const fltx4 &v2 )
  3517. {
  3518. bi32x4 isOut = CmpLtSIMD( v1, v2 );
  3519. return IsAnyNegative( isOut );
  3520. }
  3521. inline bool IsVector4LessThan(const fltx4 &v1, const fltx4 &v2 )
  3522. {
  3523. bi32x4 isOut = CmpLtSIMD( v1, v2 );
  3524. return IsAnyNegative( isOut );
  3525. }
  3526. // ------------------------------------
  3527. // INTEGER SIMD OPERATIONS.
  3528. // ------------------------------------
  3529. #if 0 /* pc does not have these ops */
  3530. // splat all components of a vector to a signed immediate int number.
  3531. FORCEINLINE fltx4 IntSetImmediateSIMD(int to)
  3532. {
  3533. //CHRISG: SSE2 has this, but not SSE1. What to do?
  3534. fltx4 retval;
  3535. SubInt( retval, 0 ) = to;
  3536. SubInt( retval, 1 ) = to;
  3537. SubInt( retval, 2 ) = to;
  3538. SubInt( retval, 3 ) = to;
  3539. return retval;
  3540. }
  3541. #endif
  3542. // Load 4 aligned words into a SIMD register
  3543. FORCEINLINE i32x4 LoadAlignedIntSIMD( const void * RESTRICT pSIMD)
  3544. {
  3545. return _mm_load_ps( reinterpret_cast<const float *>(pSIMD) );
  3546. }
  3547. // Load 4 unaligned words into a SIMD register
  3548. FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
  3549. {
  3550. return _mm_loadu_ps( reinterpret_cast<const float *>(pSIMD) );
  3551. }
  3552. // save into four words, 16-byte aligned
  3553. FORCEINLINE void StoreAlignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
  3554. {
  3555. _mm_store_ps( reinterpret_cast<float *>(pSIMD), a );
  3556. }
  3557. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
  3558. {
  3559. _mm_store_ps( reinterpret_cast<float *>(pSIMD.Base()), a );
  3560. }
  3561. FORCEINLINE void StoreUnalignedIntSIMD( int32 * RESTRICT pSIMD, const fltx4 & a )
  3562. {
  3563. _mm_storeu_ps( reinterpret_cast<float *>(pSIMD), a );
  3564. }
  3565. // a={ a.x, a.z, b.x, b.z }
  3566. // combine two fltx4s by throwing away every other field.
  3567. FORCEINLINE fltx4 CompressSIMD( fltx4 const & a, fltx4 const &b )
  3568. {
  3569. return _mm_shuffle_ps( a, b, MM_SHUFFLE_REV( 0, 2, 0, 2 ) );
  3570. }
  3571. // Load four consecutive uint16's, and turn them into floating point numbers.
  3572. // This function isn't especially fast and could be made faster if anyone is
  3573. // using it heavily.
  3574. FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
  3575. {
  3576. #ifdef POSIX
  3577. fltx4 retval;
  3578. SubFloat( retval, 0 ) = pInts[0];
  3579. SubFloat( retval, 1 ) = pInts[1];
  3580. SubFloat( retval, 2 ) = pInts[2];
  3581. SubFloat( retval, 3 ) = pInts[3];
  3582. return retval;
  3583. #else
  3584. __m128i inA = _mm_loadl_epi64( (__m128i const*) pInts); // Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result.
  3585. inA = _mm_unpacklo_epi16( inA, _mm_setzero_si128() ); // unpack unsigned 16's to signed 32's
  3586. return _mm_cvtepi32_ps(inA);
  3587. #endif
  3588. }
  3589. // a={ a.x, b.x, c.x, d.x }
  3590. // combine 4 fltx4s by throwing away 3/4s of the fields
  3591. FORCEINLINE fltx4 Compress4SIMD( fltx4 const a, fltx4 const &b, fltx4 const &c, fltx4 const &d )
  3592. {
  3593. fltx4 aacc = _mm_shuffle_ps( a, c, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
  3594. fltx4 bbdd = _mm_shuffle_ps( b, d, MM_SHUFFLE_REV( 0, 0, 0, 0 ) );
  3595. return MaskedAssign( LoadAlignedSIMD( g_SIMD_EveryOtherMask ), bbdd, aacc );
  3596. }
3597. // outa={ a.x, a.x, a.y, a.y }, outb={ a.z, a.z, a.w, a.w }
  3598. FORCEINLINE void ExpandSIMD( fltx4 const &a, fltx4 &fl4OutA, fltx4 &fl4OutB )
  3599. {
  3600. fl4OutA = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 0, 0, 1, 1 ) );
  3601. fl4OutB = _mm_shuffle_ps( a, a, MM_SHUFFLE_REV( 2, 2, 3, 3 ) );
  3602. }
  3603. // construct a fltx4 from four different scalars, which are assumed to be neither aligned nor contiguous
  3604. FORCEINLINE fltx4 LoadGatherSIMD( const float &x, const float &y, const float &z, const float &w )
  3605. {
  3606. // load the float into the low word of each vector register (this exploits the unaligned load op)
  3607. fltx4 vx = _mm_load_ss( &x );
  3608. fltx4 vy = _mm_load_ss( &y );
  3609. fltx4 vz = _mm_load_ss( &z );
  3610. fltx4 vw = _mm_load_ss( &w );
  3611. return Compress4SIMD( vx, vy, vz, vw );
  3612. }
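// Usage sketch (the structures and field name are hypothetical): gather one
// float from each of four unrelated objects into a single SIMD register.
/* as if:
	fltx4 fl4Radii = LoadGatherSIMD( a.m_flRadius, b.m_flRadius, c.m_flRadius, d.m_flRadius );
*/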
  3613. // CHRISG: the conversion functions all seem to operate on m64's only...
  3614. // how do we make them work here?
  3615. // Take a fltx4 containing fixed-point uints and
  3616. // return them as single precision floats. No
  3617. // fixed point conversion is done.
  3618. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
  3619. {
  3620. fltx4 retval;
3621. SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
3622. SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
3623. SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
3624. SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
  3625. return retval;
  3626. }
  3627. // Take a fltx4 containing fixed-point sints and
  3628. // return them as single precision floats. No
  3629. // fixed point conversion is done.
  3630. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  3631. {
  3632. return _mm_cvtepi32_ps( (const __m128i &)vSrcA );
  3633. }
  3634. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const shortx8 &vSrcA )
  3635. {
  3636. return _mm_cvtepi32_ps( vSrcA );
  3637. }
  3638. #if 0
  3639. // Take a fltx4 containing fixed-point sints and
  3640. // return them as single precision floats. No
  3641. // fixed point conversion is done.
  3642. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  3643. {
  3644. fltx4 retval;
  3645. SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[0]));
  3646. SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[1]));
  3647. SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[2]));
  3648. SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<const int32 *>(&vSrcA)[3]));
  3649. return retval;
  3650. }
  3651. #endif
  3652. /*
  3653. works on fltx4's as if they are four uints.
  3654. the first parameter contains the words to be shifted,
  3655. the second contains the amount to shift by AS INTS
  3656. for i = 0 to 3
3657. shift = vSrcB[ (i*32) : (i*32)+4 ]
3658. vReturned[ (i*32) : (i*32)+31 ] = vSrcA[ (i*32) : (i*32)+31 ] << shift
  3659. */
  3660. FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
  3661. {
  3662. i32x4 retval;
  3663. SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
  3664. SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
  3665. SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
  3666. SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
  3667. return retval;
  3668. }
  3669. // Fixed-point conversion and save as SIGNED INTS.
  3670. // pDest->x = Int (vSrc.x)
  3671. // note: some architectures have means of doing
  3672. // fixed point conversion when the fix depth is
  3673. // specified as an immediate.. but there is no way
3674. // to guarantee an immediate as a parameter to a function
3675. // like this.
  3676. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
  3677. {
  3678. #if defined(_MSC_VER) && _MSC_VER >= 1900 && defined(COMPILER_MSVC64)
  3679. (*pDest)[0] = (int)SubFloat(vSrc, 0);
  3680. (*pDest)[1] = (int)SubFloat(vSrc, 1);
  3681. (*pDest)[2] = (int)SubFloat(vSrc, 2);
  3682. (*pDest)[3] = (int)SubFloat(vSrc, 3);
  3683. #else
  3684. __m64 bottom = _mm_cvttps_pi32( vSrc );
  3685. __m64 top = _mm_cvttps_pi32( _mm_movehl_ps(vSrc,vSrc) );
  3686. *reinterpret_cast<__m64 *>(&(*pDest)[0]) = bottom;
  3687. *reinterpret_cast<__m64 *>(&(*pDest)[2]) = top;
  3688. _mm_empty();
  3689. #endif
  3690. }
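// Usage sketch (the four source floats are hypothetical): truncate four floats
// to signed ints and write them to an aligned intx4 in one call.
/* as if:
	intx4 nOut;
	fltx4 fl4Src = LoadGatherSIMD( flA, flB, flC, flD );
	ConvertStoreAsIntsSIMD( &nOut, fl4Src );		// nOut[i] == (int)lane i
*/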
  3691. #endif
  3692. // a={a.y, a.z, a.w, b.x } b={b.y, b.z, b.w, b.x }
  3693. FORCEINLINE void RotateLeftDoubleSIMD( fltx4 &a, fltx4 &b )
  3694. {
  3695. a = SetWSIMD( RotateLeft( a ), SplatXSIMD( b ) );
  3696. b = RotateLeft( b );
  3697. }
  3698. // // Some convenience operator overloads, which are just aliasing the functions above.
  3699. // Unneccessary on 360, as you already have them from xboxmath.h (same for PS3 PPU and SPU)
  3700. #if !defined(PLATFORM_PPC) && !defined( POSIX ) && !defined(SPU)
  3701. #if 1 // TODO: verify generation of non-bad code.
  3702. // Componentwise add
  3703. FORCEINLINE fltx4 operator+( FLTX4 a, FLTX4 b )
  3704. {
  3705. return AddSIMD( a, b );
  3706. }
  3707. // Componentwise subtract
  3708. FORCEINLINE fltx4 operator-( FLTX4 a, FLTX4 b )
  3709. {
  3710. return SubSIMD( a, b );
  3711. }
  3712. // Componentwise multiply
  3713. FORCEINLINE fltx4 operator*( FLTX4 a, FLTX4 b )
  3714. {
  3715. return MulSIMD( a, b );
  3716. }
  3717. // No divide. You need to think carefully about whether you want a reciprocal
  3718. // or a reciprocal estimate.
  3719. // bitwise and
  3720. FORCEINLINE fltx4 operator&( FLTX4 a, FLTX4 b )
  3721. {
  3722. return AndSIMD( a ,b );
  3723. }
  3724. // bitwise or
  3725. FORCEINLINE fltx4 operator|( FLTX4 a, FLTX4 b )
  3726. {
  3727. return OrSIMD( a, b );
  3728. }
  3729. // bitwise xor
  3730. FORCEINLINE fltx4 operator^( FLTX4 a, FLTX4 b )
  3731. {
  3732. return XorSIMD( a, b );
  3733. }
  3734. // unary negate
  3735. FORCEINLINE fltx4 operator-( FLTX4 a )
  3736. {
  3737. return NegSIMD( a );
  3738. }
3739. #endif // 1
  3740. #endif
  3741. #if defined(_X360) || defined(_PS3)
  3742. FORCEINLINE fltx4 VectorMergeHighSIMD( fltx4 fl4SrcA, fltx4 fl4SrcB )
  3743. {
  3744. #if defined( _X360 )
  3745. return __vmrghw( fl4SrcA, fl4SrcB );
  3746. #else
  3747. return vec_mergeh( fl4SrcA, fl4SrcB );
  3748. #endif
  3749. }
  3750. FORCEINLINE fltx4 VectorMergeLowSIMD( fltx4 fl4SrcA, fltx4 fl4SrcB )
  3751. {
  3752. #if defined( _X360 )
  3753. return __vmrglw( fl4SrcA, fl4SrcB );
  3754. #else
  3755. return vec_mergel( fl4SrcA, fl4SrcB );
  3756. #endif
  3757. }
  3758. #endif
  3759. #ifndef SPU
3760. // fourplanes_t, Frustum_t are not supported on SPU
  3761. // It would make sense to support FourVectors on SPU at some point.
  3762. struct ALIGN16 fourplanes_t
  3763. {
  3764. fltx4 nX;
  3765. fltx4 nY;
  3766. fltx4 nZ;
  3767. fltx4 dist;
  3768. bi32x4 xSign;
  3769. bi32x4 ySign;
  3770. bi32x4 zSign;
  3771. fltx4 nXAbs;
  3772. fltx4 nYAbs;
  3773. fltx4 nZAbs;
  3774. void ComputeSignbits();
  3775. // fast SIMD loads
  3776. void Set4Planes( const VPlane *pPlanes );
  3777. void Set2Planes( const VPlane *pPlanes );
  3778. void Get4Planes( VPlane *pPlanesOut ) const;
  3779. void Get2Planes( VPlane *pPlanesOut ) const;
  3780. // not-SIMD, much slower
  3781. void GetPlane( int index, Vector *pNormal, float *pDist ) const;
  3782. void SetPlane( int index, const Vector &vecNormal, float planeDist );
  3783. };
  3784. class ALIGN16 Frustum_t
  3785. {
  3786. public:
  3787. Frustum_t();
  3788. void SetPlane( int i, const Vector &vecNormal, float dist );
  3789. void GetPlane( int i, Vector *pNormalOut, float *pDistOut ) const;
  3790. void SetPlanes( const VPlane *pPlanes );
  3791. void GetPlanes( VPlane *pPlanesOut ) const;
  3792. // returns false if the box is within the frustum, true if it is outside
  3793. bool CullBox( const Vector &mins, const Vector &maxs ) const;
  3794. bool CullBoxCenterExtents( const Vector &center, const Vector &extents ) const;
  3795. bool CullBox( const fltx4 &fl4Mins, const fltx4 &fl4Maxs ) const;
  3796. bool CullBoxCenterExtents( const fltx4 &fl4Center, const fltx4 &fl4Extents ) const;
  3797. // Return true if frustum contains this bounding volume, false if any corner is outside
  3798. bool Contains( const Vector &mins, const Vector &maxs ) const;
  3799. // Return true if this frustum intersects the frustum, false if it is outside
  3800. bool Intersects( Frustum_t &otherFrustum ) const;
  3801. // Return true if this bounding volume intersects the frustum, false if it is outside
  3802. bool Intersects( const Vector &mins, const Vector &maxs ) const;
  3803. bool IntersectsCenterExtents( const Vector &center, const Vector &extents ) const;
  3804. bool Intersects( const fltx4 &fl4Mins, const fltx4 &fl4Maxs ) const;
  3805. bool IntersectsCenterExtents( const fltx4 &fl4Center, const fltx4 &fl4Extents ) const;
  3806. void CreatePerspectiveFrustum( const Vector& origin, const Vector &forward,
  3807. const Vector &right, const Vector &up, float flZNear, float flZFar,
  3808. float flFovX, float flAspect );
  3809. void CreatePerspectiveFrustumFLU( const Vector& vOrigin, const Vector &vForward,
  3810. const Vector &vLeft, const Vector &vUp, float flZNear, float flZFar,
  3811. float flFovX, float flAspect );
  3812. // Version that accepts angles instead of vectors
  3813. void CreatePerspectiveFrustum( const Vector& origin, const QAngle &angles, float flZNear,
  3814. float flZFar, float flFovX, float flAspectRatio );
  3815. // Generate a frustum based on orthographic parameters
  3816. void CreateOrthoFrustum( const Vector &origin, const Vector &forward, const Vector &right, const Vector &up,
  3817. float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar );
  3818. void CreateOrthoFrustumFLU( const Vector &vOrigin, const Vector &vForward, const Vector &vLeft, const Vector &vUp,
  3819. float flLeft, float flRight, float flBottom, float flTop, float flZNear, float flZFar );
  3820. // The points returned correspond to the corners of the frustum faces
  3821. // Points 0 to 3 correspond to the near face
  3822. // Points 4 to 7 correspond to the far face
  3823. // Returns points in a face in this order:
  3824. // 2--3
  3825. // | |
  3826. // 0--1
  3827. // Returns false if a corner couldn't be generated for some reason.
  3828. bool GetCorners( Vector *pPoints ) const;
  3829. fourplanes_t planes[2];
  3830. };
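// Usage sketch (all vectors and constants are hypothetical): build a perspective
// frustum and reject an axis-aligned box against it.
/* as if:
	Frustum_t viewFrustum;
	viewFrustum.CreatePerspectiveFrustum( vecEyeOrigin, vecForward, vecRight, vecUp,
		7.0f, 5000.0f, 90.0f, 16.0f / 9.0f );
	if ( viewFrustum.CullBox( vecBoxMins, vecBoxMaxs ) )
		return;		// box is completely outside the view volume
*/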
  3831. #endif
  3832. class FourQuaternions;
  3833. /// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
  3834. /// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
  3835. class ALIGN16 FourVectors
  3836. {
  3837. public:
  3838. fltx4 x, y, z;
  3839. FourVectors(void)
  3840. {
  3841. }
  3842. FourVectors( FourVectors const &src )
  3843. {
  3844. x=src.x;
  3845. y=src.y;
  3846. z=src.z;
  3847. }
  3848. explicit FORCEINLINE FourVectors( float a )
  3849. {
  3850. fltx4 aReplicated = ReplicateX4( a );
  3851. x = y = z = aReplicated;
  3852. }
  3853. FORCEINLINE void Init( void )
  3854. {
  3855. x = Four_Zeros;
  3856. y = Four_Zeros;
  3857. z = Four_Zeros;
  3858. }
  3859. FORCEINLINE void Init( float flX, float flY, float flZ )
  3860. {
  3861. x = ReplicateX4( flX );
  3862. y = ReplicateX4( flY );
  3863. z = ReplicateX4( flZ );
  3864. }
  3865. FORCEINLINE FourVectors( float flX, float flY, float flZ )
  3866. {
  3867. Init( flX, flY, flZ );
  3868. }
  3869. FORCEINLINE void Init( fltx4 const &fl4X, fltx4 const &fl4Y, fltx4 const &fl4Z )
  3870. {
  3871. x = fl4X;
  3872. y = fl4Y;
  3873. z = fl4Z;
  3874. }
  3875. FORCEINLINE FourVectors( fltx4 const &fl4X, fltx4 const &fl4Y, fltx4 const &fl4Z )
  3876. {
  3877. Init( fl4X, fl4Y, fl4Z );
  3878. }
  3879. /// construct a FourVectors from 4 separate Vectors
  3880. FORCEINLINE FourVectors(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
  3881. {
  3882. LoadAndSwizzle(a,b,c,d);
  3883. }
  3884. /// construct a FourVectors from 4 separate Vectors
  3885. FORCEINLINE FourVectors(VectorAligned const &a, VectorAligned const &b, VectorAligned const &c, VectorAligned const &d)
  3886. {
  3887. LoadAndSwizzleAligned(a,b,c,d);
  3888. }
  3889. // construct from twelve floats; really only useful for static const constructors.
  3890. // input arrays must be aligned, and in the fourvectors' native format
  3891. // (eg in xxxx,yyyy,zzzz form)
  3892. // each pointer should be to an aligned array of four floats
  3893. FORCEINLINE FourVectors( const float *xs , const float *ys, const float *zs ) :
  3894. x( LoadAlignedSIMD(xs) ), y( LoadAlignedSIMD(ys) ), z( LoadAlignedSIMD(zs) )
  3895. {};
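// Usage sketch for the constructor above: the arrays are already in the
// transposed xxxx/yyyy/zzzz layout, so no swizzle happens at load time.
/* as if:
	static const ALIGN16 float s_flXs[4] = { 0, 1, 0, 0 };
	static const ALIGN16 float s_flYs[4] = { 0, 0, 1, 0 };
	static const ALIGN16 float s_flZs[4] = { 1, 0, 0, 1 };
	FourVectors fv4Axes( s_flXs, s_flYs, s_flZs );	// four direction vectors at once
*/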
  3896. FORCEINLINE void DuplicateVector(Vector const &v) //< set all 4 vectors to the same vector value
  3897. {
  3898. x=ReplicateX4(v.x);
  3899. y=ReplicateX4(v.y);
  3900. z=ReplicateX4(v.z);
  3901. }
  3902. FORCEINLINE fltx4 const & operator[](int idx) const
  3903. {
  3904. return *((&x)+idx);
  3905. }
  3906. FORCEINLINE fltx4 & operator[](int idx)
  3907. {
  3908. return *((&x)+idx);
  3909. }
  3910. FORCEINLINE void operator+=(FourVectors const &b) //< add 4 vectors to another 4 vectors
  3911. {
  3912. x=AddSIMD(x,b.x);
  3913. y=AddSIMD(y,b.y);
  3914. z=AddSIMD(z,b.z);
  3915. }
  3916. FORCEINLINE void operator-=(FourVectors const &b) //< subtract 4 vectors from another 4
  3917. {
  3918. x=SubSIMD(x,b.x);
  3919. y=SubSIMD(y,b.y);
  3920. z=SubSIMD(z,b.z);
  3921. }
  3922. FORCEINLINE void operator*=(FourVectors const &b) //< scale all four vectors per component scale
  3923. {
  3924. x=MulSIMD(x,b.x);
  3925. y=MulSIMD(y,b.y);
  3926. z=MulSIMD(z,b.z);
  3927. }
  3928. FORCEINLINE void operator*=(const fltx4 & scale) //< scale
  3929. {
  3930. x=MulSIMD(x,scale);
  3931. y=MulSIMD(y,scale);
  3932. z=MulSIMD(z,scale);
  3933. }
  3934. FORCEINLINE void operator*=(float scale) //< uniformly scale all 4 vectors
  3935. {
  3936. fltx4 scalepacked = ReplicateX4(scale);
  3937. *this *= scalepacked;
  3938. }
  3939. FORCEINLINE fltx4 operator*(FourVectors const &b) const //< 4 dot products
  3940. {
  3941. fltx4 dot=MulSIMD(x,b.x);
  3942. dot=MaddSIMD(y,b.y,dot);
  3943. dot=MaddSIMD(z,b.z,dot);
  3944. return dot;
  3945. }
  3946. FORCEINLINE fltx4 operator*(Vector const &b) const //< dot product all 4 vectors with 1 vector
  3947. {
  3948. fltx4 dot=MulSIMD(x,ReplicateX4(b.x));
  3949. dot=MaddSIMD(y,ReplicateX4(b.y), dot);
  3950. dot=MaddSIMD(z,ReplicateX4(b.z), dot);
  3951. return dot;
  3952. }
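// Usage sketch (inputs are hypothetical): the two overloads above evaluate four
// dot products in parallel, e.g. four surface normals against one light direction.
/* as if:
	FourVectors fv4Normals( vecN0, vecN1, vecN2, vecN3 );	// transposes on load
	fltx4 fl4NDotL = fv4Normals * vecLightDir;				// [n0.L, n1.L, n2.L, n3.L]
*/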
  3953. FORCEINLINE FourVectors operator*(float b) const //< scale
  3954. {
  3955. fltx4 scalepacked = ReplicateX4( b );
  3956. FourVectors res;
  3957. res.x = MulSIMD( x, scalepacked );
  3958. res.y = MulSIMD( y, scalepacked );
  3959. res.z = MulSIMD( z, scalepacked );
  3960. return res;
  3961. }
  3962. FORCEINLINE FourVectors operator*( FLTX4 fl4Scale ) const //< scale
  3963. {
  3964. FourVectors res;
  3965. res.x = MulSIMD( x, fl4Scale );
  3966. res.y = MulSIMD( y, fl4Scale );
  3967. res.z = MulSIMD( z, fl4Scale );
  3968. return res;
  3969. }
  3970. FORCEINLINE void VProduct( FourVectors const &b ) //< component by component mul
  3971. {
  3972. x=MulSIMD(x,b.x);
  3973. y=MulSIMD(y,b.y);
  3974. z=MulSIMD(z,b.z);
  3975. }
  3976. FORCEINLINE void MakeReciprocal(void) //< (x,y,z)=(1/x,1/y,1/z)
  3977. {
  3978. x=ReciprocalSIMD(x);
  3979. y=ReciprocalSIMD(y);
  3980. z=ReciprocalSIMD(z);
  3981. }
  3982. FORCEINLINE void MakeReciprocalSaturate(void) //< (x,y,z)=(1/x,1/y,1/z), 1/0=1.0e23
  3983. {
  3984. x=ReciprocalSaturateSIMD(x);
  3985. y=ReciprocalSaturateSIMD(y);
  3986. z=ReciprocalSaturateSIMD(z);
  3987. }
  3988. // Assume the given matrix is a rotation, and rotate these vectors by it.
  3989. // If you have a long list of FourVectors structures that you all want
  3990. // to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
  3991. inline void RotateBy(const matrix3x4_t& matrix);
  3992. /***** removed because one of the SWIG permutations doesn't include ssequaternion.h, causing a missing symbol on this function:
  3993. // rotate these vectors ( in place ) by the corresponding quaternions:
  3994. inline void RotateBy( const FourQuaternions &quats );
  3995. ******/
  3996. /// You can use this to rotate a long array of FourVectors all by the same
  3997. /// matrix. The first parameter is the head of the array. The second is the
  3998. /// number of vectors to rotate. The third is the matrix.
  3999. static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
  4000. static void RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut );
  4001. /// Assume the vectors are points, and transform them in place by the matrix.
  4002. inline void TransformBy(const matrix3x4_t& matrix);
  4003. /// You can use this to Transform a long array of FourVectors all by the same
  4004. /// matrix. The first parameter is the head of the array. The second is the
4005. /// number of vectors to transform. The third is the matrix. The fourth is the
  4006. /// output buffer, which must not overlap the pVectors buffer. This is not
  4007. /// an in-place transformation.
  4008. static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut );
  4009. /// You can use this to Transform a long array of FourVectors all by the same
  4010. /// matrix. The first parameter is the head of the array. The second is the
4011. /// number of vectors to transform. The third is the matrix.
4012. /// This overload transforms the vectors in place; there is no separate
4013. /// output buffer.
  4014. static void TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix );
  4015. static void CalcClosestPointOnLineSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vClosest, fltx4 *outT = 0);
  4016. static fltx4 CalcClosestPointToLineTSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vDir );
  4017. // X(),Y(),Z() - get at the desired component of the i'th (0..3) vector.
  4018. FORCEINLINE const float & X(int idx) const
  4019. {
  4020. // NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
  4021. return SubFloat( (fltx4 &)x, idx );
  4022. }
  4023. FORCEINLINE const float & Y(int idx) const
  4024. {
  4025. return SubFloat( (fltx4 &)y, idx );
  4026. }
  4027. FORCEINLINE const float & Z(int idx) const
  4028. {
  4029. return SubFloat( (fltx4 &)z, idx );
  4030. }
  4031. FORCEINLINE float & X(int idx)
  4032. {
  4033. return SubFloat( x, idx );
  4034. }
  4035. FORCEINLINE float & Y(int idx)
  4036. {
  4037. return SubFloat( y, idx );
  4038. }
  4039. FORCEINLINE float & Z(int idx)
  4040. {
  4041. return SubFloat( z, idx );
  4042. }
  4043. FORCEINLINE Vector Vec(int idx) const //< unpack one of the vectors
  4044. {
  4045. return Vector( X(idx), Y(idx), Z(idx) );
  4046. }
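// Lane-accessor sketch (illustrative; "f" is a hypothetical FourVectors):
//
//   f.X( 2 ) = 5.0f;           // write the x component of the 3rd packed vector
//   float fY1 = f.Y( 1 );      // read the y component of the 2nd packed vector
//   Vector v1 = f.Vec( 1 );    // unpack the whole 2nd vector
//
// Per the load-hit-store note above, prefer doing the bulk math in SIMD and unpacking only at the end.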
  4047. FORCEINLINE void operator=( FourVectors const &src )
  4048. {
  4049. x=src.x;
  4050. y=src.y;
  4051. z=src.z;
  4052. }
  4053. /// LoadAndSwizzle - load 4 Vectors into a FourVectors, performing transpose op
  4054. FORCEINLINE void LoadAndSwizzle(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
  4055. {
  4056. // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
  4057. // use an unfolded implementation here
  4058. #if defined( _X360 ) || defined(_PS3)
  4059. fltx4 tx = LoadUnalignedSIMD( &a.x );
  4060. fltx4 ty = LoadUnalignedSIMD( &b.x );
  4061. fltx4 tz = LoadUnalignedSIMD( &c.x );
  4062. fltx4 tw = LoadUnalignedSIMD( &d.x );
  4063. fltx4 r0 = VectorMergeHighSIMD(tx, tz);
  4064. fltx4 r1 = VectorMergeHighSIMD(ty, tw);
  4065. fltx4 r2 = VectorMergeLowSIMD(tx, tz);
  4066. fltx4 r3 = VectorMergeLowSIMD(ty, tw);
  4067. x = VectorMergeHighSIMD(r0, r1);
  4068. y = VectorMergeLowSIMD(r0, r1);
  4069. z = VectorMergeHighSIMD(r2, r3);
  4070. #else
  4071. x = LoadUnalignedSIMD( &( a.x ));
  4072. y = LoadUnalignedSIMD( &( b.x ));
  4073. z = LoadUnalignedSIMD( &( c.x ));
  4074. fltx4 w = LoadUnalignedSIMD( &( d.x ));
  4075. // now, matrix is:
  4076. // x y z ?
  4077. // x y z ?
  4078. // x y z ?
  4079. // x y z ?
  4080. TransposeSIMD(x, y, z, w);
  4081. #endif
  4082. }
  4083. FORCEINLINE void LoadAndSwizzle(Vector const &a)
  4084. {
  4085. LoadAndSwizzle( a, a, a, a );
  4086. }
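// Sketch (illustrative; "pts" is a hypothetical array of four Vectors):
//
//   FourVectors v;
//   v.LoadAndSwizzle( pts[0], pts[1], pts[2], pts[3] ); // v.x = (pts0.x, pts1.x, pts2.x, pts3.x), etc.
//   v.LoadAndSwizzle( pts[0] );                         // broadcast one point into all four lanes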
  4087. // Broadcasts a, b, c, and d into the four vectors
4088. // This is only performant if the floats are ALREADY IN MEMORY
4089. // and not in registers -- e.g.,
4090. // .Load( fltArray[0], fltArray[1], fltArray[2], fltArray[3] ) is okay,
4091. // .Load( fltArray[0] * 0.5f, fltArray[1] * 0.5f, fltArray[2] * 0.5f, fltArray[3] * 0.5f ) is not (the products live in registers and must be spilled to memory first).
  4092. FORCEINLINE void Load( const float &a, const float &b, const float &c, const float &d )
  4093. {
  4094. #if defined( _X360 ) || defined( _PS3 )
  4095. fltx4 temp[4];
  4096. temp[0] = LoadUnalignedFloatSIMD( &a );
  4097. temp[1] = LoadUnalignedFloatSIMD( &b );
  4098. temp[2] = LoadUnalignedFloatSIMD( &c );
  4099. temp[3] = LoadUnalignedFloatSIMD( &d );
  4100. y = VectorMergeHighSIMD( temp[0], temp[2] ); // ac__
  4101. z = VectorMergeHighSIMD( temp[1], temp[3] ); // bd__
  4102. x = VectorMergeHighSIMD( y, z ); // abcd
  4103. y = x;
  4104. z = x;
  4105. #else
  4106. ALIGN16 float temp[4];
  4107. temp[0] = a; temp[1] = b; temp[2] = c; temp[3] = d;
  4108. fltx4 v = LoadAlignedSIMD( temp );
  4109. x = v;
  4110. y = v;
  4111. z = v;
  4112. #endif
  4113. }
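// Sketch of the "already in memory" case described above (illustrative; "weights"
// is a hypothetical float array):
//
//   ALIGN16 float weights[4] = { 0.1f, 0.2f, 0.3f, 0.4f };
//   FourVectors w;
//   w.Load( weights[0], weights[1], weights[2], weights[3] ); // x, y and z each become (0.1, 0.2, 0.3, 0.4)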
  4114. // transform four horizontal vectors into the internal vertical ones
  4115. FORCEINLINE void LoadAndSwizzle( FLTX4 a, FLTX4 b, FLTX4 c, FLTX4 d )
  4116. {
  4117. #if defined( _X360 ) || defined( _PS3 )
  4118. fltx4 tx = a;
  4119. fltx4 ty = b;
  4120. fltx4 tz = c;
  4121. fltx4 tw = d;
  4122. fltx4 r0 = VectorMergeHighSIMD(tx, tz);
  4123. fltx4 r1 = VectorMergeHighSIMD(ty, tw);
  4124. fltx4 r2 = VectorMergeLowSIMD(tx, tz);
  4125. fltx4 r3 = VectorMergeLowSIMD(ty, tw);
  4126. x = VectorMergeHighSIMD(r0, r1);
  4127. y = VectorMergeLowSIMD(r0, r1);
  4128. z = VectorMergeHighSIMD(r2, r3);
  4129. #else
  4130. x = a;
  4131. y = b;
  4132. z = c;
  4133. fltx4 w = d;
  4134. // now, matrix is:
  4135. // x y z ?
  4136. // x y z ?
  4137. // x y z ?
  4138. // x y z ?
  4139. TransposeSIMD(x, y, z, w);
  4140. #endif
  4141. }
  4142. /// LoadAndSwizzleAligned - load 4 Vectors into a FourVectors, performing transpose op.
4143. /// all four source pointers must be aligned on a 128-bit (16-byte) boundary
  4144. FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
  4145. {
  4146. #if defined( _X360 ) || defined( _PS3 )
  4147. fltx4 tx = LoadAlignedSIMD(a);
  4148. fltx4 ty = LoadAlignedSIMD(b);
  4149. fltx4 tz = LoadAlignedSIMD(c);
  4150. fltx4 tw = LoadAlignedSIMD(d);
  4151. fltx4 r0 = VectorMergeHighSIMD(tx, tz);
  4152. fltx4 r1 = VectorMergeHighSIMD(ty, tw);
  4153. fltx4 r2 = VectorMergeLowSIMD(tx, tz);
  4154. fltx4 r3 = VectorMergeLowSIMD(ty, tw);
  4155. x = VectorMergeHighSIMD(r0, r1);
  4156. y = VectorMergeLowSIMD(r0, r1);
  4157. z = VectorMergeHighSIMD(r2, r3);
  4158. #else
  4159. x = LoadAlignedSIMD( a );
  4160. y = LoadAlignedSIMD( b );
  4161. z = LoadAlignedSIMD( c );
  4162. fltx4 w = LoadAlignedSIMD( d );
  4163. // now, matrix is:
  4164. // x y z ?
  4165. // x y z ?
  4166. // x y z ?
  4167. // x y z ?
  4168. TransposeSIMD( x, y, z, w );
  4169. #endif
  4170. }
  4171. FORCEINLINE void LoadAndSwizzleAligned(Vector const &a, Vector const &b, Vector const &c, Vector const &d)
  4172. {
  4173. LoadAndSwizzleAligned( &a.x, &b.x, &c.x, &d.x );
  4174. }
  4175. /// Unpack a FourVectors back into four horizontal fltx4s.
  4176. /// Since the FourVectors doesn't store a w row, you can optionally
  4177. /// specify your own; otherwise it will be 0.
  4178. /// This function ABSOLUTELY MUST be inlined or the reference parameters will
  4179. /// induce a severe load-hit-store.
  4180. FORCEINLINE void TransposeOnto( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3, FLTX4 w = Four_Zeros ) const
  4181. {
  4182. // TransposeSIMD has large sub-expressions that the compiler can't eliminate on x360
  4183. // use an unfolded implementation here
  4184. #if defined( _X360 ) || defined(_PS3)
  4185. fltx4 r0 = VectorMergeHighSIMD(x, z);
  4186. fltx4 r1 = VectorMergeHighSIMD(y, w);
  4187. fltx4 r2 = VectorMergeLowSIMD(x, z);
  4188. fltx4 r3 = VectorMergeLowSIMD(y, w);
  4189. out0 = VectorMergeHighSIMD(r0, r1);
  4190. out1 = VectorMergeLowSIMD(r0, r1);
  4191. out2 = VectorMergeHighSIMD(r2, r3);
  4192. out3 = VectorMergeLowSIMD(r2, r3);
  4193. #else
  4194. out0 = x;
  4195. out1 = y;
  4196. out2 = z;
  4197. out3 = w;
  4198. TransposeSIMD(out0, out1, out2, out3);
  4199. #endif
  4200. }
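// Sketch (illustrative; "f" is a hypothetical FourVectors already holding data):
//
//   fltx4 row0, row1, row2, row3;
//   f.TransposeOnto( row0, row1, row2, row3, Four_Ones ); // row i = (x_i, y_i, z_i, 1)
//
// Passing Four_Ones for the w row is convenient when the rows will feed a 4x4 transform.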
  4201. #if !defined(__SPU__)
  4202. /// Store a FourVectors into four NON-CONTIGUOUS Vector*'s.
  4203. FORCEINLINE void StoreUnalignedVector3SIMD( Vector * RESTRICT out0, Vector * RESTRICT out1, Vector * RESTRICT out2, Vector * RESTRICT out3 ) const;
  4204. #endif
4205. /// Store a FourVectors into four NON-CONTIGUOUS VectorAligned structures.
  4206. FORCEINLINE void StoreAlignedVectorSIMD( VectorAligned * RESTRICT out0, VectorAligned * RESTRICT out1, VectorAligned * RESTRICT out2, VectorAligned * RESTRICT out3 ) const;
  4207. #if !defined(__SPU__)
  4208. /// Store a FourVectors into four CONSECUTIVE Vectors in memory,
  4209. /// where the first vector IS NOT aligned on a 16-byte boundary.
  4210. FORCEINLINE void StoreUnalignedContigVector3SIMD( Vector * RESTRICT pDestination )
  4211. {
  4212. fltx4 a,b,c,d;
  4213. TransposeOnto(a,b,c,d);
  4214. StoreFourUnalignedVector3SIMD( a, b, c, d, pDestination );
  4215. }
  4216. #endif
  4217. /// Store a FourVectors into four CONSECUTIVE Vectors in memory,
  4218. /// where the first vector IS aligned on a 16-byte boundary.
  4219. /// (since four Vector3s = 48 bytes, groups of four can be said
4220. /// to be 16-byte aligned, though obviously the 2nd, 3rd, and 4th
  4221. /// vectors in the group individually are not)
  4222. #if !defined(__SPU__)
  4223. FORCEINLINE void StoreAlignedContigVector3SIMD( Vector * RESTRICT pDestination )
  4224. {
  4225. fltx4 a,b,c,d;
  4226. TransposeOnto(a,b,c,d);
  4227. StoreFourAlignedVector3SIMD( a, b, c, d, pDestination );
  4228. }
  4229. /// Store a FourVectors into four CONSECUTIVE VectorAligneds in memory
  4230. FORCEINLINE void StoreAlignedContigVectorASIMD( VectorAligned * RESTRICT pDestination )
  4231. {
  4232. StoreAlignedVectorSIMD( pDestination, pDestination + 1, pDestination + 2, pDestination + 3 );
  4233. }
  4234. #endif
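// Store sketch (illustrative; "f" and "p0".."p3" are hypothetical caller-side names):
//
//   Vector p0, p1, p2, p3;
//   f.StoreUnalignedVector3SIMD( &p0, &p1, &p2, &p3 ); // scatter the four packed vectors
//
// The *Contig* variants above instead write four consecutive Vectors starting at one pointer.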
  4235. /// return the squared length of all 4 vectors, the same name as used on Vector
  4236. FORCEINLINE fltx4 LengthSqr( void ) const
  4237. {
  4238. const FourVectors &a = *this;
  4239. return a * a;
  4240. }
  4241. /// return the squared length of all 4 vectors
  4242. FORCEINLINE fltx4 length2(void) const
  4243. {
  4244. return (*this)*(*this);
  4245. }
  4246. /// return the approximate length of all 4 vectors. uses the sqrt approximation instruction
  4247. FORCEINLINE fltx4 length(void) const
  4248. {
  4249. return SqrtEstSIMD(length2());
  4250. }
4251. /// full precision square root. The upper/lower case naming is an artifact - the lower case one should be renamed to reflect its lower accuracy; the mixed case one was added for compatibility with Vector.
  4252. FORCEINLINE fltx4 Length( void ) const
  4253. {
  4254. return SqrtSIMD( length2() );
  4255. }
  4256. /// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
  4257. FORCEINLINE void VectorNormalizeFast(void)
  4258. {
  4259. fltx4 mag_sq=(*this)*(*this); // length^2
  4260. (*this) *= ReciprocalSqrtEstSIMD(mag_sq); // *(1.0/sqrt(length^2))
  4261. }
  4262. /// normalize all 4 vectors in place.
  4263. FORCEINLINE void VectorNormalize(void)
  4264. {
  4265. fltx4 mag_sq=(*this)*(*this); // length^2
  4266. (*this) *= ReciprocalSqrtSIMD(mag_sq); // *(1.0/sqrt(length^2))
  4267. }
  4268. FORCEINLINE fltx4 DistToSqr( FourVectors const &pnt )
  4269. {
  4270. fltx4 fl4dX = SubSIMD( pnt.x, x );
  4271. fltx4 fl4dY = SubSIMD( pnt.y, y );
  4272. fltx4 fl4dZ = SubSIMD( pnt.z, z );
  4273. return AddSIMD( MulSIMD( fl4dX, fl4dX), AddSIMD( MulSIMD( fl4dY, fl4dY ), MulSIMD( fl4dZ, fl4dZ ) ) );
  4274. }
  4275. FORCEINLINE fltx4 TValueOfClosestPointOnLine( FourVectors const &p0, FourVectors const &p1 ) const
  4276. {
  4277. FourVectors lineDelta = p1;
  4278. lineDelta -= p0;
4279. fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta ); // 1 / |p1 - p0|^2
  4280. FourVectors v4OurPnt = *this;
  4281. v4OurPnt -= p0;
  4282. return MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
  4283. }
  4284. FORCEINLINE fltx4 DistSqrToLineSegment( FourVectors const &p0, FourVectors const &p1 ) const
  4285. {
  4286. FourVectors lineDelta = p1;
  4287. FourVectors v4OurPnt = *this;
  4288. v4OurPnt -= p0;
  4289. lineDelta -= p0;
  4290. fltx4 OOlineDirDotlineDir = ReciprocalSIMD( lineDelta * lineDelta );
  4291. fltx4 fl4T = MulSIMD( OOlineDirDotlineDir, v4OurPnt * lineDelta );
  4292. fl4T = MinSIMD( fl4T, Four_Ones );
  4293. fl4T = MaxSIMD( fl4T, Four_Zeros );
  4294. lineDelta *= fl4T;
  4295. return v4OurPnt.DistToSqr( lineDelta );
  4296. }
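// Sketch (illustrative; "pts" packs four query points, "segA"/"segB" broadcast the two
// segment endpoints into all four lanes):
//
//   fltx4 dSqr = pts.DistSqrToLineSegment( segA, segB ); // lane i = squared distance of point i to the segment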
  4297. FORCEINLINE FourVectors Normalized()const
  4298. {
  4299. fltx4 fl4LengthInv = ReciprocalSqrtSIMD( LengthSqr() );
  4300. FourVectors out;
  4301. out.x = x * fl4LengthInv;
  4302. out.y = y * fl4LengthInv;
  4303. out.z = z * fl4LengthInv;
  4304. return out;
  4305. }
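// NormalizedSafeX/Y/Z: like Normalized(), but any lane whose squared length is below
// Four_Epsilons falls back to the corresponding unit axis - (1,0,0), (0,1,0) or (0,0,1) -
// instead of producing INF/NaN components.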
  4306. FORCEINLINE FourVectors NormalizedSafeX() const
  4307. {
  4308. fltx4 f4LenSqr = LengthSqr();
  4309. fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
  4310. fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
  4311. FourVectors out;
  4312. out.x = MaskedAssign( isBigEnough, x * fl4LengthInv, Four_Ones );
  4313. out.y = AndSIMD( y * fl4LengthInv, isBigEnough );
  4314. out.z = AndSIMD( z * fl4LengthInv, isBigEnough );
  4315. return out;
  4316. }
  4317. FORCEINLINE FourVectors NormalizedSafeY() const
  4318. {
  4319. fltx4 f4LenSqr = LengthSqr();
  4320. fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
  4321. fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
  4322. FourVectors out;
  4323. out.x = AndSIMD( x * fl4LengthInv, isBigEnough );
  4324. out.y = MaskedAssign( isBigEnough, y * fl4LengthInv, Four_Ones );
  4325. out.z = AndSIMD( z * fl4LengthInv, isBigEnough );
  4326. return out;
  4327. }
  4328. FORCEINLINE FourVectors NormalizedSafeZ() const
  4329. {
  4330. fltx4 f4LenSqr = LengthSqr();
  4331. fltx4 isBigEnough = CmpGeSIMD( f4LenSqr, Four_Epsilons );
  4332. fltx4 fl4LengthInv = ReciprocalSqrtSIMD( f4LenSqr );
  4333. FourVectors out;
  4334. out.x = AndSIMD( x * fl4LengthInv, isBigEnough );
  4335. out.y = AndSIMD( y * fl4LengthInv, isBigEnough );
  4336. out.z = MaskedAssign( isBigEnough, z * fl4LengthInv, Four_Ones );
  4337. return out;
  4338. }
  4339. };
  4340. inline FourVectors CrossProduct( const FourVectors& a, const FourVectors& b )
  4341. {
  4342. return FourVectors( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x );
  4343. }
  4344. inline fltx4 DotProduct( const FourVectors& a, const FourVectors& b )
  4345. {
  4346. return a.x * b.x + a.y * b.y + a.z * b.z;
  4347. }
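// Sketch: four dot products at once (illustrative; "a" and "b" are hypothetical
// FourVectors, each packing four 3-vectors):
//
//   fltx4 d  = DotProduct( a, b );   // lane i holds Dot( a_i, b_i )
//   float d0 = SubFloat( d, 0 );     // extract lane 0 if a scalar is needed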
  4348. inline FourVectors operator * ( fltx4 left, const FourVectors &right )
  4349. {
  4350. return right * left;
  4351. }
  4352. //
  4353. inline FourVectors Mul( const FourVectors &a, const fltx4 &b )
  4354. {
  4355. FourVectors ret;
  4356. ret.x = MulSIMD( a.x, b );
  4357. ret.y = MulSIMD( a.y, b );
  4358. ret.z = MulSIMD( a.z, b );
  4359. return ret;
  4360. }
  4361. inline FourVectors Mul( const FourVectors &a, const FourVectors &b )
  4362. {
  4363. FourVectors ret;
  4364. ret.x = MulSIMD( a.x, b.x );
  4365. ret.y = MulSIMD( a.y, b.y );
  4366. ret.z = MulSIMD( a.z, b.z );
  4367. return ret;
  4368. }
  4369. inline FourVectors Madd( const FourVectors &a, const fltx4 &b, const FourVectors &c ) // a*b + c
  4370. {
  4371. FourVectors ret;
  4372. ret.x = MaddSIMD( a.x, b, c.x );
  4373. ret.y = MaddSIMD( a.y, b, c.y );
  4374. ret.z = MaddSIMD( a.z, b, c.z );
  4375. return ret;
  4376. }
  4377. /// form 4 cross products
  4378. inline FourVectors operator ^(const FourVectors &a, const FourVectors &b)
  4379. {
  4380. FourVectors ret;
  4381. ret.x=SubSIMD(MulSIMD(a.y,b.z),MulSIMD(a.z,b.y));
  4382. ret.y=SubSIMD(MulSIMD(a.z,b.x),MulSIMD(a.x,b.z));
  4383. ret.z=SubSIMD(MulSIMD(a.x,b.y),MulSIMD(a.y,b.x));
  4384. return ret;
  4385. }
  4386. inline FourVectors operator-(const FourVectors &a, const FourVectors &b)
  4387. {
  4388. FourVectors ret;
  4389. ret.x=SubSIMD(a.x,b.x);
  4390. ret.y=SubSIMD(a.y,b.y);
  4391. ret.z=SubSIMD(a.z,b.z);
  4392. return ret;
  4393. }
  4394. inline FourVectors operator+( const FourVectors &a, const FourVectors &b )
  4395. {
  4396. FourVectors ret;
  4397. ret.x = AddSIMD( a.x, b.x );
  4398. ret.y = AddSIMD( a.y, b.y );
  4399. ret.z = AddSIMD( a.z, b.z );
  4400. return ret;
  4401. }
4402. /// componentwise MAX operator
  4403. inline FourVectors maximum(const FourVectors &a, const FourVectors &b)
  4404. {
  4405. FourVectors ret;
  4406. ret.x=MaxSIMD(a.x,b.x);
  4407. ret.y=MaxSIMD(a.y,b.y);
  4408. ret.z=MaxSIMD(a.z,b.z);
  4409. return ret;
  4410. }
4411. /// componentwise MIN operator
  4412. inline FourVectors minimum(const FourVectors &a, const FourVectors &b)
  4413. {
  4414. FourVectors ret;
  4415. ret.x=MinSIMD(a.x,b.x);
  4416. ret.y=MinSIMD(a.y,b.y);
  4417. ret.z=MinSIMD(a.z,b.z);
  4418. return ret;
  4419. }
  4420. FORCEINLINE FourVectors RotateLeft( const FourVectors &src )
  4421. {
  4422. FourVectors ret;
  4423. ret.x = RotateLeft( src.x );
  4424. ret.y = RotateLeft( src.y );
  4425. ret.z = RotateLeft( src.z );
  4426. return ret;
  4427. }
  4428. FORCEINLINE FourVectors RotateRight( const FourVectors &src )
  4429. {
  4430. FourVectors ret;
  4431. ret.x = RotateRight( src.x );
  4432. ret.y = RotateRight( src.y );
  4433. ret.z = RotateRight( src.z );
  4434. return ret;
  4435. }
  4436. FORCEINLINE FourVectors MaskedAssign( const bi32x4 & ReplacementMask, const FourVectors & NewValue, const FourVectors & OldValue )
  4437. {
  4438. FourVectors ret;
  4439. ret.x = MaskedAssign( ReplacementMask, NewValue.x, OldValue.x );
  4440. ret.y = MaskedAssign( ReplacementMask, NewValue.y, OldValue.y );
  4441. ret.z = MaskedAssign( ReplacementMask, NewValue.z, OldValue.z );
  4442. return ret;
  4443. }
  4444. /// calculate reflection vector. incident and normal dir assumed normalized
  4445. FORCEINLINE FourVectors VectorReflect( const FourVectors &incident, const FourVectors &normal )
  4446. {
  4447. FourVectors ret = incident;
  4448. fltx4 iDotNx2 = incident * normal;
  4449. iDotNx2 = AddSIMD( iDotNx2, iDotNx2 );
  4450. FourVectors nPart = normal;
  4451. nPart *= iDotNx2;
  4452. ret -= nPart; // i-2(n*i)n
  4453. return ret;
  4454. }
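// Reflection sketch (illustrative; "rayDirs" and "surfNormals" are hypothetical
// FourVectors, both normalized as required above):
//
//   FourVectors bounced = VectorReflect( rayDirs, surfNormals ); // lane i = i_i - 2*(n_i . i_i)*n_i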
4455. /// calculate slide vector. removes the component of a vector which is parallel to a normal vector, leaving only the parts perpendicular to the normal.
  4456. FORCEINLINE FourVectors VectorSlide( const FourVectors &incident, const FourVectors &normal )
  4457. {
  4458. FourVectors ret = incident;
  4459. fltx4 iDotN = incident * normal;
  4460. FourVectors nPart = normal;
  4461. nPart *= iDotN;
  4462. ret -= nPart; // i-(n*i)n
  4463. return ret;
  4464. }
  4465. /// normalize all 4 vectors in place. not mega-accurate (uses reciprocal approximation instruction)
  4466. FORCEINLINE FourVectors VectorNormalizeFast( const FourVectors &src )
  4467. {
  4468. fltx4 mag_sq = ReciprocalSqrtEstSIMD( src * src ); // *(1.0/sqrt(length^2))
  4469. FourVectors result;
  4470. result.x = MulSIMD( src.x, mag_sq );
  4471. result.y = MulSIMD( src.y, mag_sq );
  4472. result.z = MulSIMD( src.z, mag_sq );
  4473. return result;
  4474. }
  4475. #if !defined(__SPU__)
  4476. /// Store a FourVectors into four NON-CONTIGUOUS Vector*'s.
  4477. FORCEINLINE void FourVectors::StoreUnalignedVector3SIMD( Vector * RESTRICT out0, Vector * RESTRICT out1, Vector * RESTRICT out2, Vector * RESTRICT out3 ) const
  4478. {
  4479. #ifdef _X360
  4480. fltx4 x0,x1,x2,x3, y0,y1,y2,y3, z0,z1,z2,z3;
  4481. x0 = SplatXSIMD(x); // all x0x0x0x0
  4482. x1 = SplatYSIMD(x);
  4483. x2 = SplatZSIMD(x);
  4484. x3 = SplatWSIMD(x);
  4485. y0 = SplatXSIMD(y);
  4486. y1 = SplatYSIMD(y);
  4487. y2 = SplatZSIMD(y);
  4488. y3 = SplatWSIMD(y);
  4489. z0 = SplatXSIMD(z);
  4490. z1 = SplatYSIMD(z);
  4491. z2 = SplatZSIMD(z);
  4492. z3 = SplatWSIMD(z);
  4493. __stvewx( x0, out0->Base(), 0 ); // store X word
  4494. __stvewx( y0, out0->Base(), 4 ); // store Y word
  4495. __stvewx( z0, out0->Base(), 8 ); // store Z word
  4496. __stvewx( x1, out1->Base(), 0 ); // store X word
  4497. __stvewx( y1, out1->Base(), 4 ); // store Y word
  4498. __stvewx( z1, out1->Base(), 8 ); // store Z word
  4499. __stvewx( x2, out2->Base(), 0 ); // store X word
  4500. __stvewx( y2, out2->Base(), 4 ); // store Y word
  4501. __stvewx( z2, out2->Base(), 8 ); // store Z word
  4502. __stvewx( x3, out3->Base(), 0 ); // store X word
  4503. __stvewx( y3, out3->Base(), 4 ); // store Y word
  4504. __stvewx( z3, out3->Base(), 8 ); // store Z word
  4505. #else
  4506. fltx4 a,b,c,d;
  4507. TransposeOnto(a,b,c,d);
  4508. StoreUnaligned3SIMD( out0->Base(), a );
  4509. StoreUnaligned3SIMD( out1->Base(), b );
  4510. StoreUnaligned3SIMD( out2->Base(), c );
  4511. StoreUnaligned3SIMD( out3->Base(), d );
  4512. #endif
  4513. }
4514. /// Store a FourVectors into four NON-CONTIGUOUS VectorAligned structures.
  4515. FORCEINLINE void FourVectors::StoreAlignedVectorSIMD( VectorAligned * RESTRICT out0, VectorAligned * RESTRICT out1, VectorAligned * RESTRICT out2, VectorAligned * RESTRICT out3 ) const
  4516. {
  4517. fltx4 a,b,c,d;
  4518. TransposeOnto(a,b,c,d);
  4519. StoreAligned3SIMD( out0, a );
  4520. StoreAligned3SIMD( out1, b );
  4521. StoreAligned3SIMD( out2, c );
  4522. StoreAligned3SIMD( out3, d );
  4523. }
  4524. #endif
  4525. #if !defined(__SPU__)
  4526. // Assume the given matrix is a rotation, and rotate these vectors by it.
4527. // If you have a long list of FourVectors structures that you want
  4528. // to rotate by the same matrix, use FourVectors::RotateManyBy() instead.
  4529. void FourVectors::RotateBy(const matrix3x4_t& matrix)
  4530. {
  4531. // Splat out each of the entries in the matrix to a fltx4. Do this
  4532. // in the order that we will need them, to hide latency. I'm
  4533. // avoiding making an array of them, so that they'll remain in
  4534. // registers.
  4535. fltx4 matSplat00, matSplat01, matSplat02,
  4536. matSplat10, matSplat11, matSplat12,
  4537. matSplat20, matSplat21, matSplat22;
  4538. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
4539. // often unaligned. The w component of each loaded row will pick up the
4540. // matrix's translation entry, but we don't really care about that.
  4541. fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
  4542. fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
  4543. fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );
  4544. matSplat00 = SplatXSIMD( matCol0 );
  4545. matSplat01 = SplatYSIMD( matCol0 );
  4546. matSplat02 = SplatZSIMD( matCol0 );
  4547. matSplat10 = SplatXSIMD( matCol1 );
  4548. matSplat11 = SplatYSIMD( matCol1 );
  4549. matSplat12 = SplatZSIMD( matCol1 );
  4550. matSplat20 = SplatXSIMD( matCol2 );
  4551. matSplat21 = SplatYSIMD( matCol2 );
  4552. matSplat22 = SplatZSIMD( matCol2 );
  4553. // Trust in the compiler to schedule these operations correctly:
  4554. fltx4 outX, outY, outZ;
  4555. outX = AddSIMD( AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ), MulSIMD( z, matSplat02 ) );
  4556. outY = AddSIMD( AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ), MulSIMD( z, matSplat12 ) );
  4557. outZ = AddSIMD( AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ), MulSIMD( z, matSplat22 ) );
  4558. x = outX;
  4559. y = outY;
  4560. z = outZ;
  4561. }
4562. // Assume the given matrix is a rotation plus translation, and transform these points by it.
4563. // If you have a long list of FourVectors structures that you want
4564. // to transform by the same matrix, use FourVectors::TransformManyBy() instead.
  4565. void FourVectors::TransformBy(const matrix3x4_t& matrix)
  4566. {
  4567. // Splat out each of the entries in the matrix to a fltx4. Do this
  4568. // in the order that we will need them, to hide latency. I'm
  4569. // avoiding making an array of them, so that they'll remain in
  4570. // registers.
  4571. fltx4 matSplat00, matSplat01, matSplat02,
  4572. matSplat10, matSplat11, matSplat12,
  4573. matSplat20, matSplat21, matSplat22;
  4574. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
4575. // often unaligned. The w component of each loaded row will pick up the
4576. // matrix's translation entry, which is added back in at the end below.
  4577. fltx4 matCol0 = LoadUnalignedSIMD( matrix[0] );
  4578. fltx4 matCol1 = LoadUnalignedSIMD( matrix[1] );
  4579. fltx4 matCol2 = LoadUnalignedSIMD( matrix[2] );
  4580. matSplat00 = SplatXSIMD( matCol0 );
  4581. matSplat01 = SplatYSIMD( matCol0 );
  4582. matSplat02 = SplatZSIMD( matCol0 );
  4583. matSplat10 = SplatXSIMD( matCol1 );
  4584. matSplat11 = SplatYSIMD( matCol1 );
  4585. matSplat12 = SplatZSIMD( matCol1 );
  4586. matSplat20 = SplatXSIMD( matCol2 );
  4587. matSplat21 = SplatYSIMD( matCol2 );
  4588. matSplat22 = SplatZSIMD( matCol2 );
  4589. // Trust in the compiler to schedule these operations correctly:
  4590. fltx4 outX, outY, outZ;
  4591. outX = MaddSIMD( z, matSplat02, AddSIMD( MulSIMD( x, matSplat00 ), MulSIMD( y, matSplat01 ) ) );
  4592. outY = MaddSIMD( z, matSplat12, AddSIMD( MulSIMD( x, matSplat10 ), MulSIMD( y, matSplat11 ) ) );
  4593. outZ = MaddSIMD( z, matSplat22, AddSIMD( MulSIMD( x, matSplat20 ), MulSIMD( y, matSplat21 ) ) );
  4594. x = AddSIMD( outX, ReplicateX4( matrix[0][3] ));
  4595. y = AddSIMD( outY, ReplicateX4( matrix[1][3] ));
  4596. z = AddSIMD( outZ, ReplicateX4( matrix[2][3] ));
  4597. }
  4598. #endif
  4599. fltx4 NoiseSIMD( FourVectors const &v );
4600. // vector-valued noise (returns a direction per input point)
4601. FourVectors DNoiseSIMD( FourVectors const &v );
4602. // vector-valued "curl" noise function. see http://hyperphysics.phy-astr.gsu.edu/hbase/curl.html
  4603. FourVectors CurlNoiseSIMD( FourVectors const &v );
  4604. //#endif // !defined SPU
  4605. /// quick, low quality perlin-style noise() function suitable for real time use.
  4606. /// return value is -1..1. Only reliable around +/- 1 million or so.
  4607. fltx4 NoiseSIMD( const fltx4 & x, const fltx4 & y, const fltx4 & z );
  4608. /// calculate the absolute value of a packed single
  4609. inline fltx4 fabs( const fltx4 & x )
  4610. {
  4611. return AndSIMD( x, LoadAlignedSIMD( g_SIMD_clear_signmask ) );
  4612. }
  4613. // Convenience version
  4614. inline fltx4 AbsSIMD( const fltx4 & x )
  4615. {
  4616. return fabs( x );
  4617. }
  4618. /// negate all four components of a SIMD packed single
  4619. inline fltx4 fnegate( const fltx4 & x )
  4620. {
  4621. return XorSIMD( x, LoadAlignedSIMD( g_SIMD_signmask ) );
  4622. }
  4623. fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent);
  4624. // PowSIMD - raise a SIMD register to a power. This is analogous to the C pow() function, with some
4625. // restrictions: fractional exponents are only handled with 2 bits of precision. Basically,
4626. // fractions of 0, .25, .5, and .75 are handled. PowSIMD(x,.30) will be the same as PowSIMD(x,.25).
4627. // Negative and fractional powers are handled by the SIMD reciprocal and square root approximation
4628. // instructions and so are not especially accurate. Note that this routine does not raise
4629. // numeric exceptions because it uses SIMD. This routine is O(log2(exponent)).
  4630. inline fltx4 PowSIMD( const fltx4 & x, float exponent )
  4631. {
  4632. return Pow_FixedPoint_Exponent_SIMD(x,(int) (4.0*exponent));
  4633. }
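// Sketch (illustrative; "x" is a hypothetical fltx4): since only 2 fractional bits of
// the exponent survive,
//
//   fltx4 a = PowSIMD( x, 2.5f );    // x^2.5
//   fltx4 b = PowSIMD( x, 2.30f );   // quantized to x^2.25
//   fltx4 c = PowSIMD( x, -1.0f );   // approximate 1/x via the reciprocal estimate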
  4634. /// (x<1)?x^(1/2.2):1. Use a 4th order polynomial to approximate x^(1/2.2) over 0..1
  4635. inline fltx4 LinearToGammaSIMD( fltx4 x )
  4636. {
  4637. // y = -3.7295x4 + 8.9635x3 - 7.7397x2 + 3.443x + 0.048
  4638. x = MaxSIMD( MinSIMD( Four_Ones, x ), Four_Zeros );
  4639. return AddSIMD( Four_LinearToGammaCoefficients_E,
  4640. MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_D,
  4641. MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_C,
  4642. MulSIMD( x, AddSIMD( Four_LinearToGammaCoefficients_B,
  4643. MulSIMD( x, Four_LinearToGammaCoefficients_A ) ) ) ) ) ) ) );
  4644. }
  4645. inline fltx4 GammaToLinearSIMD( fltx4 x )
  4646. {
  4647. x = MaxSIMD( x, Four_Zeros );
  4648. x = AddSIMD( Four_GammaToLinearCoefficients_D,
  4649. MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_C,
  4650. MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_B,
  4651. MulSIMD( x, Four_GammaToLinearCoefficients_A ) ) ) ) ) );
  4652. return MinSIMD( x, Four_Ones );
  4653. }
  4654. /// ( x > 1 ) ? x : x^2.2
  4655. inline fltx4 GammaToLinearExtendedSIMD( fltx4 x )
  4656. {
  4657. x = MaxSIMD( x, Four_Zeros );
  4658. fltx4 fl4Ret = AddSIMD( Four_GammaToLinearCoefficients_D,
  4659. MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_C,
  4660. MulSIMD( x, AddSIMD( Four_GammaToLinearCoefficients_B,
  4661. MulSIMD( x, Four_GammaToLinearCoefficients_A ) ) ) ) ) );
  4662. return MaskedAssign( CmpGeSIMD( x, Four_Ones ), x, fl4Ret );
  4663. }
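// Sketch (illustrative; "linearColor" is a hypothetical fltx4 of 0..1 channel values):
//
//   fltx4 gamma  = LinearToGammaSIMD( linearColor ); // approx pow( x, 1/2.2 ), clamped to 0..1
//   fltx4 linear = GammaToLinearSIMD( gamma );       // approx pow( x, 2.2 ), clamped to 0..1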
  4664. // random number generation - generate 4 random numbers quickly.
  4665. void SeedRandSIMD(uint32 seed); // seed the random # generator
  4666. fltx4 RandSIMD( int nContext = 0 ); // return 4 numbers in the 0..1 range
  4667. // for multithreaded, you need to use these and use the argument form of RandSIMD:
  4668. int GetSIMDRandContext( void );
  4669. void ReleaseSIMDRandContext( int nContext );
  4670. FORCEINLINE fltx4 RandSignedSIMD( void ) // -1..1
  4671. {
  4672. return SubSIMD( MulSIMD( Four_Twos, RandSIMD() ), Four_Ones );
  4673. }
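// Multithreaded usage sketch for the generator above (illustrative; "MyWorker" is
// a hypothetical job function):
//
//   void MyWorker()
//   {
//       int nCtx = GetSIMDRandContext();
//       fltx4 r01  = RandSIMD( nCtx );                                              // four values in 0..1
//       fltx4 rNeg = SubSIMD( MulSIMD( Four_Twos, RandSIMD( nCtx ) ), Four_Ones );  // four values in -1..1
//       ReleaseSIMDRandContext( nCtx );
//   }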
  4674. FORCEINLINE fltx4 LerpSIMD ( const fltx4 &percent, const fltx4 &a, const fltx4 &b)
  4675. {
  4676. return AddSIMD( a, MulSIMD( SubSIMD( b, a ), percent ) );
  4677. }
  4678. FORCEINLINE fltx4 RemapValClampedSIMD(const fltx4 &val, const fltx4 &a, const fltx4 &b, const fltx4 &c, const fltx4 &d) // Remap val from clamped range between a and b to new range between c and d
  4679. {
  4680. fltx4 range = MaskedAssign( CmpEqSIMD( a, b ), Four_Ones, SubSIMD( b, a ) ); //make sure range > 0
  4681. fltx4 cVal = MaxSIMD( Four_Zeros, MinSIMD( Four_Ones, DivSIMD( SubSIMD( val, a ), range ) ) ); //saturate
  4682. return LerpSIMD( cVal, c, d );
  4683. }
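// Worked example (illustrative; "dist" is a hypothetical fltx4 of distances): remap
// distances in [0,100] to an alpha in [1,0].
//
//   fltx4 alpha = RemapValClampedSIMD( dist, Four_Zeros, ReplicateX4( 100.0f ),
//                                      Four_Ones, Four_Zeros );
//   // dist = 0 -> 1, dist = 50 -> 0.5, dist >= 100 -> 0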
  4684. // SIMD versions of mathlib simplespline functions
  4685. // hermite basis function for smooth interpolation
  4686. // Similar to Gain() above, but very cheap to call
  4687. // value should be between 0 & 1 inclusive
  4688. inline fltx4 SimpleSpline( const fltx4 & value )
  4689. {
  4690. // Arranged to avoid a data dependency between these two MULs:
  4691. fltx4 valueDoubled = MulSIMD( value, Four_Twos );
  4692. fltx4 valueSquared = MulSIMD( value, value );
  4693. // Nice little ease-in, ease-out spline-like curve
  4694. return SubSIMD(
  4695. MulSIMD( Four_Threes, valueSquared ),
  4696. MulSIMD( valueDoubled, valueSquared ) );
  4697. }
  4698. // remaps a value in [startInterval, startInterval+rangeInterval] from linear to
  4699. // spline using SimpleSpline
  4700. inline fltx4 SimpleSplineRemapValWithDeltas( const fltx4 & val,
  4701. const fltx4 & A, const fltx4 & BMinusA,
  4702. const fltx4 & OneOverBMinusA, const fltx4 & C,
  4703. const fltx4 & DMinusC )
  4704. {
  4705. // if ( A == B )
  4706. // return val >= B ? D : C;
  4707. fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
  4708. return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
  4709. }
  4710. inline fltx4 SimpleSplineRemapValWithDeltasClamped( const fltx4 & val,
  4711. const fltx4 & A, const fltx4 & BMinusA,
  4712. const fltx4 & OneOverBMinusA, const fltx4 & C,
  4713. const fltx4 & DMinusC )
  4714. {
  4715. // if ( A == B )
  4716. // return val >= B ? D : C;
  4717. fltx4 cVal = MulSIMD( SubSIMD( val, A), OneOverBMinusA );
  4718. cVal = MinSIMD( Four_Ones, MaxSIMD( Four_Zeros, cVal ) );
  4719. return AddSIMD( C, MulSIMD( DMinusC, SimpleSpline( cVal ) ) );
  4720. }
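// Sketch (illustrative; "val" is a hypothetical fltx4): the "WithDeltas" forms expect
// the caller to precompute the loop-invariant terms once.
//
//   fltx4 A = ReplicateX4( 0.0f ), B = ReplicateX4( 10.0f );
//   fltx4 C = ReplicateX4( 0.0f ), D = ReplicateX4( 1.0f );
//   fltx4 BMinusA        = SubSIMD( B, A );
//   fltx4 OneOverBMinusA = ReciprocalSIMD( BMinusA );
//   fltx4 DMinusC        = SubSIMD( D, C );
//   fltx4 out = SimpleSplineRemapValWithDeltasClamped( val, A, BMinusA, OneOverBMinusA, C, DMinusC );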
  4721. FORCEINLINE fltx4 FracSIMD( const fltx4 &val )
  4722. {
  4723. fltx4 fl4Abs = fabs( val );
  4724. fltx4 ival = SubSIMD( AddSIMD( fl4Abs, Four_2ToThe23s ), Four_2ToThe23s );
  4725. ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Ones ), ival );
  4726. return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits
  4727. }
  4728. #ifndef SPU
  4729. // Disable on SPU for the moment as it generates a warning
  4730. // warning: dereferencing type-punned pointer will break strict-aliasing rules
  4731. // This is related to LoadAlignedSIMD( (float *) g_SIMD_lsbmask )
  4732. // LoadAlignedSIMD() under the hood is dereferencing the variable.
  4733. FORCEINLINE fltx4 Mod2SIMD( const fltx4 &val )
  4734. {
  4735. fltx4 fl4Abs = fabs( val );
  4736. fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( (float *) g_SIMD_lsbmask ), AddSIMD( fl4Abs, Four_2ToThe23s ) ), Four_2ToThe23s );
  4737. ival = MaskedAssign( CmpGtSIMD( ival, fl4Abs ), SubSIMD( ival, Four_Twos ), ival );
  4738. return XorSIMD( SubSIMD( fl4Abs, ival ), XorSIMD( val, fl4Abs ) ); // restore sign bits
  4739. }
  4740. #endif
  4741. FORCEINLINE fltx4 Mod2SIMDPositiveInput( const fltx4 &val )
  4742. {
  4743. fltx4 ival = SubSIMD( AndSIMD( LoadAlignedSIMD( g_SIMD_lsbmask ), AddSIMD( val, Four_2ToThe23s ) ), Four_2ToThe23s );
  4744. ival = MaskedAssign( CmpGtSIMD( ival, val ), SubSIMD( ival, Four_Twos ), ival );
  4745. return SubSIMD( val, ival );
  4746. }
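// FracSIMD / Mod2SIMD / Mod2SIMDPositiveInput above rely on the classic float trick:
// adding and subtracting 2^23 rounds a non-negative single-precision value to an integer
// (the mantissa runs out of fraction bits), and the CmpGt/MaskedAssign step turns that
// round-to-nearest into a floor; FracSIMD and Mod2SIMD then restore the original sign
// bits with XOR.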
4747. // approximate sin of an angle measured in half-turns, i.e. sin( pi * val ), so -1..1 covers the whole sine period instead of -pi..pi.
4748. // these _-prefixed versions do no range reduction - for values outside of 0..1 you won't like the results; use SinEst01SIMD / Sin01SIMD below for arbitrary inputs
  4749. FORCEINLINE fltx4 _SinEst01SIMD( const fltx4 &val )
  4750. {
  4751. // really rough approximation - x*(4-x*4) - a parabola. s(0) = 0, s(.5) = 1, s(1)=0, smooth in-between.
  4752. // sufficient for simple oscillation.
  4753. return MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
  4754. }
  4755. FORCEINLINE fltx4 _Sin01SIMD( const fltx4 &val )
  4756. {
  4757. // not a bad approximation : parabola always over-estimates. Squared parabola always
  4758. // underestimates. So lets blend between them: goodsin = badsin + .225*( badsin^2-badsin)
  4759. fltx4 fl4BadEst = MulSIMD( val, SubSIMD( Four_Fours, MulSIMD( val, Four_Fours ) ) );
  4760. return AddSIMD( MulSIMD( Four_Point225s, SubSIMD( MulSIMD( fl4BadEst, fl4BadEst ), fl4BadEst ) ), fl4BadEst );
  4761. }
  4762. // full range useable implementations
  4763. FORCEINLINE fltx4 SinEst01SIMD( const fltx4 &val )
  4764. {
  4765. fltx4 fl4Abs = fabs( val );
  4766. fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
  4767. bi32x4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
  4768. fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
  4769. fltx4 fl4Sin = _SinEst01SIMD( fl4val );
  4770. fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
  4771. return fl4Sin;
  4772. }
  4773. FORCEINLINE fltx4 Sin01SIMD( const fltx4 &val )
  4774. {
  4775. fltx4 fl4Abs = fabs( val );
  4776. fltx4 fl4Reduced2 = Mod2SIMDPositiveInput( fl4Abs );
  4777. bi32x4 fl4OddMask = CmpGeSIMD( fl4Reduced2, Four_Ones );
  4778. fltx4 fl4val = SubSIMD( fl4Reduced2, AndSIMD( Four_Ones, fl4OddMask ) );
  4779. fltx4 fl4Sin = _Sin01SIMD( fl4val );
  4780. fl4Sin = XorSIMD( fl4Sin, AndSIMD( LoadAlignedSIMD( g_SIMD_signmask ), XorSIMD( val, fl4OddMask ) ) );
  4781. return fl4Sin;
  4782. }
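// Sketch (illustrative; "angleRad" is a hypothetical fltx4): to evaluate sin of angles
// given in radians, pre-scale by 1/pi so the input is in the half-turn units these
// functions expect.
//
//   fltx4 halfTurns = MulSIMD( angleRad, ReplicateX4( 1.0f / 3.14159265f ) );
//   fltx4 s         = Sin01SIMD( halfTurns );   // approx sin( angleRad )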
  4783. FORCEINLINE fltx4 NatExpSIMD( const fltx4 &val ) // why is ExpSimd( x ) defined to be 2^x?
  4784. {
  4785. // need to write this. just stub with normal float implementation for now
  4786. fltx4 fl4Result;
  4787. SubFloat( fl4Result, 0 ) = exp( SubFloat( val, 0 ) );
  4788. SubFloat( fl4Result, 1 ) = exp( SubFloat( val, 1 ) );
  4789. SubFloat( fl4Result, 2 ) = exp( SubFloat( val, 2 ) );
  4790. SubFloat( fl4Result, 3 ) = exp( SubFloat( val, 3 ) );
  4791. return fl4Result;
  4792. }
4793. // Schlick-style bias approximation (see Graphics Gems 4): bias(t,a) = t / ( (1/a - 2)*(1 - t) + 1 )
  4794. FORCEINLINE fltx4 PreCalcBiasParameter( const fltx4 &bias_parameter )
  4795. {
  4796. // convert perlin-style-bias parameter to the value right for the approximation
  4797. return SubSIMD( ReciprocalSIMD( bias_parameter ), Four_Twos );
  4798. }
  4799. FORCEINLINE fltx4 BiasSIMD( const fltx4 &val, const fltx4 &precalc_param )
  4800. {
  4801. // similar to bias function except pass precalced bias value from calling PreCalcBiasParameter.
  4802. //!!speed!! use reciprocal est?
  4803. //!!speed!! could save one op by precalcing _2_ values
  4804. return DivSIMD( val, AddSIMD( MulSIMD( precalc_param, SubSIMD( Four_Ones, val ) ), Four_Ones ) );
  4805. }
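// Sketch (illustrative; "t" is a hypothetical fltx4 in 0..1):
//
//   fltx4 biasParam = PreCalcBiasParameter( ReplicateX4( 0.2f ) ); // perlin-style bias of 0.2
//   fltx4 bent      = BiasSIMD( t, biasParam );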
  4806. //-----------------------------------------------------------------------------
  4807. // Box/plane test
  4808. // NOTE: The w component of emins + emaxs must be 1 for this to work
  4809. //-----------------------------------------------------------------------------
  4810. #ifndef SPU
  4811. // We don't need this on SPU right now
  4812. FORCEINLINE int BoxOnPlaneSideSIMD( const fltx4& emins, const fltx4& emaxs, const cplane_t *p, float tolerance = 0.f )
  4813. {
  4814. fltx4 corners[2];
  4815. fltx4 normal = LoadUnalignedSIMD( p->normal.Base() );
  4816. fltx4 dist = ReplicateX4( -p->dist );
  4817. normal = SetWSIMD( normal, dist );
  4818. fltx4 t4 = ReplicateX4( tolerance );
  4819. fltx4 negt4 = ReplicateX4( -tolerance );
  4820. bi32x4 cmp = CmpGeSIMD( normal, Four_Zeros );
  4821. corners[0] = MaskedAssign( cmp, emaxs, emins );
  4822. corners[1] = MaskedAssign( cmp, emins, emaxs );
  4823. fltx4 dot1 = Dot4SIMD( normal, corners[0] );
  4824. fltx4 dot2 = Dot4SIMD( normal, corners[1] );
  4825. cmp = CmpGeSIMD( dot1, t4 );
  4826. bi32x4 cmp2 = CmpGtSIMD( negt4, dot2 );
  4827. fltx4 result = MaskedAssign( cmp, Four_Ones, Four_Zeros );
  4828. fltx4 result2 = MaskedAssign( cmp2, Four_Twos, Four_Zeros );
  4829. result = AddSIMD( result, result2 );
  4830. intx4 sides;
  4831. ConvertStoreAsIntsSIMD( &sides, result );
  4832. return sides[0];
  4833. }
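// The return value is a two-bit side mask built from the comparisons above:
// 1 = the box reaches in front of the plane (beyond +tolerance), 2 = it reaches behind
// (beyond -tolerance), 3 = it straddles the plane.
//
//   // Sketch (illustrative; "mins1"/"maxs1" are hypothetical fltx4 box corners with w = 1,
//   // and "plane" a hypothetical cplane_t):
//   int nSide = BoxOnPlaneSideSIMD( mins1, maxs1, &plane );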
4834. // k-dop bounding volume: 26-dop bounds with 13 plane-pairs plus 3 other "arbitrary bounds". The arbitrary values could be used to hold type info, etc.,
4835. // which can then be compared against "for free" as part of the intersection test.
  4836. class KDop32_t
  4837. {
  4838. public:
  4839. fltx4 m_Mins[4];
  4840. fltx4 m_Maxes[4];
  4841. FORCEINLINE bool Intersects( KDop32_t const &other ) const;
  4842. FORCEINLINE void operator|=( KDop32_t const & other );
  4843. FORCEINLINE bool IsEmpty( void ) const;
  4844. FORCEINLINE void Init( void )
  4845. {
  4846. for( int i = 0; i < ARRAYSIZE( m_Mins ); i++ )
  4847. {
  4848. m_Mins[i] = Four_FLT_MAX;
  4849. m_Maxes[i] = Four_Negative_FLT_MAX;
  4850. }
  4851. }
  4852. // given a set of points, expand the kdop to contain them
  4853. void AddPointSet( Vector const *pPoints, int nPnts );
  4854. void CreateFromPointSet( Vector const *pPoints, int nPnts );
  4855. };
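// Usage sketch (illustrative; the vertex arrays and counts are hypothetical):
//
//   KDop32_t kdopA, kdopB;
//   kdopA.CreateFromPointSet( hullVertsA, nHullVertsA );
//   kdopB.CreateFromPointSet( hullVertsB, nHullVertsB );
//   if ( kdopA.Intersects( kdopB ) )
//   {
//       // the 26-dop bounds overlap - run the exact narrow-phase test
//   }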
  4856. FORCEINLINE void KDop32_t::operator|=( KDop32_t const & other )
  4857. {
  4858. m_Mins[0] = MinSIMD( m_Mins[0], other.m_Mins[0] );
  4859. m_Mins[1] = MinSIMD( m_Mins[1], other.m_Mins[1] );
  4860. m_Mins[2] = MinSIMD( m_Mins[2], other.m_Mins[2] );
  4861. m_Mins[3] = MinSIMD( m_Mins[3], other.m_Mins[3] );
  4862. m_Maxes[0] = MaxSIMD( m_Maxes[0], other.m_Maxes[0] );
  4863. m_Maxes[1] = MaxSIMD( m_Maxes[1], other.m_Maxes[1] );
  4864. m_Maxes[2] = MaxSIMD( m_Maxes[2], other.m_Maxes[2] );
  4865. m_Maxes[3] = MaxSIMD( m_Maxes[3], other.m_Maxes[3] );
  4866. }
  4867. FORCEINLINE bool KDop32_t::Intersects( KDop32_t const &other ) const
  4868. {
  4869. bi32x4 c00 = CmpLeSIMD( m_Mins[0], other.m_Maxes[0] );
  4870. bi32x4 c01 = CmpLeSIMD( m_Mins[1], other.m_Maxes[1] );
  4871. bi32x4 c02 = CmpLeSIMD( m_Mins[2], other.m_Maxes[2] );
  4872. bi32x4 c03 = CmpLeSIMD( m_Mins[3], other.m_Maxes[3] );
  4873. bi32x4 c10 = CmpGeSIMD( m_Maxes[0], other.m_Mins[0] );
  4874. bi32x4 c11 = CmpGeSIMD( m_Maxes[1], other.m_Mins[1] );
  4875. bi32x4 c12 = CmpGeSIMD( m_Maxes[2], other.m_Mins[2] );
  4876. bi32x4 c13 = CmpGeSIMD( m_Maxes[3], other.m_Mins[3] );
  4877. bi32x4 a0 = AndSIMD( AndSIMD( c00, c01 ), AndSIMD( c02, c03 ) );
  4878. bi32x4 a1 = AndSIMD( AndSIMD( c10, c11 ), AndSIMD( c12, c13 ) );
  4879. return ! ( IsAnyZeros( AndSIMD( a1, a0 ) ) );
  4880. }
  4881. FORCEINLINE bool KDop32_t::IsEmpty( void ) const
  4882. {
  4883. bi32x4 c00 = CmpLtSIMD( m_Maxes[0], m_Mins[0] );
  4884. bi32x4 c01 = CmpLtSIMD( m_Maxes[1], m_Mins[1] );
  4885. bi32x4 c02 = CmpLtSIMD( m_Maxes[2], m_Mins[2] );
  4886. bi32x4 c03 = CmpLtSIMD( m_Maxes[3], m_Mins[3] );
  4887. return IsAnyTrue( OrSIMD( OrSIMD( c00, c01 ), OrSIMD( c02, c03 ) ) );
  4888. }
  4889. extern const fltx4 g_KDop32XDirs[4];
  4890. extern const fltx4 g_KDop32YDirs[4];
  4891. extern const fltx4 g_KDop32ZDirs[4];
  4892. #endif
  4893. #if 0
  4894. // FIXME!!! If we need a version of this that runs on 360, this is a work-in-progress version that hasn't been debugged.
  4895. #define _VEC_SWIZZLE_QUAT48_UNPACK (__vector unsigned char) { 16, 17, 0, 1, 16, 17, 2, 3, 16, 17, 4, 5, 16, 17, 6, 7 }
  4896. #define _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT (__vector unsigned int ) { 0, 0, 1, 0 }
  4897. // unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
  4898. FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
  4899. {
  4900. // A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
  4901. // z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
4902. // w is inferred from 1 - the dot product of the other three components. the top bit of what would otherwise be the 16-bit z is
  4903. // w's sign bit.
  4904. // fltx4 q16s = XMLoadVector3((const void *)pVec);
  4905. fltx4 q16s = LoadUnaligned3SIMD( (const float * )pVec);
  4906. // fltx4 shift = *( fltx4 * )&g_SIMD_Quat48_Unpack_Shift; // load the aligned shift mask that we use to shuffle z.
  4907. // fltx4 permute = *( fltx4 * )&g_SIMD_Quat48_Unpack_Permute0; // load the permute word that shuffles x,y,z into their own words
  4908. bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
  4909. // q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
  4910. q16s = vec_perm( q16s, Four_Threes, _VEC_SWIZZLE_QUAT48_UNPACK ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
  4911. // q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  4912. // q16s = vec_sl( *( u32x4 * )( void * )( &q16s ), _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT ); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
  4913. u32x4 tmp = IntShiftLeftWordSIMD( *( u32x4 * )&q16s, _VEC_SWIZZLE_QUAT48_UNPACK_SHIFT );
  4914. q16s = *( fltx4 * )&tmp;
  4915. // each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
  4916. const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
  4917. const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
  4918. /*
  4919. fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.
  4920. // scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
  4921. ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
  4922. */
  4923. // fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );
  4924. fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );
  4925. // now, work out what w must be.
  4926. fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
  4927. dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
  4928. fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
  4929. ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
  4930. if ( wneg )
  4931. {
  4932. ret = SetWSIMD( ret, NegSIMD( ww ) );
  4933. // ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
  4934. }
  4935. else
  4936. {
  4937. ret = SetWSIMD( ret, ww );
  4938. // ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
  4939. }
  4940. return ret;
  4941. }
  4942. #endif
4943. // These are not yet optimized for some platforms. On platforms with shuffle/permute
4944. // instructions, and because the source/destination lanes are hard-coded, the transfer
4945. // could be done entirely in registers without going through memory; all the remaining lane combinations could be added the same way.
  4946. FORCEINLINE fltx4 SetWFromXSIMD( const fltx4 & a, const fltx4 & x )
  4947. {
  4948. fltx4 value = SplatXSIMD( x );
  4949. return SetWSIMD( a, value );
  4950. }
  4951. FORCEINLINE fltx4 SetWFromYSIMD( const fltx4 & a, const fltx4 & y )
  4952. {
  4953. fltx4 value = SplatYSIMD( y );
  4954. return SetWSIMD( a, value );
  4955. }
  4956. FORCEINLINE fltx4 SetWFromZSIMD( const fltx4 & a, const fltx4 & z )
  4957. {
  4958. fltx4 value = SplatZSIMD( z );
  4959. return SetWSIMD( a, value );
  4960. }
  4961. FORCEINLINE fltx4 CrossProductSIMD( const fltx4 &A, const fltx4 &B )
  4962. {
  4963. #if defined( _X360 )
  4964. return XMVector3Cross( A, B );
  4965. #elif defined( _WIN32 )
  4966. fltx4 A1 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
  4967. fltx4 B1 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
  4968. fltx4 Result1 = MulSIMD( A1, B1 );
  4969. fltx4 A2 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
  4970. fltx4 B2 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
  4971. fltx4 Result2 = MulSIMD( A2, B2 );
  4972. return SubSIMD( Result1, Result2 );
  4973. #elif defined(_PS3)
  4974. /*
  4975. fltx4 perm1 = (vector unsigned char){0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f};
  4976. fltx4 perm2 = (vector unsigned char){0x08,0x09,0x0a,0x0b,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f};
  4977. fltx4 A1 = __vpermwi( A, A, perm1 );
  4978. fltx4 A2 = __vpermwi( B, B, perm2 );
  4979. fltx4 Result1 = MulSIMD( A1, B1 );
  4980. fltx4 A2 = __vpermwi( A, A, perm2 );
  4981. fltx4 B2 = __vpermwi( B, B, perm1 );
  4982. return MsubSIMD( A2, B2, Result1 );
  4983. */
  4984. return _vmathVfCross( A, B );
  4985. #else
  4986. fltx4 CrossVal;
  4987. SubFloat( CrossVal, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 );
  4988. SubFloat( CrossVal, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 );
  4989. SubFloat( CrossVal, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 );
  4990. SubFloat( CrossVal, 3 ) = 0;
  4991. return CrossVal;
  4992. #endif
  4993. }
  4994. inline const fltx4 Length3SIMD(const fltx4 vec)
  4995. {
  4996. fltx4 scLengthSqr = Dot3SIMD(vec,vec);
  4997. bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons);
  4998. fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr);
  4999. return AndSIMD(isSignificant, MulSIMD(scLengthInv, scLengthSqr));
  5000. }
  5001. inline const fltx4 Normalized3SIMD (const fltx4 vec)
  5002. {
  5003. fltx4 scLengthSqr = Dot3SIMD(vec,vec);
  5004. bi32x4 isSignificant = CmpGtSIMD(scLengthSqr, Four_Epsilons);
  5005. fltx4 scLengthInv = ReciprocalSqrtSIMD(scLengthSqr);
  5006. return AndSIMD(isSignificant, MulSIMD(vec, scLengthInv));
  5007. }
  5008. // Some convenience operator overloads, which are just aliasing the functions above.
5009. // Unnecessary on 360, as you already have them from xboxmath.h
  5010. // Componentwise add
  5011. #ifndef COMPILER_GCC
  5012. FORCEINLINE fltx4 operator+=( fltx4 &a, FLTX4 b )
  5013. {
  5014. a = AddSIMD( a, b );
  5015. return a;
  5016. }
  5017. FORCEINLINE fltx4 operator-=( fltx4 &a, FLTX4 b )
  5018. {
  5019. a = SubSIMD( a, b );
  5020. return a;
  5021. }
  5022. FORCEINLINE fltx4 operator*=( fltx4 &a, FLTX4 b )
  5023. {
  5024. a = MulSIMD( a, b );
  5025. return a;
  5026. }
  5027. #endif
5028. #endif // SSEMATH_H