Counter Strike : Global Offensive Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

921 lines
30 KiB

  1. //===== Copyright 1996-2005, Valve Corporation, All rights reserved. ======//
  2. //
  3. // Purpose: Implementation of our SIMD function using generic c++ code and a struct. This
  4. // implementation will not be especially fast, but gets us up fast on new platforms and also acts
  5. // as an easy-to-understand reference implementation.
  6. //
  7. //==============================================================//
  8. //---------------------------------------------------------------------
  9. // Standard C (fallback/new platform) implementation (only there for compat - slow)
  10. //---------------------------------------------------------------------
  11. FORCEINLINE float SubFloat( const fltx4 & a, int idx )
  12. {
  13. return a.m128_f32[ idx ];
  14. }
  15. FORCEINLINE float & SubFloat( fltx4 & a, int idx )
  16. {
  17. return a.m128_f32[idx];
  18. }
  19. FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
  20. {
  21. return a.m128_u32[idx];
  22. }
  23. FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
  24. {
  25. return a.m128_u32[idx];
  26. }
  27. // Return one in the fastest way -- on the x360, faster even than loading.
  28. FORCEINLINE fltx4 LoadZeroSIMD( void )
  29. {
  30. return Four_Zeros;
  31. }
  32. // Return one in the fastest way -- on the x360, faster even than loading.
  33. FORCEINLINE fltx4 LoadOneSIMD( void )
  34. {
  35. return Four_Ones;
  36. }
  37. FORCEINLINE fltx4 SplatXSIMD( const fltx4 & a )
  38. {
  39. fltx4 retVal;
  40. SubFloat( retVal, 0 ) = SubFloat( a, 0 );
  41. SubFloat( retVal, 1 ) = SubFloat( a, 0 );
  42. SubFloat( retVal, 2 ) = SubFloat( a, 0 );
  43. SubFloat( retVal, 3 ) = SubFloat( a, 0 );
  44. return retVal;
  45. }
  46. FORCEINLINE fltx4 SplatYSIMD( fltx4 a )
  47. {
  48. fltx4 retVal;
  49. SubFloat( retVal, 0 ) = SubFloat( a, 1 );
  50. SubFloat( retVal, 1 ) = SubFloat( a, 1 );
  51. SubFloat( retVal, 2 ) = SubFloat( a, 1 );
  52. SubFloat( retVal, 3 ) = SubFloat( a, 1 );
  53. return retVal;
  54. }
  55. FORCEINLINE fltx4 SplatZSIMD( fltx4 a )
  56. {
  57. fltx4 retVal;
  58. SubFloat( retVal, 0 ) = SubFloat( a, 2 );
  59. SubFloat( retVal, 1 ) = SubFloat( a, 2 );
  60. SubFloat( retVal, 2 ) = SubFloat( a, 2 );
  61. SubFloat( retVal, 3 ) = SubFloat( a, 2 );
  62. return retVal;
  63. }
  64. FORCEINLINE fltx4 SplatWSIMD( fltx4 a )
  65. {
  66. fltx4 retVal;
  67. SubFloat( retVal, 0 ) = SubFloat( a, 3 );
  68. SubFloat( retVal, 1 ) = SubFloat( a, 3 );
  69. SubFloat( retVal, 2 ) = SubFloat( a, 3 );
  70. SubFloat( retVal, 3 ) = SubFloat( a, 3 );
  71. return retVal;
  72. }
  73. FORCEINLINE fltx4 SetXSIMD( const fltx4& a, const fltx4& x )
  74. {
  75. fltx4 result = a;
  76. SubFloat( result, 0 ) = SubFloat( x, 0 );
  77. return result;
  78. }
  79. FORCEINLINE fltx4 SetYSIMD( const fltx4& a, const fltx4& y )
  80. {
  81. fltx4 result = a;
  82. SubFloat( result, 1 ) = SubFloat( y, 1 );
  83. return result;
  84. }
  85. FORCEINLINE fltx4 SetZSIMD( const fltx4& a, const fltx4& z )
  86. {
  87. fltx4 result = a;
  88. SubFloat( result, 2 ) = SubFloat( z, 2 );
  89. return result;
  90. }
  91. FORCEINLINE fltx4 SetWSIMD( const fltx4& a, const fltx4& w )
  92. {
  93. fltx4 result = a;
  94. SubFloat( result, 3 ) = SubFloat( w, 3 );
  95. return result;
  96. }
  97. /// Set one component of a SIMD word with the given float value.
  98. /// This function is a template because the native implementation of
  99. /// this on PPC platforms requires that the component be given as a
  100. /// compiler immediate -- not a function parameter, not a const function
  101. /// parameter, not even a load from a const static array. It has to be
  102. /// a real immediate.
  103. /// \param NCOMPONENT 0 is x, 1 is y, 2 is z, 3 is w.
  104. /// \note This function is not particularly performant on any platform (because of
  105. /// the load from float), so prefer a masked assign from a fltx4 wherever
  106. /// possible.
  107. template < unsigned int NCOMPONENT >
  108. FORCEINLINE fltx4 SetComponentSIMD( const fltx4& a, float flValue )
  109. {
  110. fltx4 result = a;
  111. SubFloat( result, NCOMPONENT ) = flValue;
  112. return result;
  113. }
  114. // a b c d -> b c d a
  115. FORCEINLINE fltx4 RotateLeft( const fltx4 & a )
  116. {
  117. fltx4 retVal;
  118. SubFloat( retVal, 0 ) = SubFloat( a, 1 );
  119. SubFloat( retVal, 1 ) = SubFloat( a, 2 );
  120. SubFloat( retVal, 2 ) = SubFloat( a, 3 );
  121. SubFloat( retVal, 3 ) = SubFloat( a, 0 );
  122. return retVal;
  123. }
  124. // a b c d -> c d a b
  125. FORCEINLINE fltx4 RotateLeft2( const fltx4 & a )
  126. {
  127. fltx4 retVal;
  128. SubFloat( retVal, 0 ) = SubFloat( a, 2 );
  129. SubFloat( retVal, 1 ) = SubFloat( a, 3 );
  130. SubFloat( retVal, 2 ) = SubFloat( a, 0 );
  131. SubFloat( retVal, 3 ) = SubFloat( a, 1 );
  132. return retVal;
  133. }
  // Function-body helper: applies the binary float operator `op` lane by lane
  // to the enclosing function's parameters `a` and `b` and returns the result.
  #define BINOP(op) \
      fltx4 retVal; \
      for ( int nBinopLane = 0; nBinopLane < 4; ++nBinopLane ) \
      { \
          SubFloat( retVal, nBinopLane ) = ( SubFloat( a, nBinopLane ) op SubFloat( b, nBinopLane ) ); \
      } \
      return retVal;
  // Function-body helper: applies the binary integer operator `op` to the raw
  // 32-bit lanes of the enclosing function's parameters `a` and `b` and returns
  // the result.
  #define IBINOP(op) \
      fltx4 retVal; \
      for ( int nBinopLane = 0; nBinopLane < 4; ++nBinopLane ) \
      { \
          SubInt( retVal, nBinopLane ) = ( SubInt( a, nBinopLane ) op SubInt ( b, nBinopLane ) ); \
      } \
      return retVal;
  148. FORCEINLINE fltx4 AddSIMD( const fltx4 & a, const fltx4 & b )
  149. {
  150. BINOP(+);
  151. }
  152. FORCEINLINE fltx4 SubSIMD( const fltx4 & a, const fltx4 & b ) // a-b
  153. {
  154. BINOP(-);
  155. };
  156. FORCEINLINE fltx4 MulSIMD( const fltx4 & a, const fltx4 & b ) // a*b
  157. {
  158. BINOP(*);
  159. }
  160. FORCEINLINE fltx4 DivSIMD( const fltx4 & a, const fltx4 & b ) // a/b
  161. {
  162. BINOP(/);
  163. }
  164. FORCEINLINE fltx4 MaddSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // a*b + c
  165. {
  166. return AddSIMD( MulSIMD(a,b), c );
  167. }
  168. FORCEINLINE fltx4 MsubSIMD( const fltx4 & a, const fltx4 & b, const fltx4 & c ) // c - a*b
  169. {
  170. return SubSIMD( c, MulSIMD(a,b) );
  171. };
  172. FORCEINLINE fltx4 SinSIMD( const fltx4 &radians )
  173. {
  174. fltx4 result;
  175. SubFloat( result, 0 ) = sin( SubFloat( radians, 0 ) );
  176. SubFloat( result, 1 ) = sin( SubFloat( radians, 1 ) );
  177. SubFloat( result, 2 ) = sin( SubFloat( radians, 2 ) );
  178. SubFloat( result, 3 ) = sin( SubFloat( radians, 3 ) );
  179. return result;
  180. }
  181. FORCEINLINE void SinCos3SIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  182. {
  183. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  184. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  185. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  186. }
  187. FORCEINLINE void SinCosSIMD( fltx4 &sine, fltx4 &cosine, const fltx4 &radians )
  188. {
  189. SinCos( SubFloat( radians, 0 ), &SubFloat( sine, 0 ), &SubFloat( cosine, 0 ) );
  190. SinCos( SubFloat( radians, 1 ), &SubFloat( sine, 1 ), &SubFloat( cosine, 1 ) );
  191. SinCos( SubFloat( radians, 2 ), &SubFloat( sine, 2 ), &SubFloat( cosine, 2 ) );
  192. SinCos( SubFloat( radians, 3 ), &SubFloat( sine, 3 ), &SubFloat( cosine, 3 ) );
  193. }
  194. FORCEINLINE fltx4 ArcSinSIMD( const fltx4 &sine )
  195. {
  196. fltx4 result;
  197. SubFloat( result, 0 ) = asin( SubFloat( sine, 0 ) );
  198. SubFloat( result, 1 ) = asin( SubFloat( sine, 1 ) );
  199. SubFloat( result, 2 ) = asin( SubFloat( sine, 2 ) );
  200. SubFloat( result, 3 ) = asin( SubFloat( sine, 3 ) );
  201. return result;
  202. }
  203. FORCEINLINE fltx4 ArcCosSIMD( const fltx4 &cs )
  204. {
  205. fltx4 result;
  206. SubFloat( result, 0 ) = acos( SubFloat( cs, 0 ) );
  207. SubFloat( result, 1 ) = acos( SubFloat( cs, 1 ) );
  208. SubFloat( result, 2 ) = acos( SubFloat( cs, 2 ) );
  209. SubFloat( result, 3 ) = acos( SubFloat( cs, 3 ) );
  210. return result;
  211. }
  212. // tan^1(a/b) .. ie, pass sin in as a and cos in as b
  213. FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
  214. {
  215. fltx4 result;
  216. SubFloat( result, 0 ) = atan2( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  217. SubFloat( result, 1 ) = atan2( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  218. SubFloat( result, 2 ) = atan2( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  219. SubFloat( result, 3 ) = atan2( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  220. return result;
  221. }
  222. FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b ) // max(a,b)
  223. {
  224. fltx4 retVal;
  225. SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  226. SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  227. SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  228. SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  229. return retVal;
  230. }
  231. FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b ) // min(a,b)
  232. {
  233. fltx4 retVal;
  234. SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
  235. SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
  236. SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
  237. SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
  238. return retVal;
  239. }
  240. FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b ) // a & b
  241. {
  242. IBINOP(&);
  243. }
  244. FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b ) // ~a & b
  245. {
  246. fltx4 retVal;
  247. SubInt( retVal, 0 ) = ~SubInt( a, 0 ) & SubInt( b, 0 );
  248. SubInt( retVal, 1 ) = ~SubInt( a, 1 ) & SubInt( b, 1 );
  249. SubInt( retVal, 2 ) = ~SubInt( a, 2 ) & SubInt( b, 2 );
  250. SubInt( retVal, 3 ) = ~SubInt( a, 3 ) & SubInt( b, 3 );
  251. return retVal;
  252. }
  253. FORCEINLINE fltx4 XorSIMD( const fltx4 & a, const fltx4 & b ) // a ^ b
  254. {
  255. IBINOP(^);
  256. }
  257. FORCEINLINE fltx4 OrSIMD( const fltx4 & a, const fltx4 & b ) // a | b
  258. {
  259. IBINOP(|);
  260. }
  261. FORCEINLINE fltx4 NegSIMD(const fltx4 &a) // negate: -a
  262. {
  263. fltx4 retval;
  264. SubFloat( retval, 0 ) = -SubFloat( a, 0 );
  265. SubFloat( retval, 1 ) = -SubFloat( a, 1 );
  266. SubFloat( retval, 2 ) = -SubFloat( a, 2 );
  267. SubFloat( retval, 3 ) = -SubFloat( a, 3 );
  268. return retval;
  269. }
  270. FORCEINLINE bool IsAllZeros( const fltx4 & a ) // all floats of a zero?
  271. {
  272. return ( SubFloat( a, 0 ) == 0.0 ) &&
  273. ( SubFloat( a, 1 ) == 0.0 ) &&
  274. ( SubFloat( a, 2 ) == 0.0 ) &&
  275. ( SubFloat( a, 3 ) == 0.0 ) ;
  276. }
  277. // for branching when a.xyzw > b.xyzw
  278. FORCEINLINE bool IsAllGreaterThan( const fltx4 &a, const fltx4 &b )
  279. {
  280. return SubFloat(a,0) > SubFloat(b,0) &&
  281. SubFloat(a,1) > SubFloat(b,1) &&
  282. SubFloat(a,2) > SubFloat(b,2) &&
  283. SubFloat(a,3) > SubFloat(b,3);
  284. }
  285. // for branching when a.xyzw >= b.xyzw
  286. FORCEINLINE bool IsAllGreaterThanOrEq( const fltx4 &a, const fltx4 &b )
  287. {
  288. return SubFloat(a,0) >= SubFloat(b,0) &&
  289. SubFloat(a,1) >= SubFloat(b,1) &&
  290. SubFloat(a,2) >= SubFloat(b,2) &&
  291. SubFloat(a,3) >= SubFloat(b,3);
  292. }
  293. // For branching if all a.xyzw == b.xyzw
  294. FORCEINLINE bool IsAllEqual( const fltx4 & a, const fltx4 & b )
  295. {
  296. return SubFloat(a,0) == SubFloat(b,0) &&
  297. SubFloat(a,1) == SubFloat(b,1) &&
  298. SubFloat(a,2) == SubFloat(b,2) &&
  299. SubFloat(a,3) == SubFloat(b,3);
  300. }
  301. // For branching if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w
  302. FORCEINLINE bool IsAnyEqual( const fltx4 & a, const fltx4 & b )
  303. {
  304. return SubFloat(a,0) == SubFloat(b,0) ||
  305. SubFloat(a,1) == SubFloat(b,1) ||
  306. SubFloat(a,2) == SubFloat(b,2) ||
  307. SubFloat(a,3) == SubFloat(b,3);
  308. }
  309. FORCEINLINE int TestSignSIMD( const fltx4 & a ) // mask of which floats have the high bit set
  310. {
  311. int nRet = 0;
  312. nRet |= ( SubInt( a, 0 ) & 0x80000000 ) >> 31; // sign(x) -> bit 0
  313. nRet |= ( SubInt( a, 1 ) & 0x80000000 ) >> 30; // sign(y) -> bit 1
  314. nRet |= ( SubInt( a, 2 ) & 0x80000000 ) >> 29; // sign(z) -> bit 2
  315. nRet |= ( SubInt( a, 3 ) & 0x80000000 ) >> 28; // sign(w) -> bit 3
  316. return nRet;
  317. }
  318. FORCEINLINE bool IsAnyNegative( const fltx4 & a ) // (a.x < 0) || (a.y < 0) || (a.z < 0) || (a.w < 0)
  319. {
  320. return (0 != TestSignSIMD( a ));
  321. }
  322. FORCEINLINE fltx4 CmpEqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? ~0:0
  323. {
  324. fltx4 retVal;
  325. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? ~0 : 0;
  326. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? ~0 : 0;
  327. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? ~0 : 0;
  328. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? ~0 : 0;
  329. return retVal;
  330. }
  331. FORCEINLINE fltx4 CmpGtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? ~0:0
  332. {
  333. fltx4 retVal;
  334. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? ~0 : 0;
  335. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? ~0 : 0;
  336. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? ~0 : 0;
  337. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? ~0 : 0;
  338. return retVal;
  339. }
  340. FORCEINLINE fltx4 CmpGeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? ~0:0
  341. {
  342. fltx4 retVal;
  343. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? ~0 : 0;
  344. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? ~0 : 0;
  345. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? ~0 : 0;
  346. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? ~0 : 0;
  347. return retVal;
  348. }
  349. FORCEINLINE fltx4 CmpLtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? ~0:0
  350. {
  351. fltx4 retVal;
  352. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? ~0 : 0;
  353. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? ~0 : 0;
  354. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? ~0 : 0;
  355. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? ~0 : 0;
  356. return retVal;
  357. }
  358. FORCEINLINE fltx4 CmpLeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? ~0:0
  359. {
  360. fltx4 retVal;
  361. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? ~0 : 0;
  362. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? ~0 : 0;
  363. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? ~0 : 0;
  364. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? ~0 : 0;
  365. return retVal;
  366. }
  367. FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? ~0 : 0
  368. {
  369. fltx4 retVal;
  370. SubInt( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? ~0 : 0;
  371. SubInt( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? ~0 : 0;
  372. SubInt( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? ~0 : 0;
  373. SubInt( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? ~0 : 0;
  374. return retVal;
  375. }
  376. ///\name Functions which perform comparisons, resulting in a float value of either 0.0 or 1.0 (as opposed to resulting in a 32-bit integer mask ).
  377. ///@{
  378. FORCEINLINE fltx4 Cmp01EqSIMD( const fltx4 & a, const fltx4 & b ) // (a==b) ? 1.0:0
  379. {
  380. fltx4 retVal;
  381. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) == SubFloat( b, 0 )) ? 1.0 : 0;
  382. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) == SubFloat( b, 1 )) ? 1.0 : 0;
  383. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) == SubFloat( b, 2 )) ? 1.0 : 0;
  384. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) == SubFloat( b, 3 )) ? 1.0 : 0;
  385. return retVal;
  386. }
  387. FORCEINLINE fltx4 Cmp01GtSIMD( const fltx4 & a, const fltx4 & b ) // (a>b) ? 1.0:0
  388. {
  389. fltx4 retVal;
  390. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) > SubFloat( b, 0 )) ? 1.0 : 0;
  391. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) > SubFloat( b, 1 )) ? 1.0 : 0;
  392. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) > SubFloat( b, 2 )) ? 1.0 : 0;
  393. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) > SubFloat( b, 3 )) ? 1.0 : 0;
  394. return retVal;
  395. }
  396. FORCEINLINE fltx4 Cmp01GeSIMD( const fltx4 & a, const fltx4 & b ) // (a>=b) ? 1.0:0
  397. {
  398. fltx4 retVal;
  399. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) >= SubFloat( b, 0 )) ? 1.0 : 0;
  400. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) >= SubFloat( b, 1 )) ? 1.0 : 0;
  401. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) >= SubFloat( b, 2 )) ? 1.0 : 0;
  402. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) >= SubFloat( b, 3 )) ? 1.0 : 0;
  403. return retVal;
  404. }
  405. FORCEINLINE fltx4 Cmp01LtSIMD( const fltx4 & a, const fltx4 & b ) // (a<b) ? 1.0:0
  406. {
  407. fltx4 retVal;
  408. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) < SubFloat( b, 0 )) ? 1.0 : 0;
  409. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) < SubFloat( b, 1 )) ? 1.0 : 0;
  410. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) < SubFloat( b, 2 )) ? 1.0 : 0;
  411. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) < SubFloat( b, 3 )) ? 1.0 : 0;
  412. return retVal;
  413. }
  414. FORCEINLINE fltx4 Cmp01LeSIMD( const fltx4 & a, const fltx4 & b ) // (a<=b) ? 1.0:0
  415. {
  416. fltx4 retVal;
  417. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 )) ? 1.0 : 0;
  418. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 )) ? 1.0 : 0;
  419. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 )) ? 1.0 : 0;
  420. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 )) ? 1.0 : 0;
  421. return retVal;
  422. }
  423. FORCEINLINE fltx4 Cmp01InBoundsSIMD( const fltx4 & a, const fltx4 & b ) // (a <= b && a >= -b) ? 1.0 : 0
  424. {
  425. fltx4 retVal;
  426. SubFloat( retVal, 0 ) = ( SubFloat( a, 0 ) <= SubFloat( b, 0 ) && SubFloat( a, 0 ) >= -SubFloat( b, 0 ) ) ? 1.0 : 0;
  427. SubFloat( retVal, 1 ) = ( SubFloat( a, 1 ) <= SubFloat( b, 1 ) && SubFloat( a, 1 ) >= -SubFloat( b, 1 ) ) ? 1.0 : 0;
  428. SubFloat( retVal, 2 ) = ( SubFloat( a, 2 ) <= SubFloat( b, 2 ) && SubFloat( a, 2 ) >= -SubFloat( b, 2 ) ) ? 1.0 : 0;
  429. SubFloat( retVal, 3 ) = ( SubFloat( a, 3 ) <= SubFloat( b, 3 ) && SubFloat( a, 3 ) >= -SubFloat( b, 3 ) ) ? 1.0 : 0;
  430. return retVal;
  431. }
  432. //@}
  433. // simd conditional. for example, a simd version of "( x > 0 ) ? a : b" would be expressed as
  434. // "MaskedAssign( CmpGtSIMD( x, Four_Zeros ), a, b )". A typical use is to conditionally update
  435. // subfiles of a fltx4 based upon some test.
  436. FORCEINLINE fltx4 MaskedAssign( const fltx4 & ReplacementMask, const fltx4 & NewValue, const fltx4 & OldValue )
  437. {
  438. return OrSIMD(
  439. AndSIMD( ReplacementMask, NewValue ),
  440. AndNotSIMD( ReplacementMask, OldValue ) );
  441. }
  442. FORCEINLINE fltx4 ReplicateX4( float flValue ) // a,a,a,a
  443. {
  444. fltx4 retVal;
  445. SubFloat( retVal, 0 ) = flValue;
  446. SubFloat( retVal, 1 ) = flValue;
  447. SubFloat( retVal, 2 ) = flValue;
  448. SubFloat( retVal, 3 ) = flValue;
  449. return retVal;
  450. }
  451. /// replicate a single 32 bit integer value to all 4 components of an m128
  452. FORCEINLINE fltx4 ReplicateIX4( int nValue )
  453. {
  454. fltx4 retVal;
  455. SubInt( retVal, 0 ) = nValue;
  456. SubInt( retVal, 1 ) = nValue;
  457. SubInt( retVal, 2 ) = nValue;
  458. SubInt( retVal, 3 ) = nValue;
  459. return retVal;
  460. }
  461. // Round towards positive infinity
  462. FORCEINLINE fltx4 CeilSIMD( const fltx4 &a )
  463. {
  464. fltx4 retVal;
  465. SubFloat( retVal, 0 ) = ceil( SubFloat( a, 0 ) );
  466. SubFloat( retVal, 1 ) = ceil( SubFloat( a, 1 ) );
  467. SubFloat( retVal, 2 ) = ceil( SubFloat( a, 2 ) );
  468. SubFloat( retVal, 3 ) = ceil( SubFloat( a, 3 ) );
  469. return retVal;
  470. }
  471. // Round towards negative infinity
  472. FORCEINLINE fltx4 FloorSIMD( const fltx4 &a )
  473. {
  474. fltx4 retVal;
  475. SubFloat( retVal, 0 ) = floor( SubFloat( a, 0 ) );
  476. SubFloat( retVal, 1 ) = floor( SubFloat( a, 1 ) );
  477. SubFloat( retVal, 2 ) = floor( SubFloat( a, 2 ) );
  478. SubFloat( retVal, 3 ) = floor( SubFloat( a, 3 ) );
  479. return retVal;
  480. }
  481. FORCEINLINE fltx4 SqrtEstSIMD( const fltx4 & a ) // sqrt(a), more or less
  482. {
  483. fltx4 retVal;
  484. SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
  485. SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
  486. SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
  487. SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
  488. return retVal;
  489. }
  490. FORCEINLINE fltx4 SqrtSIMD( const fltx4 & a ) // sqrt(a)
  491. {
  492. fltx4 retVal;
  493. SubFloat( retVal, 0 ) = sqrt( SubFloat( a, 0 ) );
  494. SubFloat( retVal, 1 ) = sqrt( SubFloat( a, 1 ) );
  495. SubFloat( retVal, 2 ) = sqrt( SubFloat( a, 2 ) );
  496. SubFloat( retVal, 3 ) = sqrt( SubFloat( a, 3 ) );
  497. return retVal;
  498. }
  499. FORCEINLINE fltx4 ReciprocalSqrtEstSIMD( const fltx4 & a ) // 1/sqrt(a), more or less
  500. {
  501. fltx4 retVal;
  502. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
  503. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
  504. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
  505. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
  506. return retVal;
  507. }
  508. FORCEINLINE fltx4 ReciprocalSqrtEstSaturateSIMD( const fltx4 & a )
  509. {
  510. fltx4 retVal;
  511. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) != 0.0f ? SubFloat( a, 0 ) : FLT_EPSILON );
  512. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) != 0.0f ? SubFloat( a, 1 ) : FLT_EPSILON );
  513. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) != 0.0f ? SubFloat( a, 2 ) : FLT_EPSILON );
  514. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) != 0.0f ? SubFloat( a, 3 ) : FLT_EPSILON );
  515. return retVal;
  516. }
  517. FORCEINLINE fltx4 ReciprocalSqrtSIMD( const fltx4 & a ) // 1/sqrt(a)
  518. {
  519. fltx4 retVal;
  520. SubFloat( retVal, 0 ) = 1.0 / sqrt( SubFloat( a, 0 ) );
  521. SubFloat( retVal, 1 ) = 1.0 / sqrt( SubFloat( a, 1 ) );
  522. SubFloat( retVal, 2 ) = 1.0 / sqrt( SubFloat( a, 2 ) );
  523. SubFloat( retVal, 3 ) = 1.0 / sqrt( SubFloat( a, 3 ) );
  524. return retVal;
  525. }
  526. FORCEINLINE fltx4 ReciprocalEstSIMD( const fltx4 & a ) // 1/a, more or less
  527. {
  528. fltx4 retVal;
  529. SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
  530. SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
  531. SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
  532. SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
  533. return retVal;
  534. }
  535. FORCEINLINE fltx4 ReciprocalSIMD( const fltx4 & a ) // 1/a
  536. {
  537. fltx4 retVal;
  538. SubFloat( retVal, 0 ) = 1.0 / SubFloat( a, 0 );
  539. SubFloat( retVal, 1 ) = 1.0 / SubFloat( a, 1 );
  540. SubFloat( retVal, 2 ) = 1.0 / SubFloat( a, 2 );
  541. SubFloat( retVal, 3 ) = 1.0 / SubFloat( a, 3 );
  542. return retVal;
  543. }
  544. /// 1/x for all 4 values.
  545. /// 1/0 will result in a big but NOT infinite result
  546. FORCEINLINE fltx4 ReciprocalEstSaturateSIMD( const fltx4 & a )
  547. {
  548. fltx4 retVal;
  549. SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
  550. SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
  551. SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
  552. SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
  553. return retVal;
  554. }
  555. FORCEINLINE fltx4 ReciprocalSaturateSIMD( const fltx4 & a )
  556. {
  557. fltx4 retVal;
  558. SubFloat( retVal, 0 ) = 1.0 / (SubFloat( a, 0 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 0 ));
  559. SubFloat( retVal, 1 ) = 1.0 / (SubFloat( a, 1 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 1 ));
  560. SubFloat( retVal, 2 ) = 1.0 / (SubFloat( a, 2 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 2 ));
  561. SubFloat( retVal, 3 ) = 1.0 / (SubFloat( a, 3 ) == 0.0f ? FLT_EPSILON : SubFloat( a, 3 ));
  562. return retVal;
  563. }
  564. /// 2^x for all values (the antilog)
  565. FORCEINLINE fltx4 PowerOfTwoSIMD( const fltx4 &toPower )
  566. {
  567. fltx4 retVal;
  568. SubFloat( retVal, 0 ) = powf( 2, SubFloat(toPower, 0) );
  569. SubFloat( retVal, 1 ) = powf( 2, SubFloat(toPower, 1) );
  570. SubFloat( retVal, 2 ) = powf( 2, SubFloat(toPower, 2) );
  571. SubFloat( retVal, 3 ) = powf( 2, SubFloat(toPower, 3) );
  572. return retVal;
  573. }
  574. /// horizontal 3d dotproduct
  575. FORCEINLINE fltx4 Dot3SIMD( const fltx4 &a, const fltx4 &b )
  576. {
  577. float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
  578. SubFloat( a, 1 ) * SubFloat( b, 1 ) +
  579. SubFloat( a, 2 ) * SubFloat( b, 2 );
  580. return ReplicateX4( flDot );
  581. }
  582. /// horizontal 4d dotproduct
  583. FORCEINLINE fltx4 Dot4SIMD( const fltx4 &a, const fltx4 &b )
  584. {
  585. float flDot = SubFloat( a, 0 ) * SubFloat( b, 0 ) +
  586. SubFloat( a, 1 ) * SubFloat( b, 1 ) +
  587. SubFloat( a, 2 ) * SubFloat( b, 2 ) +
  588. SubFloat( a, 3 ) * SubFloat( b, 3 );
  589. return ReplicateX4( flDot );
  590. }
  591. /// Clamps the components of a vector to a specified minimum and maximum range.
  592. FORCEINLINE fltx4 ClampVectorSIMD( FLTX4 in, FLTX4 min, FLTX4 max)
  593. {
  594. return MaxSIMD( min, MinSIMD( max, in ) );
  595. }
  596. /// Squelch the w component of a vector to +0.0. Most efficient when you say a = SetWToZeroSIMD(a)
  597. /// (avoids a copy)
  598. FORCEINLINE fltx4 SetWToZeroSIMD( const fltx4 & a )
  599. {
  600. fltx4 retval;
  601. retval = a;
  602. SubFloat( retval, 0 ) = 0;
  603. return retval;
  604. }
  605. FORCEINLINE fltx4 LoadUnalignedSIMD( const void *pSIMD )
  606. {
  607. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  608. }
  609. FORCEINLINE fltx4 LoadUnaligned3SIMD( const void *pSIMD )
  610. {
  611. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  612. }
  613. /// load a single unaligned float into the x component of a SIMD word
  614. FORCEINLINE fltx4 LoadUnalignedFloatSIMD( const float *pFlt )
  615. {
  616. fltx4 retval;
  617. SubFloat( retval, 0 ) = *pFlt;
  618. return retval;
  619. }
  620. FORCEINLINE fltx4 LoadAlignedSIMD( const void *pSIMD )
  621. {
  622. return *( reinterpret_cast< const fltx4 *> ( pSIMD ) );
  623. }
  624. /// for the transitional class -- load a 3-by VectorAligned and squash its w component
  625. FORCEINLINE fltx4 LoadAlignedSIMD( const VectorAligned & pSIMD )
  626. {
  627. fltx4 retval = LoadAlignedSIMD(pSIMD.Base());
  628. // squelch w
  629. SubInt( retval, 3 ) = 0;
  630. return retval;
  631. }
  632. FORCEINLINE void StoreAlignedSIMD( float *pSIMD, const fltx4 & a )
  633. {
  634. *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
  635. }
  636. FORCEINLINE void StoreUnalignedSIMD( float *pSIMD, const fltx4 & a )
  637. {
  638. *( reinterpret_cast< fltx4 *> ( pSIMD ) ) = a;
  639. }
  640. FORCEINLINE void StoreUnaligned3SIMD( float *pSIMD, const fltx4 & a )
  641. {
  642. *pSIMD = SubFloat(a, 0);
  643. *(pSIMD+1) = SubFloat(a, 1);
  644. *(pSIMD+2) = SubFloat(a, 2);
  645. }
  646. /// strongly typed -- syntactic castor oil used for typechecking as we transition to SIMD
  647. FORCEINLINE void StoreAligned3SIMD( VectorAligned * RESTRICT pSIMD, const fltx4 & a )
  648. {
  649. StoreAlignedSIMD(pSIMD->Base(),a);
  650. }
  651. /// Store the x,y,z components of the four FLTX4 parameters
  652. // into the four consecutive Vectors pDestination[0], pDestination[1], pDestination[2],
  653. // pDestination[3] The Vectors are assumed
  654. /// to be unaligned.
  655. FORCEINLINE void StoreFourUnalignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  656. Vector * const pDestination )
  657. {
  658. StoreUnaligned3SIMD( pDestination->Base(), a );
  659. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  660. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  661. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  662. }
  663. // Store the x,y,z components of the four FLTX4 parameters
  664. // into the four consecutive Vectors:
  665. // pDestination , pDestination + 1, pDestination + 2, pDestination + 3
  666. // The Vectors are assumed to start on an ALIGNED address, that is,
  667. // pDestination is 16-byte aligned (thhough obviously pDestination+1 is not).
  668. FORCEINLINE void StoreFourAlignedVector3SIMD( fltx4 a, fltx4 b, fltx4 c, FLTX4 d, // first three passed by copy (deliberate)
  669. Vector * const pDestination )
  670. {
  671. StoreUnaligned3SIMD( pDestination->Base(), a );
  672. StoreUnaligned3SIMD( (pDestination+1)->Base(), b );
  673. StoreUnaligned3SIMD( (pDestination+2)->Base(), c );
  674. StoreUnaligned3SIMD( (pDestination+3)->Base(), d );
  675. }
  676. FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
  677. {
  678. #define SWAP_FLOATS( _a_, _ia_, _b_, _ib_ ) \
  679. { \
  680. float tmp = SubFloat( _a_, _ia_ ); \
  681. SubFloat( _a_, _ia_ ) = SubFloat( _b_, _ib_ ); \
  682. SubFloat( _b_, _ib_ ) = tmp; \
  683. }
  684. SWAP_FLOATS( x, 1, y, 0 );
  685. SWAP_FLOATS( x, 2, z, 0 );
  686. SWAP_FLOATS( x, 3, w, 0 );
  687. SWAP_FLOATS( y, 2, z, 1 );
  688. SWAP_FLOATS( y, 3, w, 1 );
  689. SWAP_FLOATS( z, 3, w, 2 );
  690. }
  691. /// find the lowest component of a.x, a.y, a.z, and replicate it to the whole return value.
  692. FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
  693. {
  694. float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
  695. return ReplicateX4(lowest);
  696. }
  697. /// find the highest component of a.x, a.y, a.z, and replicate it to the whole return value.
  698. FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
  699. {
  700. float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
  701. return ReplicateX4(highest);
  702. }
  703. /// Fixed-point conversion and save as SIGNED INTS. pDest->x = Int (vSrc.x) note: some
  704. /// architectures have means of doing fixed point conversion when the fix depth is specified as an
  705. /// immediate.. but there is no way to guarantee an immediate as a parameter to function like this.
  706. FORCEINLINE void ConvertStoreAsIntsSIMD(intx4 * RESTRICT pDest, const fltx4 &vSrc)
  707. {
  708. (*pDest)[0] = SubFloat(vSrc, 0);
  709. (*pDest)[1] = SubFloat(vSrc, 1);
  710. (*pDest)[2] = SubFloat(vSrc, 2);
  711. (*pDest)[3] = SubFloat(vSrc, 3);
  712. }
  713. ///@group INTEGER SIMD OPERATIONS {
  714. /// splat all components of a vector to a signed immediate int number.
  715. FORCEINLINE fltx4 IntSetImmediateSIMD( int nValue )
  716. {
  717. fltx4 retval;
  718. SubInt( retval, 0 ) = SubInt( retval, 1 ) = SubInt( retval, 2 ) = SubInt( retval, 3) = nValue;
  719. return retval;
  720. }
  721. /// Load 4 aligned words into a SIMD register
  722. FORCEINLINE i32x4 LoadAlignedIntSIMD(const void * RESTRICT pSIMD)
  723. {
  724. return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
  725. }
  726. /// Load 4 unaligned words into a SIMD register
  727. FORCEINLINE i32x4 LoadUnalignedIntSIMD( const void * RESTRICT pSIMD)
  728. {
  729. return *( reinterpret_cast< const i32x4 *> ( pSIMD ) );
  730. }
  731. /// save into four words, 16-byte aligned
  732. FORCEINLINE void StoreAlignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  733. {
  734. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  735. }
  736. FORCEINLINE void StoreAlignedIntSIMD( intx4 &pSIMD, const fltx4 & a )
  737. {
  738. *( reinterpret_cast< i32x4 *> ( pSIMD.Base() ) ) = a;
  739. }
  740. FORCEINLINE void StoreUnalignedIntSIMD( int32 *pSIMD, const fltx4 & a )
  741. {
  742. *( reinterpret_cast< i32x4 *> ( pSIMD ) ) = a;
  743. }
  744. /// Load four consecutive uint16's, and turn them into floating point numbers. This function isn't
  745. /// especially fast and could be made faster if anyone is using it heavily.
  746. FORCEINLINE fltx4 LoadAndConvertUint16SIMD( const uint16 *pInts )
  747. {
  748. fltx4 retval;
  749. SubFloat( retval, 0 ) = pInts[0];
  750. SubFloat( retval, 1 ) = pInts[1];
  751. SubFloat( retval, 2 ) = pInts[2];
  752. SubFloat( retval, 3 ) = pInts[3];
  753. }
  754. /// Take a fltx4 containing fixed-point uints and return them as single precision floats. No fixed
  755. /// point conversion is done.
  756. FORCEINLINE fltx4 UnsignedIntConvertToFltSIMD( const u32x4 &vSrcA )
  757. {
  758. Assert(0); /* pc has no such operation */
  759. fltx4 retval;
  760. SubFloat( retval, 0 ) = ( (float) SubInt( vSrcA, 0 ) );
  761. SubFloat( retval, 1 ) = ( (float) SubInt( vSrcA, 1 ) );
  762. SubFloat( retval, 2 ) = ( (float) SubInt( vSrcA, 2 ) );
  763. SubFloat( retval, 3 ) = ( (float) SubInt( vSrcA, 3 ) );
  764. return retval;
  765. }
  766. #if 0 /* pc has no such op */
  767. // Take a fltx4 containing fixed-point sints and
  768. // return them as single precision floats. No
  769. // fixed point conversion is done.
  770. FORCEINLINE fltx4 SignedIntConvertToFltSIMD( const i32x4 &vSrcA )
  771. {
  772. fltx4 retval;
  773. SubFloat( retval, 0 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[0])) );
  774. SubFloat( retval, 1 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[1])) );
  775. SubFloat( retval, 2 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[2])) );
  776. SubFloat( retval, 3 ) = ( (float) (reinterpret_cast<int32 *>(&vSrcA.m128_s32[3])) );
  777. return retval;
  778. }
  779. /// works on fltx4's as if they are four uints. the first parameter contains the words to be
  780. /// shifted, the second contains the amount to shift by AS INTS
  781. ///
  782. /// for i = 0 to 3
  783. /// shift = vSrcB_i*32:(i*32)+4
  784. /// vReturned_i*32:(i*32)+31 = vSrcA_i*32:(i*32)+31 << shift
  785. FORCEINLINE i32x4 IntShiftLeftWordSIMD(const i32x4 &vSrcA, const i32x4 &vSrcB)
  786. {
  787. i32x4 retval;
  788. SubInt(retval, 0) = SubInt(vSrcA, 0) << SubInt(vSrcB, 0);
  789. SubInt(retval, 1) = SubInt(vSrcA, 1) << SubInt(vSrcB, 1);
  790. SubInt(retval, 2) = SubInt(vSrcA, 2) << SubInt(vSrcB, 2);
  791. SubInt(retval, 3) = SubInt(vSrcA, 3) << SubInt(vSrcB, 3);
  792. return retval;
  793. }
  794. //@}