Team Fortress 2 Source Code as of 22/4/2020

  1. //========= Copyright Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose: SSE Math primitives.
  4. //
  5. //=====================================================================================//
  6. #include <math.h>
  7. #include <float.h> // Needed for FLT_EPSILON
  8. #include "basetypes.h"
  9. #include <memory.h>
  10. #include "tier0/dbg.h"
  11. #include "mathlib/mathlib.h"
  12. #include "mathlib/vector.h"
  13. #include "sse.h"
  14. // memdbgon must be the last include file in a .cpp file!!!
  15. #include "tier0/memdbgon.h"
  16. #ifndef COMPILER_MSVC64
  17. // Implement for 64-bit Windows if needed.
  18. static const uint32 _sincos_masks[] = { (uint32)0x0, (uint32)~0x0 };
  19. static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
  20. //-----------------------------------------------------------------------------
  21. // Macros and constants required by some of the SSE assembly:
  22. //-----------------------------------------------------------------------------
  23. #ifdef _WIN32
  24. #define _PS_EXTERN_CONST(Name, Val) \
  25. const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
  26. #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
  27. const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val };
  28. #define _EPI32_CONST(Name, Val) \
  29. static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
  30. #define _PS_CONST(Name, Val) \
  31. static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
  32. #elif POSIX
  33. #define _PS_EXTERN_CONST(Name, Val) \
  34. const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
  35. #define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
  36. const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val };
  37. #define _EPI32_CONST(Name, Val) \
  38. static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
  39. #define _PS_CONST(Name, Val) \
  40. static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
  41. #endif
  42. _PS_EXTERN_CONST(am_0, 0.0f);
  43. _PS_EXTERN_CONST(am_1, 1.0f);
  44. _PS_EXTERN_CONST(am_m1, -1.0f);
  45. _PS_EXTERN_CONST(am_0p5, 0.5f);
  46. _PS_EXTERN_CONST(am_1p5, 1.5f);
  47. _PS_EXTERN_CONST(am_pi, (float)M_PI);
  48. _PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
  49. _PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
  50. _PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
  51. _PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
  52. _PS_EXTERN_CONST_TYPE(am_sign_mask, uint32, 0x80000000);
  53. _PS_EXTERN_CONST_TYPE(am_inv_sign_mask, uint32, ~0x80000000);
  54. _PS_EXTERN_CONST_TYPE(am_min_norm_pos,uint32, 0x00800000);
  55. _PS_EXTERN_CONST_TYPE(am_mant_mask, uint32, 0x7f800000);
  56. _PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
  57. _EPI32_CONST(1, 1);
  58. _EPI32_CONST(2, 2);
  59. _PS_CONST(sincos_p0, 0.15707963267948963959e1f);
  60. _PS_CONST(sincos_p1, -0.64596409750621907082e0f);
  61. _PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
  62. _PS_CONST(sincos_p3, -0.468175413106023168e-2f);
  63. #ifdef PFN_VECTORMA
  64. void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
  65. #endif
  66. //-----------------------------------------------------------------------------
  67. // SSE implementations of optimized routines:
  68. //-----------------------------------------------------------------------------
  69. float _SSE_Sqrt(float x)
  70. {
  71. Assert( s_bMathlibInitialized );
  72. float root = 0.f;
  73. #ifdef _WIN32
  74. _asm
  75. {
  76. sqrtss xmm0, x
  77. movss root, xmm0
  78. }
  79. #elif POSIX
  80. _mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
  81. #endif
  82. return root;
  83. }
  84. // Single-iteration Newton-Raphson reciprocal square root:
  85. // 0.5 * rsqrtps(x) * (3 - x * rsqrtps(x) * rsqrtps(x))
  86. // Very low error, and fine to use in place of 1.f / sqrtf(x). (A scalar sketch of this refinement follows the implementations below.)
  87. #if 0
  88. float _SSE_RSqrtAccurate(float x)
  89. {
  90. Assert( s_bMathlibInitialized );
  91. float rroot;
  92. _asm
  93. {
  94. rsqrtss xmm0, x
  95. movss rroot, xmm0
  96. }
  97. return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
  98. }
  99. #else
  100. #ifdef POSIX
  101. const __m128 f3 = _mm_set_ss(3.0f); // 3 as SSE value
  102. const __m128 f05 = _mm_set_ss(0.5f); // 0.5 as SSE value
  103. #endif
  104. // Intel / Kipps SSE RSqrt. Significantly faster than above.
  105. float _SSE_RSqrtAccurate(float a)
  106. {
  107. #ifdef _WIN32
  108. float x;
  109. float half = 0.5f;
  110. float three = 3.f;
  111. __asm
  112. {
  113. movss xmm3, a;
  114. movss xmm1, half;
  115. movss xmm2, three;
  116. rsqrtss xmm0, xmm3;
  117. mulss xmm3, xmm0;
  118. mulss xmm1, xmm0;
  119. mulss xmm3, xmm0;
  120. subss xmm2, xmm3;
  121. mulss xmm1, xmm2;
  122. movss x, xmm1;
  123. }
  124. return x;
  125. #elif POSIX
  126. __m128 xx = _mm_load_ss( &a );
  127. __m128 xr = _mm_rsqrt_ss( xx );
  128. __m128 xt;
  129. xt = _mm_mul_ss( xr, xr );
  130. xt = _mm_mul_ss( xt, xx );
  131. xt = _mm_sub_ss( f3, xt );
  132. xt = _mm_mul_ss( xt, f05 );
  133. xr = _mm_mul_ss( xr, xt );
  134. _mm_store_ss( &a, xr );
  135. return a;
  136. #else
  137. #error "Not Implemented"
  138. #endif
  139. }
  140. #endif
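// Reference sketch (not part of the original file): the Newton-Raphson
// refinement described above, written out in plain scalar form. 'e0' stands in
// for the rough rsqrtss estimate; one step of e1 = 0.5*e0*(3 - x*e0*e0) roughly
// doubles the number of correct bits. Illustrative only, kept disabled.
#if 0
static float ScalarRSqrtOneNewtonStep( float x, float e0 )
{
    // One Newton-Raphson step for f(e) = 1/(e*e) - x, solved for e:
    return 0.5f * e0 * ( 3.0f - x * e0 * e0 );
}
#endif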
  141. // Simple SSE rsqrt. Usually accurate to around 6 (relative) decimal places
  142. // or so, so OK for closed transforms (i.e., computing lighting normals). A brief comparison with the accurate variant follows this function.
  143. float _SSE_RSqrtFast(float x)
  144. {
  145. Assert( s_bMathlibInitialized );
  146. float rroot;
  147. #ifdef _WIN32
  148. _asm
  149. {
  150. rsqrtss xmm0, x
  151. movss rroot, xmm0
  152. }
  153. #elif POSIX
  154. __asm__ __volatile__( "rsqrtss %0, %1" : "=x" (rroot) : "x" (x) );
  155. #else
  156. #error "Not Implemented"
  157. #endif
  158. return rroot;
  159. }
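// Usage sketch (not part of the original file): how a caller might weigh
// _SSE_RSqrtFast against _SSE_RSqrtAccurate. The fast variant is the raw
// rsqrtss estimate; the accurate variant adds the Newton-Raphson step above.
#if 0
static void RSqrtUsageExample()
{
    const float x = 2.0f;
    float fast     = _SSE_RSqrtFast( x );      // rough estimate, cheapest
    float accurate = _SSE_RSqrtAccurate( x );  // one refinement step, near full float precision
    float exact    = 1.0f / sqrtf( x );        // libm reference
    (void)fast; (void)accurate; (void)exact;
}
#endif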
  160. float FASTCALL _SSE_VectorNormalize (Vector& vec)
  161. {
  162. Assert( s_bMathlibInitialized );
  163. // NOTE: This is necessary to prevent a memory overwrite...
  164. // since vec only has 3 floats, we can't "movaps" directly into it.
  165. #ifdef _WIN32
  166. __declspec(align(16)) float result[4];
  167. #elif POSIX
  168. float result[4] __attribute__((aligned(16)));
  169. #endif
  170. float *v = &vec[0];
  171. #ifdef _WIN32
  172. float *r = &result[0];
  173. #endif
  174. float radius = 0.f;
  175. // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
  176. // be much of a performance win, considering you will very likely miss 3 branch predictions in a row.
  177. if ( v[0] || v[1] || v[2] )
  178. {
  179. #ifdef _WIN32
  180. _asm
  181. {
  182. mov eax, v
  183. mov edx, r
  184. #ifdef ALIGNED_VECTOR
  185. movaps xmm4, [eax] // r4 = vx, vy, vz, X
  186. movaps xmm1, xmm4 // r1 = r4
  187. #else
  188. movups xmm4, [eax] // r4 = vx, vy, vz, X
  189. movaps xmm1, xmm4 // r1 = r4
  190. #endif
  191. mulps xmm1, xmm4 // r1 = vx * vx, vy * vy, vz * vz, X
  192. movhlps xmm3, xmm1 // r3 = vz * vz, X, X, X
  193. movaps xmm2, xmm1 // r2 = r1
  194. shufps xmm2, xmm2, 1 // r2 = vy * vy, X, X, X
  195. addss xmm1, xmm2 // r1 = (vx * vx) + (vy * vy), X, X, X
  196. addss xmm1, xmm3 // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
  197. sqrtss xmm1, xmm1 // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
  198. movss radius, xmm1 // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
  199. rcpss xmm1, xmm1 // r1 = 1/radius, X, X, X
  200. shufps xmm1, xmm1, 0 // r1 = 1/radius, 1/radius, 1/radius, X
  201. mulps xmm4, xmm1 // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
  202. movaps [edx], xmm4 // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
  203. }
  204. #elif POSIX
  205. __asm__ __volatile__(
  206. #ifdef ALIGNED_VECTOR
  207. "movaps %2, %%xmm4 \n\t"
  208. "movaps %%xmm4, %%xmm1 \n\t"
  209. #else
  210. "movups %2, %%xmm4 \n\t"
  211. "movaps %%xmm4, %%xmm1 \n\t"
  212. #endif
  213. "mulps %%xmm4, %%xmm1 \n\t"
  214. "movhlps %%xmm1, %%xmm3 \n\t"
  215. "movaps %%xmm1, %%xmm2 \n\t"
  216. "shufps $1, %%xmm2, %%xmm2 \n\t"
  217. "addss %%xmm2, %%xmm1 \n\t"
  218. "addss %%xmm3, %%xmm1 \n\t"
  219. "sqrtss %%xmm1, %%xmm1 \n\t"
  220. "movss %%xmm1, %0 \n\t"
  221. "rcpss %%xmm1, %%xmm1 \n\t"
  222. "shufps $0, %%xmm1, %%xmm1 \n\t"
  223. "mulps %%xmm1, %%xmm4 \n\t"
  224. "movaps %%xmm4, %1 \n\t"
  225. : "=m" (radius), "=m" (result)
  226. : "m" (*v)
  227. : "xmm1", "xmm2", "xmm3", "xmm4"
  228. );
  229. #else
  230. #error "Not Implemented"
  231. #endif
  232. vec.x = result[0];
  233. vec.y = result[1];
  234. vec.z = result[2];
  235. }
  236. return radius;
  237. }
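// Reference sketch (not part of the original file): the scalar equivalent of
// _SSE_VectorNormalize above. Note the asm uses rcpss to form 1/radius, which
// is only an approximate reciprocal, so its result can differ from this
// reference in the low bits.
#if 0
static float ScalarVectorNormalizeReference( Vector& vec )
{
    float radius = sqrtf( vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
    if ( radius != 0.0f )
    {
        float invRadius = 1.0f / radius;
        vec.x *= invRadius;
        vec.y *= invRadius;
        vec.z *= invRadius;
    }
    return radius;
}
#endif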
  238. void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
  239. {
  240. float ool = _SSE_RSqrtAccurate( FLT_EPSILON + vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
  241. vec.x *= ool;
  242. vec.y *= ool;
  243. vec.z *= ool;
  244. }
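// Usage note (not part of the original file): the FLT_EPSILON bias above keeps
// the rsqrt argument strictly positive, so a zero vector yields a huge but
// finite scale instead of a division by zero. Minimal call sketch, assuming
// Vector's (x, y, z) constructor:
#if 0
static void VectorNormalizeFastUsage()
{
    Vector v( 3.0f, 4.0f, 0.0f );
    _SSE_VectorNormalizeFast( v );   // v is now approximately (0.6, 0.8, 0.0)
}
#endif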
  245. float _SSE_InvRSquared(const float* v)
  246. {
  247. float inv_r2 = 1.f;
  248. #ifdef _WIN32
  249. _asm { // Intel SSE only routine
  250. mov eax, v
  251. movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0
  252. #ifdef ALIGNED_VECTOR
  253. movaps xmm4, [eax] // x4 = vx, vy, vz, X
  254. #else
  255. movups xmm4, [eax] // x4 = vx, vy, vz, X
  256. #endif
  257. movaps xmm1, xmm4 // x1 = x4
  258. mulps xmm1, xmm4 // x1 = vx * vx, vy * vy, vz * vz, X
  259. movhlps xmm3, xmm1 // x3 = vz * vz, X, X, X
  260. movaps xmm2, xmm1 // x2 = x1
  261. shufps xmm2, xmm2, 1 // x2 = vy * vy, X, X, X
  262. addss xmm1, xmm2 // x1 = (vx * vx) + (vy * vy), X, X, X
  263. addss xmm1, xmm3 // x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
  264. maxss xmm1, xmm5 // x1 = max( 1.0, x1 )
  265. rcpss xmm0, xmm1 // x0 = 1 / max( 1.0, x1 )
  266. movss inv_r2, xmm0 // inv_r2 = x0
  267. }
  268. #elif POSIX
  269. __asm__ __volatile__(
  270. "movss %0, %%xmm5 \n\t"
  271. #ifdef ALIGNED_VECTOR
  272. "movaps %1, %%xmm4 \n\t"
  273. #else
  274. "movups %1, %%xmm4 \n\t"
  275. #endif
  276. "movaps %%xmm4, %%xmm1 \n\t"
  277. "mulps %%xmm4, %%xmm1 \n\t"
  278. "movhlps %%xmm1, %%xmm3 \n\t"
  279. "movaps %%xmm1, %%xmm2 \n\t"
  280. "shufps $1, %%xmm2, %%xmm2 \n\t"
  281. "addss %%xmm2, %%xmm1 \n\t"
  282. "addss %%xmm3, %%xmm1 \n\t"
  283. "maxss %%xmm5, %%xmm1 \n\t"
  284. "rcpss %%xmm1, %%xmm0 \n\t"
  285. "movss %%xmm0, %0 \n\t"
  286. : "+m" (inv_r2)
  287. : "m" (*v)
  288. : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  289. );
  290. #else
  291. #error "Not Implemented"
  292. #endif
  293. return inv_r2;
  294. }
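// Reference sketch (not part of the original file): what _SSE_InvRSquared
// computes, i.e. 1 / max(1, |v|^2). As above, rcpss is an approximate
// reciprocal, so the SSE result is not bit-exact against this reference.
#if 0
static float ScalarInvRSquaredReference( const float* v )
{
    float r2 = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
    return 1.0f / ( r2 > 1.0f ? r2 : 1.0f );
}
#endif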
  295. #ifdef POSIX
  296. // #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
  297. #define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
  298. _PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
  299. _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
  300. #define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
  301. _PI32_CONST(1, 1);
  302. _PI32_CONST(inv1, ~1);
  303. _PI32_CONST(2, 2);
  304. _PI32_CONST(4, 4);
  305. _PI32_CONST(0x7f, 0x7f);
  306. _PS_CONST(1 , 1.0f);
  307. _PS_CONST(0p5, 0.5f);
  308. _PS_CONST(minus_cephes_DP1, -0.78515625);
  309. _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
  310. _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
  311. _PS_CONST(sincof_p0, -1.9515295891E-4);
  312. _PS_CONST(sincof_p1, 8.3321608736E-3);
  313. _PS_CONST(sincof_p2, -1.6666654611E-1);
  314. _PS_CONST(coscof_p0, 2.443315711809948E-005);
  315. _PS_CONST(coscof_p1, -1.388731625493765E-003);
  316. _PS_CONST(coscof_p2, 4.166664568298827E-002);
  317. _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
  318. typedef union xmm_mm_union {
  319. __m128 xmm;
  320. __m64 mm[2];
  321. } xmm_mm_union;
  322. #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
  323. typedef __m128 v4sf; // vector of 4 float (sse1)
  324. typedef __m64 v2si; // vector of 2 int (mmx)
  325. #endif
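// Reference sketch (not part of the original file): a plain scalar outline of
// the cephes-style scheme the SSE paths below follow -- scale by 4/pi, round
// the octant index up to even, subtract the octant in three pieces (DP1..DP3)
// for extra precision, evaluate either the sine or cosine minimax polynomial,
// then patch the sign from the octant bits. The literal constants mirror the
// _ps_* tables above. Illustrative only.
#if 0
static float ScalarCephesSinSketch( float x )
{
    float sign = ( x < 0.0f ) ? -1.0f : 1.0f;
    x = fabsf( x );
    int j = (int)( x * 1.27323954473516f );   // x * 4/pi
    j = ( j + 1 ) & ~1;                       // round up to an even octant index
    float y = (float)j;
    // "Extended precision modular arithmetic" pass:
    x = ( ( x - y * 0.78515625f ) - y * 2.4187564849853515625e-4f ) - y * 3.77489497744594108e-8f;
    float z = x * x;
    float r;
    if ( j & 2 )   // octants where the cosine polynomial applies to sin(x)
        r = 1.0f - 0.5f * z + z * z * ( 2.443315711809948e-5f * z * z
                                        - 1.388731625493765e-3f * z
                                        + 4.166664568298827e-2f );
    else           // octants where the sine polynomial applies
        r = x + x * z * ( ( -1.9515295891e-4f * z + 8.3321608736e-3f ) * z - 1.6666654611e-1f );
    if ( j & 4 )   // quadrant sign flip
        r = -r;
    return sign * r;
}
#endif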
  326. void _SSE_SinCos(float x, float* s, float* c)
  327. {
  328. #ifdef _WIN32
  329. float t4, t8, t12;
  330. __asm
  331. {
  332. movss xmm0, x
  333. movss t12, xmm0
  334. movss xmm1, _ps_am_inv_sign_mask
  335. mov eax, t12
  336. mulss xmm0, _ps_am_2_o_pi
  337. andps xmm0, xmm1
  338. and eax, 0x80000000
  339. cvttss2si edx, xmm0
  340. mov ecx, edx
  341. mov t12, esi
  342. mov esi, edx
  343. add edx, 0x1
  344. shl ecx, (31 - 1)
  345. shl edx, (31 - 1)
  346. movss xmm4, _ps_am_1
  347. cvtsi2ss xmm3, esi
  348. mov t8, eax
  349. and esi, 0x1
  350. subss xmm0, xmm3
  351. movss xmm3, _sincos_inv_masks[esi * 4]
  352. minss xmm0, xmm4
  353. subss xmm4, xmm0
  354. movss xmm6, xmm4
  355. andps xmm4, xmm3
  356. and ecx, 0x80000000
  357. movss xmm2, xmm3
  358. andnps xmm3, xmm0
  359. and edx, 0x80000000
  360. movss xmm7, t8
  361. andps xmm0, xmm2
  362. mov t8, ecx
  363. mov t4, edx
  364. orps xmm4, xmm3
  365. mov eax, s //mov eax, [esp + 4 + 16]
  366. mov edx, c //mov edx, [esp + 4 + 16 + 4]
  367. andnps xmm2, xmm6
  368. orps xmm0, xmm2
  369. movss xmm2, t8
  370. movss xmm1, xmm0
  371. movss xmm5, xmm4
  372. xorps xmm7, xmm2
  373. movss xmm3, _ps_sincos_p3
  374. mulss xmm0, xmm0
  375. mulss xmm4, xmm4
  376. movss xmm2, xmm0
  377. movss xmm6, xmm4
  378. orps xmm1, xmm7
  379. movss xmm7, _ps_sincos_p2
  380. mulss xmm0, xmm3
  381. mulss xmm4, xmm3
  382. movss xmm3, _ps_sincos_p1
  383. addss xmm0, xmm7
  384. addss xmm4, xmm7
  385. movss xmm7, _ps_sincos_p0
  386. mulss xmm0, xmm2
  387. mulss xmm4, xmm6
  388. addss xmm0, xmm3
  389. addss xmm4, xmm3
  390. movss xmm3, t4
  391. mulss xmm0, xmm2
  392. mulss xmm4, xmm6
  393. orps xmm5, xmm3
  394. mov esi, t12
  395. addss xmm0, xmm7
  396. addss xmm4, xmm7
  397. mulss xmm0, xmm1
  398. mulss xmm4, xmm5
  399. // use full stores since caller might reload with full loads
  400. movss [eax], xmm0
  401. movss [edx], xmm4
  402. }
  403. #elif POSIX
  404. Assert( "Needs testing, verify impl!\n" );
  405. v4sf xx = _mm_load_ss( &x );
  406. v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  407. v2si mm0, mm1, mm2, mm3, mm4, mm5;
  408. sign_bit_sin = xx;
  409. /* take the absolute value */
  410. xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
  411. /* extract the sign bit (upper one) */
  412. sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
  413. /* scale by 4/Pi */
  414. y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
  415. /* store the integer part of y in mm2:mm3 */
  416. xmm3 = _mm_movehl_ps(xmm3, y);
  417. mm2 = _mm_cvttps_pi32(y);
  418. mm3 = _mm_cvttps_pi32(xmm3);
  419. /* j=(j+1) & (~1) (see the cephes sources) */
  420. mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  421. mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  422. mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  423. mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  424. y = _mm_cvtpi32x2_ps(mm2, mm3);
  425. mm4 = mm2;
  426. mm5 = mm3;
  427. /* get the swap sign flag for the sine */
  428. mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  429. mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  430. mm0 = _mm_slli_pi32(mm0, 29);
  431. mm1 = _mm_slli_pi32(mm1, 29);
  432. v4sf swap_sign_bit_sin;
  433. COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
  434. /* get the polynomial selection mask for the sine */
  435. mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  436. mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  437. mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  438. mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  439. v4sf poly_mask;
  440. COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  441. /* The magic pass: "Extended precision modular arithmetic"
  442. x = ((x - y * DP1) - y * DP2) - y * DP3; */
  443. xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  444. xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  445. xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  446. xmm1 = _mm_mul_ps(y, xmm1);
  447. xmm2 = _mm_mul_ps(y, xmm2);
  448. xmm3 = _mm_mul_ps(y, xmm3);
  449. xx = _mm_add_ps(xx, xmm1);
  450. xx = _mm_add_ps(xx, xmm2);
  451. xx = _mm_add_ps(xx, xmm3);
  452. /* get the sign flag for the cosine */
  453. mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  454. mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  455. mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  456. mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  457. mm4 = _mm_slli_pi32(mm4, 29);
  458. mm5 = _mm_slli_pi32(mm5, 29);
  459. v4sf sign_bit_cos;
  460. COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  461. _mm_empty(); /* good-bye mmx */
  462. sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  463. /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  464. v4sf z = _mm_mul_ps(xx,xx);
  465. y = *(v4sf*)_ps_coscof_p0;
  466. y = _mm_mul_ps(y, z);
  467. y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  468. y = _mm_mul_ps(y, z);
  469. y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  470. y = _mm_mul_ps(y, z);
  471. y = _mm_mul_ps(y, z);
  472. v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  473. y = _mm_sub_ps(y, tmp);
  474. y = _mm_add_ps(y, *(v4sf*)_ps_1);
  475. /* Evaluate the second polynomial (-Pi/4 <= x <= 0) */
  476. v4sf y2 = *(v4sf*)_ps_sincof_p0;
  477. y2 = _mm_mul_ps(y2, z);
  478. y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  479. y2 = _mm_mul_ps(y2, z);
  480. y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  481. y2 = _mm_mul_ps(y2, z);
  482. y2 = _mm_mul_ps(y2, xx);
  483. y2 = _mm_add_ps(y2, xx);
  484. /* select the correct result from the two polynomials */
  485. xmm3 = poly_mask;
  486. v4sf ysin2 = _mm_and_ps(xmm3, y2);
  487. v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  488. y2 = _mm_sub_ps(y2,ysin2);
  489. y = _mm_sub_ps(y, ysin1);
  490. xmm1 = _mm_add_ps(ysin1,ysin2);
  491. xmm2 = _mm_add_ps(y,y2);
  492. /* update the sign */
  493. _mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
  494. _mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
  495. #else
  496. #error "Not Implemented"
  497. #endif
  498. }
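// Spot-check sketch (not part of the original file): a quick comparison of
// _SSE_SinCos against the CRT over a small range, with a tolerance loose
// enough for the single-precision polynomials used above.
#if 0
static bool SinCosSpotCheck()
{
    for ( float x = -10.0f; x <= 10.0f; x += 0.25f )
    {
        float s, c;
        _SSE_SinCos( x, &s, &c );
        if ( fabsf( s - sinf( x ) ) > 1e-4f || fabsf( c - cosf( x ) ) > 1e-4f )
            return false;
    }
    return true;
}
#endif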
  499. float _SSE_cos( float x )
  500. {
  501. #ifdef _WIN32
  502. float temp;
  503. __asm
  504. {
  505. movss xmm0, x
  506. movss xmm1, _ps_am_inv_sign_mask
  507. andps xmm0, xmm1
  508. addss xmm0, _ps_am_pi_o_2
  509. mulss xmm0, _ps_am_2_o_pi
  510. cvttss2si ecx, xmm0
  511. movss xmm5, _ps_am_1
  512. mov edx, ecx
  513. shl edx, (31 - 1)
  514. cvtsi2ss xmm1, ecx
  515. and edx, 0x80000000
  516. and ecx, 0x1
  517. subss xmm0, xmm1
  518. movss xmm6, _sincos_masks[ecx * 4]
  519. minss xmm0, xmm5
  520. movss xmm1, _ps_sincos_p3
  521. subss xmm5, xmm0
  522. andps xmm5, xmm6
  523. movss xmm7, _ps_sincos_p2
  524. andnps xmm6, xmm0
  525. mov temp, edx
  526. orps xmm5, xmm6
  527. movss xmm0, xmm5
  528. mulss xmm5, xmm5
  529. movss xmm4, _ps_sincos_p1
  530. movss xmm2, xmm5
  531. mulss xmm5, xmm1
  532. movss xmm1, _ps_sincos_p0
  533. addss xmm5, xmm7
  534. mulss xmm5, xmm2
  535. movss xmm3, temp
  536. addss xmm5, xmm4
  537. mulss xmm5, xmm2
  538. orps xmm0, xmm3
  539. addss xmm5, xmm1
  540. mulss xmm0, xmm5
  541. movss x, xmm0
  542. }
  543. #elif POSIX
  544. Assert( "Needs testing, verify impl!\n" );
  545. v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
  546. v2si mm0, mm1, mm2, mm3;
  547. /* take the absolute value */
  548. v4sf xx = _mm_load_ss( &x );
  549. xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
  550. /* scale by 4/Pi */
  551. y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
  552. /* store the integer part of y in mm0:mm1 */
  553. xmm2 = _mm_movehl_ps(xmm2, y);
  554. mm2 = _mm_cvttps_pi32(y);
  555. mm3 = _mm_cvttps_pi32(xmm2);
  556. /* j=(j+1) & (~1) (see the cephes sources) */
  557. mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  558. mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  559. mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  560. mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  561. y = _mm_cvtpi32x2_ps(mm2, mm3);
  562. mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  563. mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
  564. /* get the swap sign flag in mm0:mm1 and the
  565. polynomial selection mask in mm2:mm3 */
  566. mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  567. mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  568. mm0 = _mm_slli_pi32(mm0, 29);
  569. mm1 = _mm_slli_pi32(mm1, 29);
  570. mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  571. mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  572. mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  573. mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  574. v4sf sign_bit, poly_mask;
  575. COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  576. COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  577. _mm_empty(); /* good-bye mmx */
  578. /* The magic pass: "Extended precision modular arithmetic"
  579. x = ((x - y * DP1) - y * DP2) - y * DP3; */
  580. xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  581. xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  582. xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  583. xmm1 = _mm_mul_ps(y, xmm1);
  584. xmm2 = _mm_mul_ps(y, xmm2);
  585. xmm3 = _mm_mul_ps(y, xmm3);
  586. xx = _mm_add_ps(xx, xmm1);
  587. xx = _mm_add_ps(xx, xmm2);
  588. xx = _mm_add_ps(xx, xmm3);
  589. /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  590. y = *(v4sf*)_ps_coscof_p0;
  591. v4sf z = _mm_mul_ps(xx,xx);
  592. y = _mm_mul_ps(y, z);
  593. y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  594. y = _mm_mul_ps(y, z);
  595. y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  596. y = _mm_mul_ps(y, z);
  597. y = _mm_mul_ps(y, z);
  598. v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  599. y = _mm_sub_ps(y, tmp);
  600. y = _mm_add_ps(y, *(v4sf*)_ps_1);
  601. /* Evaluate the second polynomial (-Pi/4 <= x <= 0) */
  602. v4sf y2 = *(v4sf*)_ps_sincof_p0;
  603. y2 = _mm_mul_ps(y2, z);
  604. y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  605. y2 = _mm_mul_ps(y2, z);
  606. y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  607. y2 = _mm_mul_ps(y2, z);
  608. y2 = _mm_mul_ps(y2, xx);
  609. y2 = _mm_add_ps(y2, xx);
  610. /* select the correct result from the two polynomials */
  611. xmm3 = poly_mask;
  612. y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  613. y = _mm_andnot_ps(xmm3, y);
  614. y = _mm_add_ps(y,y2);
  615. /* update the sign */
  616. _mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
  617. #else
  618. #error "Not Implemented"
  619. #endif
  620. return x;
  621. }
  622. //-----------------------------------------------------------------------------
  623. // SSE2 implementations of optimized routines:
  624. //-----------------------------------------------------------------------------
  625. #ifdef PLATFORM_WINDOWS_PC32
  626. void _SSE2_SinCos(float x, float* s, float* c) // any x
  627. {
  628. #ifdef _WIN32
  629. __asm
  630. {
  631. movss xmm0, x
  632. movaps xmm7, xmm0
  633. movss xmm1, _ps_am_inv_sign_mask
  634. movss xmm2, _ps_am_sign_mask
  635. movss xmm3, _ps_am_2_o_pi
  636. andps xmm0, xmm1
  637. andps xmm7, xmm2
  638. mulss xmm0, xmm3
  639. pxor xmm3, xmm3
  640. movd xmm5, _epi32_1
  641. movss xmm4, _ps_am_1
  642. cvttps2dq xmm2, xmm0
  643. pand xmm5, xmm2
  644. movd xmm1, _epi32_2
  645. pcmpeqd xmm5, xmm3
  646. movd xmm3, _epi32_1
  647. cvtdq2ps xmm6, xmm2
  648. paddd xmm3, xmm2
  649. pand xmm2, xmm1
  650. pand xmm3, xmm1
  651. subss xmm0, xmm6
  652. pslld xmm2, (31 - 1)
  653. minss xmm0, xmm4
  654. mov eax, s // mov eax, [esp + 4 + 16]
  655. mov edx, c // mov edx, [esp + 4 + 16 + 4]
  656. subss xmm4, xmm0
  657. pslld xmm3, (31 - 1)
  658. movaps xmm6, xmm4
  659. xorps xmm2, xmm7
  660. movaps xmm7, xmm5
  661. andps xmm6, xmm7
  662. andnps xmm7, xmm0
  663. andps xmm0, xmm5
  664. andnps xmm5, xmm4
  665. movss xmm4, _ps_sincos_p3
  666. orps xmm6, xmm7
  667. orps xmm0, xmm5
  668. movss xmm5, _ps_sincos_p2
  669. movaps xmm1, xmm0
  670. movaps xmm7, xmm6
  671. mulss xmm0, xmm0
  672. mulss xmm6, xmm6
  673. orps xmm1, xmm2
  674. orps xmm7, xmm3
  675. movaps xmm2, xmm0
  676. movaps xmm3, xmm6
  677. mulss xmm0, xmm4
  678. mulss xmm6, xmm4
  679. movss xmm4, _ps_sincos_p1
  680. addss xmm0, xmm5
  681. addss xmm6, xmm5
  682. movss xmm5, _ps_sincos_p0
  683. mulss xmm0, xmm2
  684. mulss xmm6, xmm3
  685. addss xmm0, xmm4
  686. addss xmm6, xmm4
  687. mulss xmm0, xmm2
  688. mulss xmm6, xmm3
  689. addss xmm0, xmm5
  690. addss xmm6, xmm5
  691. mulss xmm0, xmm1
  692. mulss xmm6, xmm7
  693. // use full stores since caller might reload with full loads
  694. movss [eax], xmm0
  695. movss [edx], xmm6
  696. }
  697. #elif POSIX
  698. #warning "_SSE2_SinCos NOT implemented!"
  699. Assert( 0 );
  700. #else
  701. #error "Not Implemented"
  702. #endif
  703. }
  704. #endif // PLATFORM_WINDOWS_PC32
  705. #ifdef PLATFORM_WINDOWS_PC32
  706. float _SSE2_cos(float x)
  707. {
  708. #ifdef _WIN32
  709. __asm
  710. {
  711. movss xmm0, x
  712. movss xmm1, _ps_am_inv_sign_mask
  713. movss xmm2, _ps_am_pi_o_2
  714. movss xmm3, _ps_am_2_o_pi
  715. andps xmm0, xmm1
  716. addss xmm0, xmm2
  717. mulss xmm0, xmm3
  718. pxor xmm3, xmm3
  719. movd xmm5, _epi32_1
  720. movss xmm4, _ps_am_1
  721. cvttps2dq xmm2, xmm0
  722. pand xmm5, xmm2
  723. movd xmm1, _epi32_2
  724. pcmpeqd xmm5, xmm3
  725. cvtdq2ps xmm6, xmm2
  726. pand xmm2, xmm1
  727. pslld xmm2, (31 - 1)
  728. subss xmm0, xmm6
  729. movss xmm3, _ps_sincos_p3
  730. minss xmm0, xmm4
  731. subss xmm4, xmm0
  732. andps xmm0, xmm5
  733. andnps xmm5, xmm4
  734. orps xmm0, xmm5
  735. movaps xmm1, xmm0
  736. movss xmm4, _ps_sincos_p2
  737. mulss xmm0, xmm0
  738. movss xmm5, _ps_sincos_p1
  739. orps xmm1, xmm2
  740. movaps xmm7, xmm0
  741. mulss xmm0, xmm3
  742. movss xmm6, _ps_sincos_p0
  743. addss xmm0, xmm4
  744. mulss xmm0, xmm7
  745. addss xmm0, xmm5
  746. mulss xmm0, xmm7
  747. addss xmm0, xmm6
  748. mulss xmm0, xmm1
  749. movss x, xmm0
  750. }
  751. #elif POSIX
  752. #warning "_SSE2_cos NOT implemented!"
  753. Assert( 0 );
  754. #else
  755. #error "Not Implemented"
  756. #endif
  757. return x;
  758. }
  759. #endif // PLATFORM_WINDOWS_PC32
  760. #if 0
  761. // SSE Version of VectorTransform
  762. void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
  763. {
  764. Assert( s_bMathlibInitialized );
  765. Assert( in1 != out1 );
  766. #ifdef _WIN32
  767. __asm
  768. {
  769. mov eax, in1;
  770. mov ecx, in2;
  771. mov edx, out1;
  772. movss xmm0, [eax];
  773. mulss xmm0, [ecx];
  774. movss xmm1, [eax+4];
  775. mulss xmm1, [ecx+4];
  776. movss xmm2, [eax+8];
  777. mulss xmm2, [ecx+8];
  778. addss xmm0, xmm1;
  779. addss xmm0, xmm2;
  780. addss xmm0, [ecx+12]
  781. movss [edx], xmm0;
  782. add ecx, 16;
  783. movss xmm0, [eax];
  784. mulss xmm0, [ecx];
  785. movss xmm1, [eax+4];
  786. mulss xmm1, [ecx+4];
  787. movss xmm2, [eax+8];
  788. mulss xmm2, [ecx+8];
  789. addss xmm0, xmm1;
  790. addss xmm0, xmm2;
  791. addss xmm0, [ecx+12]
  792. movss [edx+4], xmm0;
  793. add ecx, 16;
  794. movss xmm0, [eax];
  795. mulss xmm0, [ecx];
  796. movss xmm1, [eax+4];
  797. mulss xmm1, [ecx+4];
  798. movss xmm2, [eax+8];
  799. mulss xmm2, [ecx+8];
  800. addss xmm0, xmm1;
  801. addss xmm0, xmm2;
  802. addss xmm0, [ecx+12]
  803. movss [edx+8], xmm0;
  804. }
  805. #elif POSIX
  806. #warning "VectorTransformSSE C implementation only"
  807. out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
  808. out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
  809. out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
  810. #else
  811. #error "Not Implemented"
  812. #endif
  813. }
  814. #endif
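// Sketch (not part of the original file): the same affine transform the
// disabled block above performs, written with SSE1 intrinsics instead of
// inline asm. Assumes matrix3x4_t rows are four contiguous floats (rotation
// plus translation) and that the SSE intrinsics header is available, as in
// the POSIX paths above. Kept disabled; illustrative only.
#if 0
static void VectorTransformSSESketch( const float *in1, const matrix3x4_t& in2, float *out1 )
{
    __m128 v = _mm_setr_ps( in1[0], in1[1], in1[2], 1.0f ); // w = 1 picks up the translation column
    for ( int i = 0; i < 3; ++i )
    {
        __m128 prod = _mm_mul_ps( _mm_loadu_ps( in2[i] ), v );
        // horizontal add of the four products using SSE1-level shuffles
        __m128 sum = _mm_add_ps( prod, _mm_movehl_ps( prod, prod ) ); // x+z, y+w, ...
        sum = _mm_add_ss( sum, _mm_shuffle_ps( sum, sum, 1 ) );       // (x+z) + (y+w)
        _mm_store_ss( &out1[i], sum );
    }
}
#endif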
  815. #if 0
  816. void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
  817. {
  818. Assert( s_bMathlibInitialized );
  819. Assert( in1 != out1 );
  820. #ifdef _WIN32
  821. __asm
  822. {
  823. mov eax, in1;
  824. mov ecx, in2;
  825. mov edx, out1;
  826. movss xmm0, [eax];
  827. mulss xmm0, [ecx];
  828. movss xmm1, [eax+4];
  829. mulss xmm1, [ecx+4];
  830. movss xmm2, [eax+8];
  831. mulss xmm2, [ecx+8];
  832. addss xmm0, xmm1;
  833. addss xmm0, xmm2;
  834. movss [edx], xmm0;
  835. add ecx, 16;
  836. movss xmm0, [eax];
  837. mulss xmm0, [ecx];
  838. movss xmm1, [eax+4];
  839. mulss xmm1, [ecx+4];
  840. movss xmm2, [eax+8];
  841. mulss xmm2, [ecx+8];
  842. addss xmm0, xmm1;
  843. addss xmm0, xmm2;
  844. movss [edx+4], xmm0;
  845. add ecx, 16;
  846. movss xmm0, [eax];
  847. mulss xmm0, [ecx];
  848. movss xmm1, [eax+4];
  849. mulss xmm1, [ecx+4];
  850. movss xmm2, [eax+8];
  851. mulss xmm2, [ecx+8];
  852. addss xmm0, xmm1;
  853. addss xmm0, xmm2;
  854. movss [edx+8], xmm0;
  855. }
  856. #elif POSIX
  857. #warning "VectorRotateSSE C implementation only"
  858. out1[0] = DotProduct( in1, in2[0] );
  859. out1[1] = DotProduct( in1, in2[1] );
  860. out1[2] = DotProduct( in1, in2[2] );
  861. #else
  862. #error "Not Implemented"
  863. #endif
  864. }
  865. #endif
  866. #ifdef _WIN32
  867. void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
  868. {
  869. // FIXME: This doesn't work!! It will overwrite memory in the write to dest
  870. Assert(0);
  871. Assert( s_bMathlibInitialized );
  872. _asm { // Intel SSE only routine
  873. mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
  874. mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
  875. mov edx, DWORD PTR [esp+0x10] ; *dest
  876. movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
  877. #ifdef ALIGNED_VECTOR
  878. movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
  879. pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
  880. movaps xmm1, [eax] ; x1 = start1, start2, start3, X
  881. mulps xmm3, xmm2 ; x3 *= x2
  882. addps xmm3, xmm1 ; x3 += x1
  883. movaps [edx], xmm3 ; *dest = x3
  884. #else
  885. movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
  886. pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
  887. movups xmm1, [eax] ; x1 = start1, start2, start3, X
  888. mulps xmm3, xmm2 ; x3 *= x2
  889. addps xmm3, xmm1 ; x3 += x1
  890. movups [edx], xmm3 ; *dest = x3
  891. #endif
  892. }
  893. }
  894. #endif
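// Sketch (not part of the original file): a VectorMA that sidesteps the
// 16-byte store the FIXME above warns about by writing the three result
// floats individually rather than spilling a whole register over *dest.
#if 0
static void VectorMASafeSketch( const float *start, float scale, const float *direction, float *dest )
{
    dest[0] = start[0] + scale * direction[0];
    dest[1] = start[1] + scale * direction[1];
    dest[2] = start[2] + scale * direction[2];
}
#endif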
  895. #ifdef _WIN32
  896. #ifdef PFN_VECTORMA
  897. void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
  898. {
  899. // FIXME: This doesn't work!! It will overwrite memory in the write to dest
  900. Assert(0);
  901. Assert( s_bMathlibInitialized );
  902. _asm
  903. {
  904. // Intel SSE only routine
  905. mov eax, DWORD PTR [esp+0x04] ; *start, s0..s2
  906. mov ecx, DWORD PTR [esp+0x0c] ; *direction, d0..d2
  907. mov edx, DWORD PTR [esp+0x10] ; *dest
  908. movss xmm2, [esp+0x08] ; x2 = scale, 0, 0, 0
  909. #ifdef ALIGNED_VECTOR
  910. movaps xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
  911. pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
  912. movaps xmm1, [eax] ; x1 = start1, start2, start3, X
  913. mulps xmm3, xmm2 ; x3 *= x2
  914. addps xmm3, xmm1 ; x3 += x1
  915. movaps [edx], xmm3 ; *dest = x3
  916. #else
  917. movups xmm3, [ecx] ; x3 = dir0,dir1,dir2,X
  918. pshufd xmm2, xmm2, 0 ; x2 = scale, scale, scale, scale
  919. movups xmm1, [eax] ; x1 = start1, start2, start3, X
  920. mulps xmm3, xmm2 ; x3 *= x2
  921. addps xmm3, xmm1 ; x3 += x1
  922. movups [edx], xmm3 ; *dest = x3
  923. #endif
  924. }
  925. }
  926. float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
  927. #endif
  928. #endif
  929. // SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
  930. // Should be validated too! :)
  931. // NJS: (Nov 1 2002) -NOT- faster. It may time a couple of cycles faster in a single function like
  932. // this, but when inlined and instruction-scheduled, the C version is faster.
  933. // Verified this via VTune. (A plain C equivalent follows the commented-out asm below.)
  934. /*
  935. vec_t DotProduct (const vec_t *a, const vec_t *c)
  936. {
  937. vec_t temp;
  938. __asm
  939. {
  940. mov eax, a;
  941. mov ecx, c;
  942. mov edx, DWORD PTR [temp]
  943. movss xmm0, [eax];
  944. mulss xmm0, [ecx];
  945. movss xmm1, [eax+4];
  946. mulss xmm1, [ecx+4];
  947. movss xmm2, [eax+8];
  948. mulss xmm2, [ecx+8];
  949. addss xmm0, xmm1;
  950. addss xmm0, xmm2;
  951. movss [edx], xmm0;
  952. fld DWORD PTR [edx];
  953. ret
  954. }
  955. }
  956. */
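// Sketch (not part of the original file): the plain C dot product the note
// above prefers once inlining and scheduling are taken into account.
#if 0
static vec_t DotProductC( const vec_t *a, const vec_t *c )
{
    return a[0] * c[0] + a[1] * c[1] + a[2] * c[2];
}
#endif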
  957. #endif // COMPILER_MSVC64