Counter Strike : Global Offensive Source Code

//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose: SSE Math primitives.
//
//=====================================================================================//

#include <math.h>
#include <float.h>	// needed for FLT_EPSILON
#include "basetypes.h"
#include "tier0/dbg.h"
#include "mathlib/mathlib.h"
#include "mathlib/vector.h"
#include "sse.h"

// memdbgon must be the last include file in a .cpp file!!!
#include "tier0/memdbgon.h"
static const uint32 _sincos_masks[]     = { (uint32)0x0,  (uint32)~0x0 };
static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };

//-----------------------------------------------------------------------------
// Macros and constants required by some of the SSE assembly:
//-----------------------------------------------------------------------------
#ifdef _WIN32
#define _PS_EXTERN_CONST(Name, Val) \
    const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
    const __declspec(align(16)) Type _ps_##Name[4] = { Val, Val, Val, Val }
#define _EPI32_CONST(Name, Val) \
    static const __declspec(align(16)) __int32 _epi32_##Name[4] = { Val, Val, Val, Val }
#define _PS_CONST(Name, Val) \
    static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
#elif POSIX
#define _PS_EXTERN_CONST(Name, Val) \
    const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
    const Type _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#define _EPI32_CONST(Name, Val) \
    static const int32 _epi32_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#define _PS_CONST(Name, Val) \
    static const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }
#endif
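
// For reference, a sketch of what these macros expand to (not part of the
// original source): _PS_EXTERN_CONST(am_1, 1.0f) on MSVC becomes a 16-byte
// aligned float[4] with the value broadcast to all four lanes, so it can be
// used directly as a memory operand for packed SSE instructions:
//
//     const __declspec(align(16)) float _ps_am_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };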

_PS_EXTERN_CONST(am_0, 0.0f);
_PS_EXTERN_CONST(am_1, 1.0f);
_PS_EXTERN_CONST(am_m1, -1.0f);
_PS_EXTERN_CONST(am_0p5, 0.5f);
_PS_EXTERN_CONST(am_1p5, 1.5f);
_PS_EXTERN_CONST(am_pi, (float)M_PI);
_PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
_PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
_PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
_PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, 0x80000000);
_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
_PS_EXTERN_CONST_TYPE(am_min_norm_pos, int32, 0x00800000);
_PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
_PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);

_EPI32_CONST(1, 1);
_EPI32_CONST(2, 2);

_PS_CONST(sincos_p0, 0.15707963267948963959e1f);
_PS_CONST(sincos_p1, -0.64596409750621907082e0f);
_PS_CONST(sincos_p2, 0.7969262624561800806e-1f);
_PS_CONST(sincos_p3, -0.468175413106023168e-2f);

#ifdef PFN_VECTORMA
void __cdecl _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest );
#endif

//-----------------------------------------------------------------------------
// SSE implementations of optimized routines:
//-----------------------------------------------------------------------------
float FASTCALL _SSE_VectorNormalize( Vector& vec )
{
    Assert( s_bMathlibInitialized );

    // NOTE: This is necessary to prevent a memory overwrite...
    // since vec only has 3 floats, we can't "movaps" directly into it.
#ifdef _WIN32
    __declspec(align(16)) float result[4];
#elif POSIX
    float result[4] __attribute__((aligned(16)));
#endif

    float *v = &vec[0];
    float *r = &result[0];
    float radius = 0.f;

    // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't
    // be much of a performance win, considering you will very likely miss 3 branch predicts in a row.
    if ( v[0] || v[1] || v[2] )
    {
#if defined( _WIN32 ) && !defined( _WIN64 )
        _asm
        {
            mov eax, v
            mov edx, r
#ifdef ALIGNED_VECTOR
            movaps xmm4, [eax]      // r4 = vx, vy, vz, X
            movaps xmm1, xmm4       // r1 = r4
#else
            movups xmm4, [eax]      // r4 = vx, vy, vz, X
            movaps xmm1, xmm4       // r1 = r4
#endif
            mulps xmm1, xmm4        // r1 = vx * vx, vy * vy, vz * vz, X
            movhlps xmm3, xmm1      // r3 = vz * vz, X, X, X
            movaps xmm2, xmm1       // r2 = r1
            shufps xmm2, xmm2, 1    // r2 = vy * vy, X, X, X
            addss xmm1, xmm2        // r1 = (vx * vx) + (vy * vy), X, X, X
            addss xmm1, xmm3        // r1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
            sqrtss xmm1, xmm1       // r1 = sqrt((vx * vx) + (vy * vy) + (vz * vz)), X, X, X
            movss radius, xmm1      // radius = sqrt((vx * vx) + (vy * vy) + (vz * vz))
            rcpss xmm1, xmm1        // r1 = 1/radius, X, X, X
            shufps xmm1, xmm1, 0    // r1 = 1/radius, 1/radius, 1/radius, X
            mulps xmm4, xmm1        // r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
            movaps [edx], xmm4      // v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
        }
#elif _WIN64
        // Inline assembly isn't allowed in 64-bit MSVC. Sadness.
        // NOTE: this path never updates 'radius', so the function returns 0.0f on Win64.
        float recipSqrt = FastRSqrt( vec.x * vec.x + vec.y * vec.y + vec.z * vec.z );
        r[ 0 ] = vec.x * recipSqrt;
        r[ 1 ] = vec.y * recipSqrt;
        r[ 2 ] = vec.z * recipSqrt;
#elif POSIX
        __asm__ __volatile__(
#ifdef ALIGNED_VECTOR
            "movaps %2, %%xmm4 \n\t"
            "movaps %%xmm4, %%xmm1 \n\t"
#else
            "movups %2, %%xmm4 \n\t"
            "movaps %%xmm4, %%xmm1 \n\t"
#endif
            "mulps %%xmm4, %%xmm1 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movaps %%xmm1, %%xmm2 \n\t"
            "shufps $1, %%xmm2, %%xmm2 \n\t"
            "addss %%xmm2, %%xmm1 \n\t"
            "addss %%xmm3, %%xmm1 \n\t"
            "sqrtss %%xmm1, %%xmm1 \n\t"
            "movss %%xmm1, %0 \n\t"
            "rcpss %%xmm1, %%xmm1 \n\t"
            "shufps $0, %%xmm1, %%xmm1 \n\t"
            "mulps %%xmm1, %%xmm4 \n\t"
            "movaps %%xmm4, %1 \n\t"
            : "=m" (radius), "=m" (result)
            : "m" (*v)
        );
#else
#error "Not Implemented"
#endif

        vec.x = result[0];
        vec.y = result[1];
        vec.z = result[2];
    }

    return radius;
}
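
// FastRSqrt is defined elsewhere in mathlib; purely as an illustrative sketch
// (an assumption, not this file's implementation), an SSE reciprocal square
// root typically pairs rsqrtss (~12-bit estimate) with one Newton-Raphson
// step to recover close to full float precision:
#if 0
#include <xmmintrin.h>
static inline float FastRSqrt_Sketch( float x )
{
    // estimate 1/sqrt(x), then refine: r' = r * (1.5 - 0.5 * x * r * r)
    float r = _mm_cvtss_f32( _mm_rsqrt_ss( _mm_set_ss( x ) ) );
    return r * ( 1.5f - 0.5f * x * r * r );
}
#endif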

#if defined( _WIN32 ) && !defined( _WIN64 )
void FastSinCos( float x, float* s, float* c )  // any x
{
    float t4, t8, t12;

    __asm
    {
        movss xmm0, x
        movss t12, xmm0
        movss xmm1, _ps_am_inv_sign_mask
        mov eax, t12
        mulss xmm0, _ps_am_2_o_pi
        andps xmm0, xmm1
        and eax, 0x80000000
        cvttss2si edx, xmm0
        mov ecx, edx
        mov t12, esi
        mov esi, edx
        add edx, 0x1
        shl ecx, (31 - 1)
        shl edx, (31 - 1)
        movss xmm4, _ps_am_1
        cvtsi2ss xmm3, esi
        mov t8, eax
        and esi, 0x1
        subss xmm0, xmm3
        movss xmm3, _sincos_inv_masks[esi * 4]
        minss xmm0, xmm4
        subss xmm4, xmm0
        movss xmm6, xmm4
        andps xmm4, xmm3
        and ecx, 0x80000000
        movss xmm2, xmm3
        andnps xmm3, xmm0
        and edx, 0x80000000
        movss xmm7, t8
        andps xmm0, xmm2
        mov t8, ecx
        mov t4, edx
        orps xmm4, xmm3
        mov eax, s      //mov eax, [esp + 4 + 16]
        mov edx, c      //mov edx, [esp + 4 + 16 + 4]
        andnps xmm2, xmm6
        orps xmm0, xmm2
        movss xmm2, t8
        movss xmm1, xmm0
        movss xmm5, xmm4
        xorps xmm7, xmm2
        movss xmm3, _ps_sincos_p3
        mulss xmm0, xmm0
        mulss xmm4, xmm4
        movss xmm2, xmm0
        movss xmm6, xmm4
        orps xmm1, xmm7
        movss xmm7, _ps_sincos_p2
        mulss xmm0, xmm3
        mulss xmm4, xmm3
        movss xmm3, _ps_sincos_p1
        addss xmm0, xmm7
        addss xmm4, xmm7
        movss xmm7, _ps_sincos_p0
        mulss xmm0, xmm2
        mulss xmm4, xmm6
        addss xmm0, xmm3
        addss xmm4, xmm3
        movss xmm3, t4
        mulss xmm0, xmm2
        mulss xmm4, xmm6
        orps xmm5, xmm3
        mov esi, t12
        addss xmm0, xmm7
        addss xmm4, xmm7
        mulss xmm0, xmm1
        mulss xmm4, xmm5

        // use full stores since caller might reload with full loads
        movss [eax], xmm0
        movss [edx], xmm4
    }
}

#if 0
//-----------------------------------------------------------------------------
// SSE2 implementations of optimized routines:
//-----------------------------------------------------------------------------
void FastSinCos( float x, float* s, float* c )  // any x
{
    __asm
    {
        movss xmm0, x
        movaps xmm7, xmm0
        movss xmm1, _ps_am_inv_sign_mask
        movss xmm2, _ps_am_sign_mask
        movss xmm3, _ps_am_2_o_pi
        andps xmm0, xmm1
        andps xmm7, xmm2
        mulss xmm0, xmm3
        pxor xmm3, xmm3
        movd xmm5, _epi32_1
        movss xmm4, _ps_am_1
        cvttps2dq xmm2, xmm0
        pand xmm5, xmm2
        movd xmm1, _epi32_2
        pcmpeqd xmm5, xmm3
        movd xmm3, _epi32_1
        cvtdq2ps xmm6, xmm2
        paddd xmm3, xmm2
        pand xmm2, xmm1
        pand xmm3, xmm1
        subss xmm0, xmm6
        pslld xmm2, (31 - 1)
        minss xmm0, xmm4
        mov eax, s      // mov eax, [esp + 4 + 16]
        mov edx, c      // mov edx, [esp + 4 + 16 + 4]
        subss xmm4, xmm0
        pslld xmm3, (31 - 1)
        movaps xmm6, xmm4
        xorps xmm2, xmm7
        movaps xmm7, xmm5
        andps xmm6, xmm7
        andnps xmm7, xmm0
        andps xmm0, xmm5
        andnps xmm5, xmm4
        movss xmm4, _ps_sincos_p3
        orps xmm6, xmm7
        orps xmm0, xmm5
        movss xmm5, _ps_sincos_p2
        movaps xmm1, xmm0
        movaps xmm7, xmm6
        mulss xmm0, xmm0
        mulss xmm6, xmm6
        orps xmm1, xmm2
        orps xmm7, xmm3
        movaps xmm2, xmm0
        movaps xmm3, xmm6
        mulss xmm0, xmm4
        mulss xmm6, xmm4
        movss xmm4, _ps_sincos_p1
        addss xmm0, xmm5
        addss xmm6, xmm5
        movss xmm5, _ps_sincos_p0
        mulss xmm0, xmm2
        mulss xmm6, xmm3
        addss xmm0, xmm4
        addss xmm6, xmm4
        mulss xmm0, xmm2
        mulss xmm6, xmm3
        addss xmm0, xmm5
        addss xmm6, xmm5
        mulss xmm0, xmm1
        mulss xmm6, xmm7

        // use full stores since caller might reload with full loads
        movss [eax], xmm0
        movss [edx], xmm6
    }
}
#endif
#elif defined( _OSX ) || defined( LINUX ) || defined( _WIN64 )
// [will] - Note: could use optimization.
void FastSinCos( float x, float* s, float* c )  // any x
{
    if ( c != NULL )
    {
        *c = FastCos( x );
    }

    if ( s != NULL )
    {
        *s = sin( x );
    }
}
#endif
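
// Usage sketch (an assumption, not part of this file): note that the Win32
// asm path above always writes through both pointers, so callers should pass
// valid addresses for s and c on every platform.
#if 0
float s, c;
FastSinCos( 1.0f, &s, &c );     // s ~= sinf(1.0f), c ~= cosf(1.0f)
#endif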

#ifdef POSIX
//#define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

#define _PI32_CONST(Name, Val) static const ALIGN16 int _pi32_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516);   // 4 / M_PI

typedef union xmm_mm_union {
    __m128 xmm;
    __m64 mm[2];
} xmm_mm_union;

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }

typedef __m128 v4sf;    // vector of 4 float (sse1)
typedef __m64 v2si;     // vector of 2 int (mmx)
#endif

float FastCos( float x )
{
#if defined( _WIN32 ) && !defined( _WIN64 )
    float temp;

    __asm
    {
        movss xmm0, x
        movss xmm1, _ps_am_inv_sign_mask
        andps xmm0, xmm1
        addss xmm0, _ps_am_pi_o_2
        mulss xmm0, _ps_am_2_o_pi
        cvttss2si ecx, xmm0
        movss xmm5, _ps_am_1
        mov edx, ecx
        shl edx, (31 - 1)
        cvtsi2ss xmm1, ecx
        and edx, 0x80000000
        and ecx, 0x1
        subss xmm0, xmm1
        movss xmm6, _sincos_masks[ecx * 4]
        minss xmm0, xmm5
        movss xmm1, _ps_sincos_p3
        subss xmm5, xmm0
        andps xmm5, xmm6
        movss xmm7, _ps_sincos_p2
        andnps xmm6, xmm0
        mov temp, edx
        orps xmm5, xmm6
        movss xmm0, xmm5
        mulss xmm5, xmm5
        movss xmm4, _ps_sincos_p1
        movss xmm2, xmm5
        mulss xmm5, xmm1
        movss xmm1, _ps_sincos_p0
        addss xmm5, xmm7
        mulss xmm5, xmm2
        movss xmm3, temp
        addss xmm5, xmm4
        mulss xmm5, xmm2
        orps xmm0, xmm3
        addss xmm5, xmm1
        mulss xmm0, xmm5
        movss x, xmm0
    }
#elif defined( _WIN64 )
    return cosf( x );
#elif POSIX
    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
    v2si mm0, mm1, mm2, mm3;

    /* take the absolute value */
    v4sf xx = _mm_load_ss( &x );
    xx = _mm_and_ps( xx, *(v4sf*)_ps_inv_sign_mask );

    /* scale by 4/Pi */
    y = _mm_mul_ps( xx, *(v4sf*)_ps_cephes_FOPI );

    /* store the integer part of y in mm0:mm1 */
    xmm2 = _mm_movehl_ps( xmm2, y );
    mm2 = _mm_cvttps_pi32( y );
    mm3 = _mm_cvttps_pi32( xmm2 );

    /* j = (j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32( mm2, *(v2si*)_pi32_1 );
    mm3 = _mm_add_pi32( mm3, *(v2si*)_pi32_1 );
    mm2 = _mm_and_si64( mm2, *(v2si*)_pi32_inv1 );
    mm3 = _mm_and_si64( mm3, *(v2si*)_pi32_inv1 );

    y = _mm_cvtpi32x2_ps( mm2, mm3 );

    mm2 = _mm_sub_pi32( mm2, *(v2si*)_pi32_2 );
    mm3 = _mm_sub_pi32( mm3, *(v2si*)_pi32_2 );

    /* get the swap sign flag in mm0:mm1 and the
       polynomial selection mask in mm2:mm3 */
    mm0 = _mm_andnot_si64( mm2, *(v2si*)_pi32_4 );
    mm1 = _mm_andnot_si64( mm3, *(v2si*)_pi32_4 );
    mm0 = _mm_slli_pi32( mm0, 29 );
    mm1 = _mm_slli_pi32( mm1, 29 );

    mm2 = _mm_and_si64( mm2, *(v2si*)_pi32_2 );
    mm3 = _mm_and_si64( mm3, *(v2si*)_pi32_2 );
    mm2 = _mm_cmpeq_pi32( mm2, _mm_setzero_si64() );
    mm3 = _mm_cmpeq_pi32( mm3, _mm_setzero_si64() );

    v4sf sign_bit, poly_mask;
    COPY_MM_TO_XMM( mm0, mm1, sign_bit );
    COPY_MM_TO_XMM( mm2, mm3, poly_mask );
    _mm_empty();    /* good-bye mmx */

    /* The magic pass: "Extended precision modular arithmetic"
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps( y, xmm1 );
    xmm2 = _mm_mul_ps( y, xmm2 );
    xmm3 = _mm_mul_ps( y, xmm3 );
    xx = _mm_add_ps( xx, xmm1 );
    xx = _mm_add_ps( xx, xmm2 );
    xx = _mm_add_ps( xx, xmm3 );

    /* Evaluate the first polynomial (0 <= x <= Pi/4) */
    y = *(v4sf*)_ps_coscof_p0;
    v4sf z = _mm_mul_ps( xx, xx );

    y = _mm_mul_ps( y, z );
    y = _mm_add_ps( y, *(v4sf*)_ps_coscof_p1 );
    y = _mm_mul_ps( y, z );
    y = _mm_add_ps( y, *(v4sf*)_ps_coscof_p2 );
    y = _mm_mul_ps( y, z );
    y = _mm_mul_ps( y, z );
    v4sf tmp = _mm_mul_ps( z, *(v4sf*)_ps_0p5 );
    y = _mm_sub_ps( y, tmp );
    y = _mm_add_ps( y, *(v4sf*)_ps_1 );

    /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps( y2, z );
    y2 = _mm_add_ps( y2, *(v4sf*)_ps_sincof_p1 );
    y2 = _mm_mul_ps( y2, z );
    y2 = _mm_add_ps( y2, *(v4sf*)_ps_sincof_p2 );
    y2 = _mm_mul_ps( y2, z );
    y2 = _mm_mul_ps( y2, xx );
    y2 = _mm_add_ps( y2, xx );

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    y2 = _mm_and_ps( xmm3, y2 );    //, xmm3);
    y = _mm_andnot_ps( xmm3, y );
    y = _mm_add_ps( y, y2 );

    /* update the sign */
    _mm_store_ss( &x, _mm_xor_ps( y, sign_bit ) );
#else
#error "Not Implemented"
#endif

    return x;
}
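
// Quick accuracy check, as a sketch (an assumption, not part of this file):
// FastCos approximates cosf after range reduction, so a comparison loop like
// this is a reasonable sanity test.
#if 0
#include <cstdio>
static void TestFastCos()
{
    for ( float x = -6.0f; x <= 6.0f; x += 0.25f )
    {
        printf( "x=%f FastCos=%f cosf=%f\n", x, FastCos( x ), cosf( x ) );
    }
}
#endif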

// SSE Version of VectorTransform
void VectorTransformSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
{
    Assert( s_bMathlibInitialized );
    Assert( in1 != out1 );

#if defined( _WIN32 ) && !defined( _WIN64 )
    __asm
    {
        mov eax, in1;
        mov ecx, in2;
        mov edx, out1;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        addss xmm0, [ecx+12]
        movss [edx], xmm0;
        add ecx, 16;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        addss xmm0, [ecx+12]
        movss [edx+4], xmm0;
        add ecx, 16;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        addss xmm0, [ecx+12]
        movss [edx+8], xmm0;
    }
#else
    out1[0] = DotProduct( in1, in2[0] ) + in2[0][3];
    out1[1] = DotProduct( in1, in2[1] ) + in2[1][3];
    out1[2] = DotProduct( in1, in2[2] ) + in2[2][3];
#endif
}
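
// Worked example (sketch): for a matrix row r = { 0, 0, 0, 5 } the contribution
// to the output is DotProduct( in1, r ) + r[3] = 0 + 5, i.e. VectorTransformSSE
// applies the translation stored in column 3 of each row, while VectorRotateSSE
// below drops it and applies only the 3x3 rotation part.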

void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
{
    Assert( s_bMathlibInitialized );
    Assert( in1 != out1 );

#if defined( _WIN32 ) && !defined( _WIN64 )
    __asm
    {
        mov eax, in1;
        mov ecx, in2;
        mov edx, out1;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        movss [edx], xmm0;
        add ecx, 16;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        movss [edx+4], xmm0;
        add ecx, 16;

        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        movss [edx+8], xmm0;
    }
#else
    out1[0] = DotProduct( in1, in2[0] );
    out1[1] = DotProduct( in1, in2[1] );
    out1[2] = DotProduct( in1, in2[2] );
#endif
}

#if defined( _WIN32 ) && !defined( _WIN64 )
void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
{
    // FIXME: This doesn't work!! It will overwrite memory in the write to dest
    Assert( 0 );

    Assert( s_bMathlibInitialized );
    _asm    // Intel SSE only routine
    {
        mov eax, DWORD PTR [esp+0x04]   ; *start, s0..s2
        mov ecx, DWORD PTR [esp+0x0c]   ; *direction, d0..d2
        mov edx, DWORD PTR [esp+0x10]   ; *dest
        movss xmm2, [esp+0x08]          ; x2 = scale, 0, 0, 0
#ifdef ALIGNED_VECTOR
        movaps xmm3, [ecx]              ; x3 = dir0, dir1, dir2, X
        pshufd xmm2, xmm2, 0            ; x2 = scale, scale, scale, scale
        movaps xmm1, [eax]              ; x1 = start1, start2, start3, X
        mulps xmm3, xmm2                ; x3 *= x2
        addps xmm3, xmm1                ; x3 += x1
        movaps [edx], xmm3              ; *dest = x3
#else
        movups xmm3, [ecx]              ; x3 = dir0, dir1, dir2, X
        pshufd xmm2, xmm2, 0            ; x2 = scale, scale, scale, scale
        movups xmm1, [eax]              ; x1 = start1, start2, start3, X
        mulps xmm3, xmm2                ; x3 *= x2
        addps xmm3, xmm1                ; x3 += x1
        movups [edx], xmm3              ; *dest = x3
#endif
    }
}
#endif

#ifdef _WIN32
#ifdef PFN_VECTORMA
void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
{
    // FIXME: This doesn't work!! It will overwrite memory in the write to dest
    Assert( 0 );

    Assert( s_bMathlibInitialized );
    _asm
    {
        // Intel SSE only routine
        mov eax, DWORD PTR [esp+0x04]   ; *start, s0..s2
        mov ecx, DWORD PTR [esp+0x0c]   ; *direction, d0..d2
        mov edx, DWORD PTR [esp+0x10]   ; *dest
        movss xmm2, [esp+0x08]          ; x2 = scale, 0, 0, 0
#ifdef ALIGNED_VECTOR
        movaps xmm3, [ecx]              ; x3 = dir0, dir1, dir2, X
        pshufd xmm2, xmm2, 0            ; x2 = scale, scale, scale, scale
        movaps xmm1, [eax]              ; x1 = start1, start2, start3, X
        mulps xmm3, xmm2                ; x3 *= x2
        addps xmm3, xmm1                ; x3 += x1
        movaps [edx], xmm3              ; *dest = x3
#else
        movups xmm3, [ecx]              ; x3 = dir0, dir1, dir2, X
        pshufd xmm2, xmm2, 0            ; x2 = scale, scale, scale, scale
        movups xmm1, [eax]              ; x1 = start1, start2, start3, X
        mulps xmm3, xmm2                ; x3 *= x2
        addps xmm3, xmm1                ; x3 += x1
        movups [edx], xmm3              ; *dest = x3
#endif
    }
}

float (__cdecl *pfVectorMA)(Vector& v) = _VectorMA;
#endif
#endif

// SSE DotProduct -- it's a smidgen faster than the asm DotProduct...
//   Should be validated too!  :)
//   NJS: (Nov 1 2002) -NOT- faster.  It may time a couple of cycles faster in a single
//   function like this, but when inlined and instruction scheduled, the C version is faster.
//   Verified this via VTune.
/*
vec_t DotProduct (const vec_t *a, const vec_t *c)
{
    vec_t temp;

    __asm
    {
        mov eax, a;
        mov ecx, c;
        mov edx, DWORD PTR [temp]
        movss xmm0, [eax];
        mulss xmm0, [ecx];
        movss xmm1, [eax+4];
        mulss xmm1, [ecx+4];
        movss xmm2, [eax+8];
        mulss xmm2, [ecx+8];
        addss xmm0, xmm1;
        addss xmm0, xmm2;
        movss [edx], xmm0;
        fld DWORD PTR [edx];
        ret
    }
}
*/
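
// For reference, a minimal C equivalent of the dot product discussed above --
// a sketch only; the real DotProduct lives elsewhere in mathlib:
#if 0
inline vec_t DotProduct_Sketch( const vec_t *a, const vec_t *c )
{
    return a[0] * c[0] + a[1] * c[1] + a[2] * c[2];
}
#endif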