Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2120 lines
70 KiB

  1. //===== Copyright � 1996-2005, Valve Corporation, All rights reserved. ======//
  2. //
  3. // Purpose:
  4. //
  5. // $NoKeywords: $
  6. //
  7. //===========================================================================//
  8. #include "mathlib/mathlib.h"
  9. #if defined(__SPU__)
  10. #include "ps3/spu_job_shared.h"
  11. #endif
  12. #include "bone_setup_PS3.h"
  13. #include <string.h>
  14. #if !defined(__SPU__)
  15. #include "tier0/vprof.h"
  16. #endif
  17. #include "mathlib/ssequaternion.h"
  18. #include "bone_utils_PS3.h"
  19. // -----------------------------------------------------------------
  20. // -----------------------------------------------------------------
  21. // from mathlib_base.cpp
  22. // -----------------------------------------------------------------
  23. #if 0
//-----------------------------------------------------------------------------
// Concatenate two 16-byte-aligned 3x4 transforms: out = m0 * m1.
// Each result row r is assembled as m0[r].x*B0 + m0[r].y*B1 + m0[r].z*B2
// (Bi = rows of m1), then m0's translation lane is masked back in.
// All loads complete before any store, so out may alias m0 or m1.
//-----------------------------------------------------------------------------
void ConcatTransforms_Aligned_PS3( const matrix3x4a_t &m0, const matrix3x4a_t &m1, matrix3x4a_t &out )
{
//	AssertAligned( &m0 );
//	AssertAligned( &m1 );
//	AssertAligned( &out );

	// component mask that keeps only the w (translation) lane
	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);

	fltx4 rowA0 = LoadAlignedSIMD( m0.m_flMatVal[0] );
	fltx4 rowA1 = LoadAlignedSIMD( m0.m_flMatVal[1] );
	fltx4 rowA2 = LoadAlignedSIMD( m0.m_flMatVal[2] );
	fltx4 rowB0 = LoadAlignedSIMD( m1.m_flMatVal[0] );
	fltx4 rowB1 = LoadAlignedSIMD( m1.m_flMatVal[1] );
	fltx4 rowB2 = LoadAlignedSIMD( m1.m_flMatVal[2] );

	// now we have the rows of m0 and the columns of m1
	// first output row
	fltx4 A0 = SplatXSIMD(rowA0);
	fltx4 A1 = SplatYSIMD(rowA0);
	fltx4 A2 = SplatZSIMD(rowA0);
	fltx4 mul00 = MulSIMD( A0, rowB0 );
	fltx4 mul01 = MulSIMD( A1, rowB1 );
	fltx4 mul02 = MulSIMD( A2, rowB2 );
	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );

	// second output row
	A0 = SplatXSIMD(rowA1);
	A1 = SplatYSIMD(rowA1);
	A2 = SplatZSIMD(rowA1);
	fltx4 mul10 = MulSIMD( A0, rowB0 );
	fltx4 mul11 = MulSIMD( A1, rowB1 );
	fltx4 mul12 = MulSIMD( A2, rowB2 );
	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );

	// third output row
	A0 = SplatXSIMD(rowA2);
	A1 = SplatYSIMD(rowA2);
	A2 = SplatZSIMD(rowA2);
	fltx4 mul20 = MulSIMD( A0, rowB0 );
	fltx4 mul21 = MulSIMD( A1, rowB1 );
	fltx4 mul22 = MulSIMD( A2, rowB2 );
	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );

	// add in m0's translation vector (the w lanes of the B rows already
	// contributed rotA * transB above)
	A0 = AndSIMD(rowA0,lastMask);
	A1 = AndSIMD(rowA1,lastMask);
	A2 = AndSIMD(rowA2,lastMask);
	out0 = AddSIMD(out0, A0);
	out1 = AddSIMD(out1, A1);
	out2 = AddSIMD(out2, A2);

	StoreAlignedSIMD( out.m_flMatVal[0], out0 );
	StoreAlignedSIMD( out.m_flMatVal[1], out1 );
	StoreAlignedSIMD( out.m_flMatVal[2], out2 );
}
//-----------------------------------------------------------------------------
// Concatenate two 3x4 transforms (no alignment requirement): out = in1 * in2.
// Same splat-multiply-accumulate scheme as the aligned variant, using
// unaligned loads/stores. Loads complete before stores, so out may alias.
//-----------------------------------------------------------------------------
void ConcatTransforms_PS3( const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out )
{
#if 0
	// test for ones that'll be 2x faster
	if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 )
	{
		ConcatTransforms_Aligned( in1, in2, out );
		return;
	}
#endif

	// component mask that keeps only the w (translation) lane
	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);

	fltx4 rowA0 = LoadUnalignedSIMD( in1.m_flMatVal[0] );
	fltx4 rowA1 = LoadUnalignedSIMD( in1.m_flMatVal[1] );
	fltx4 rowA2 = LoadUnalignedSIMD( in1.m_flMatVal[2] );
	fltx4 rowB0 = LoadUnalignedSIMD( in2.m_flMatVal[0] );
	fltx4 rowB1 = LoadUnalignedSIMD( in2.m_flMatVal[1] );
	fltx4 rowB2 = LoadUnalignedSIMD( in2.m_flMatVal[2] );

	// now we have the rows of m0 and the columns of m1
	// first output row
	fltx4 A0 = SplatXSIMD(rowA0);
	fltx4 A1 = SplatYSIMD(rowA0);
	fltx4 A2 = SplatZSIMD(rowA0);
	fltx4 mul00 = MulSIMD( A0, rowB0 );
	fltx4 mul01 = MulSIMD( A1, rowB1 );
	fltx4 mul02 = MulSIMD( A2, rowB2 );
	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );

	// second output row
	A0 = SplatXSIMD(rowA1);
	A1 = SplatYSIMD(rowA1);
	A2 = SplatZSIMD(rowA1);
	fltx4 mul10 = MulSIMD( A0, rowB0 );
	fltx4 mul11 = MulSIMD( A1, rowB1 );
	fltx4 mul12 = MulSIMD( A2, rowB2 );
	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );

	// third output row
	A0 = SplatXSIMD(rowA2);
	A1 = SplatYSIMD(rowA2);
	A2 = SplatZSIMD(rowA2);
	fltx4 mul20 = MulSIMD( A0, rowB0 );
	fltx4 mul21 = MulSIMD( A1, rowB1 );
	fltx4 mul22 = MulSIMD( A2, rowB2 );
	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );

	// add in in1's translation vector
	A0 = AndSIMD(rowA0,lastMask);
	A1 = AndSIMD(rowA1,lastMask);
	A2 = AndSIMD(rowA2,lastMask);
	out0 = AddSIMD(out0, A0);
	out1 = AddSIMD(out1, A1);
	out2 = AddSIMD(out2, A2);

	// write to output
	StoreUnalignedSIMD( out.m_flMatVal[0], out0 );
	StoreUnalignedSIMD( out.m_flMatVal[1], out1 );
	StoreUnalignedSIMD( out.m_flMatVal[2], out2 );
}
  126. void MatrixAngles_PS3( const matrix3x4_t& matrix, float *angles )
  127. {
  128. float forward[3];
  129. float left[3];
  130. float up[3];
  131. //
  132. // Extract the basis vectors from the matrix. Since we only need the Z
  133. // component of the up vector, we don't get X and Y.
  134. //
  135. forward[0] = matrix[0][0];
  136. forward[1] = matrix[1][0];
  137. forward[2] = matrix[2][0];
  138. left[0] = matrix[0][1];
  139. left[1] = matrix[1][1];
  140. left[2] = matrix[2][1];
  141. up[2] = matrix[2][2];
  142. float xyDist = sqrtf( forward[0] * forward[0] + forward[1] * forward[1] );
  143. // enough here to get angles?
  144. if ( xyDist > 0.001f )
  145. {
  146. // (yaw) y = ATAN( forward.y, forward.x ); -- in our space, forward is the X axis
  147. angles[1] = RAD2DEG( atan2f( forward[1], forward[0] ) );
  148. // (pitch) x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
  149. angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
  150. // (roll) z = ATAN( left.z, up.z );
  151. angles[2] = RAD2DEG( atan2f( left[2], up[2] ) );
  152. }
  153. else // forward is mostly Z, gimbal lock-
  154. {
  155. // (yaw) y = ATAN( -left.x, left.y ); -- forward is mostly z, so use right for yaw
  156. angles[1] = RAD2DEG( atan2f( -left[0], left[1] ) );
  157. // (pitch) x = ATAN( -forward.z, sqrt(forward.x*forward.x+forward.y*forward.y) );
  158. angles[0] = RAD2DEG( atan2f( -forward[2], xyDist ) );
  159. // Assume no roll in this case as one degree of freedom has been lost (i.e. yaw == roll)
  160. angles[2] = 0.0f;
  161. }
  162. }
  163. void MatrixAngles_PS3( const matrix3x4_t& matrix, RadianEuler &angles, Vector &position )
  164. {
  165. MatrixGetColumn_PS3( matrix, 3, position );
  166. MatrixAngles_PS3( matrix, angles );
  167. }
//-----------------------------------------------------------------------------
// Extract rotation (as a quaternion) and translation from a 3x4 matrix.
// Classic largest-diagonal-element method: pick the branch whose divisor
// (kept unnormalized in 'trace') is numerically largest, assemble the
// proportional quaternion, then normalize.
//-----------------------------------------------------------------------------
void MatrixAngles_PS3( const matrix3x4_t &matrix, Quaternion &q, Vector &pos )
{
	float trace;
	trace = matrix[0][0] + matrix[1][1] + matrix[2][2] + 1.0f;
	if( trace > 1.0f + FLT_EPSILON )
	{
		// w is the dominant component
//		VPROF_INCREMENT_COUNTER("MatrixQuaternion A",1);
		q.x = ( matrix[2][1] - matrix[1][2] );
		q.y = ( matrix[0][2] - matrix[2][0] );
		q.z = ( matrix[1][0] - matrix[0][1] );
		q.w = trace;
	}
	else if ( matrix[0][0] > matrix[1][1] && matrix[0][0] > matrix[2][2] )
	{
		// x is the dominant component
//		VPROF_INCREMENT_COUNTER("MatrixQuaternion B",1);
		trace = 1.0f + matrix[0][0] - matrix[1][1] - matrix[2][2];
		q.x = trace;
		q.y = (matrix[1][0] + matrix[0][1] );
		q.z = (matrix[0][2] + matrix[2][0] );
		q.w = (matrix[2][1] - matrix[1][2] );
	}
	else if (matrix[1][1] > matrix[2][2])
	{
		// y is the dominant component
//		VPROF_INCREMENT_COUNTER("MatrixQuaternion C",1);
		trace = 1.0f + matrix[1][1] - matrix[0][0] - matrix[2][2];
		q.x = (matrix[0][1] + matrix[1][0] );
		q.y = trace;
		q.z = (matrix[2][1] + matrix[1][2] );
		q.w = (matrix[0][2] - matrix[2][0] );
	}
	else
	{
		// z is the dominant component
//		VPROF_INCREMENT_COUNTER("MatrixQuaternion D",1);
		trace = 1.0f + matrix[2][2] - matrix[0][0] - matrix[1][1];
		q.x = (matrix[0][2] + matrix[2][0] );
		q.y = (matrix[2][1] + matrix[1][2] );
		q.z = trace;
		q.w = (matrix[1][0] - matrix[0][1] );
	}

	// scale the assembled quaternion to unit length
	QuaternionNormalize_PS3( q );

#if 0
	// check against the angle version
	RadianEuler ang;
	MatrixAngles( matrix, ang );
	Quaternion test;
	AngleQuaternion( ang, test );
	float d = QuaternionDotProduct( q, test );
	Assert( fabs(d) > 0.99 && fabs(d) < 1.01 );
#endif

	// translation is column 3
	MatrixGetColumn_PS3( matrix, 3, pos );
}
// Copy one column of a 3x4 matrix into a Vector
// (columns 0..2 are the basis axes, column 3 is the translation).
void MatrixGetColumn_PS3( const matrix3x4_t& in, int column, Vector &out )
{
	out.x = in[0][column];
	out.y = in[1][column];
	out.z = in[2][column];
}
// Write a Vector into one column of a 3x4 matrix
// (column 3 sets the translation).
void MatrixSetColumn_PS3( const Vector &in, int column, matrix3x4_t& out )
{
	out[0][column] = in.x;
	out[1][column] = in.y;
	out[2][column] = in.z;
}
  231. void MatrixInvert_PS3( const matrix3x4_t& in, matrix3x4_t& out )
  232. {
  233. // if ( &in == &out )
  234. // {
  235. // V_swap(out[0][1],out[1][0]);
  236. // V_swap(out[0][2],out[2][0]);
  237. // V_swap(out[1][2],out[2][1]);
  238. // }
  239. // else
  240. {
  241. // transpose the matrix
  242. out[0][0] = in[0][0];
  243. out[0][1] = in[1][0];
  244. out[0][2] = in[2][0];
  245. out[1][0] = in[0][1];
  246. out[1][1] = in[1][1];
  247. out[1][2] = in[2][1];
  248. out[2][0] = in[0][2];
  249. out[2][1] = in[1][2];
  250. out[2][2] = in[2][2];
  251. }
  252. // now fix up the translation to be in the other space
  253. float tmp[3];
  254. tmp[0] = in[0][3];
  255. tmp[1] = in[1][3];
  256. tmp[2] = in[2][3];
  257. out[0][3] = -DotProduct_PS3( tmp, out[0] );
  258. out[1][3] = -DotProduct_PS3( tmp, out[1] );
  259. out[2][3] = -DotProduct_PS3( tmp, out[2] );
  260. }
  261. void SetIdentityMatrix_PS3( matrix3x4_t& matrix )
  262. {
  263. memset( matrix.Base(), 0, sizeof(float)*3*4 );
  264. matrix[0][0] = 1.0f;
  265. matrix[1][1] = 1.0f;
  266. matrix[2][2] = 1.0f;
  267. }
  268. void VectorRotate_PS3( const float * RESTRICT in1, const matrix3x4_t& in2, float * RESTRICT out )
  269. {
  270. // Assert( in1 != out );
  271. out[0] = DotProduct_PS3( in1, in2[0] );
  272. out[1] = DotProduct_PS3( in1, in2[1] );
  273. out[2] = DotProduct_PS3( in1, in2[2] );
  274. }
  275. void AngleMatrix_PS3( RadianEuler const &angles, const Vector &position, matrix3x4_t& matrix )
  276. {
  277. AngleMatrix_PS3( angles, matrix );
  278. MatrixSetColumn_PS3( position, 3, matrix );
  279. }
  280. void AngleMatrix_PS3( const RadianEuler& angles, matrix3x4_t& matrix )
  281. {
  282. QAngle quakeEuler( RAD2DEG( angles.y ), RAD2DEG( angles.z ), RAD2DEG( angles.x ) );
  283. AngleMatrix_PS3( quakeEuler, matrix );
  284. }
  285. void AngleMatrix_PS3( const QAngle &angles, const Vector &position, matrix3x4_t& matrix )
  286. {
  287. AngleMatrix_PS3( angles, matrix );
  288. MatrixSetColumn_PS3( position, 3, matrix );
  289. }
//-----------------------------------------------------------------------------
// Build a rotation matrix from Quake Euler angles (degrees).
// Composition order: matrix = (YAW * PITCH) * ROLL. Columns are the rotated
// forward / left / up axes; the translation column is zeroed.
//-----------------------------------------------------------------------------
void AngleMatrix_PS3( const QAngle &angles, matrix3x4_t& matrix )
{
	float sr, sp, sy, cr, cp, cy;

#ifdef _X360
	// vectorized sin/cos of all three angles at once
	fltx4 radians, scale, sine, cosine;
	radians = LoadUnaligned3SIMD( angles.Base() );
	scale = ReplicateX4( M_PI_F / 180.f );
	radians = MulSIMD( radians, scale );
	SinCos3SIMD( sine, cosine, radians );
	sp = SubFloat( sine, 0 );	sy = SubFloat( sine, 1 );	sr = SubFloat( sine, 2 );
	cp = SubFloat( cosine, 0 );	cy = SubFloat( cosine, 1 );	cr = SubFloat( cosine, 2 );
#else
	SinCos( DEG2RAD( angles[YAW] ), &sy, &cy );
	SinCos( DEG2RAD( angles[PITCH] ), &sp, &cp );
	SinCos( DEG2RAD( angles[ROLL] ), &sr, &cr );
#endif

	// matrix = (YAW * PITCH) * ROLL
	matrix[0][0] = cp*cy;
	matrix[1][0] = cp*sy;
	matrix[2][0] = -sp;

	// NOTE: Do not optimize this to reduce multiplies! optimizer bug will screw this up.
	matrix[0][1] = sr*sp*cy+cr*-sy;
	matrix[1][1] = sr*sp*sy+cr*cy;
	matrix[2][1] = sr*cp;
	matrix[0][2] = (cr*sp*cy+-sr*-sy);
	matrix[1][2] = (cr*sp*sy+-sr*cy);
	matrix[2][2] = cr*cp;

	// no translation
	matrix[0][3] = 0.0f;
	matrix[1][3] = 0.0f;
	matrix[2][3] = 0.0f;
}
//-----------------------------------------------------------------------------
// Convert radian Euler angles to a quaternion via half-angle sin/cos
// products (x -> roll, y -> pitch, z -> yaw per the SinCos calls below).
//-----------------------------------------------------------------------------
void AngleQuaternion_PS3( const RadianEuler &angles, Quaternion &outQuat )
{
	float sr, sp, sy, cr, cp, cy;

#ifdef _X360
	// vectorized sin/cos of all three half-angles at once
	fltx4 radians, scale, sine, cosine;
	radians = LoadUnaligned3SIMD( &angles.x );
	scale = ReplicateX4( 0.5f );
	radians = MulSIMD( radians, scale );
	SinCos3SIMD( sine, cosine, radians );

	// NOTE: The ordering here is *different* from the AngleQuaternion below
	// because p, y, r are not in the same locations in QAngle + RadianEuler. Yay!
	sr = SubFloat( sine, 0 );	sp = SubFloat( sine, 1 );	sy = SubFloat( sine, 2 );
	cr = SubFloat( cosine, 0 );	cp = SubFloat( cosine, 1 );	cy = SubFloat( cosine, 2 );
#else
	SinCos( angles.z * 0.5f, &sy, &cy );
	SinCos( angles.y * 0.5f, &sp, &cp );
	SinCos( angles.x * 0.5f, &sr, &cr );
#endif

	// NJS: for some reason VC6 wasn't recognizing the common subexpressions:
	float srXcp = sr * cp, crXsp = cr * sp;
	outQuat.x = srXcp*cy-crXsp*sy; // X
	outQuat.y = crXsp*cy+srXcp*sy; // Y

	float crXcp = cr * cp, srXsp = sr * sp;
	outQuat.z = crXcp*sy-srXsp*cy; // Z
	outQuat.w = crXcp*cy+srXsp*sy; // W (real component)
}
  347. void Hermite_Spline_PS3( const Vector &p1, const Vector &p2, const Vector &d1, const Vector &d2, float t, Vector& output )
  348. {
  349. float tSqr = t*t;
  350. float tCube = t*tSqr;
  351. Assert( &output != &p1 );
  352. Assert( &output != &p2 );
  353. Assert( &output != &d1 );
  354. Assert( &output != &d2 );
  355. float b1 = 2.0f*tCube-3.0f*tSqr+1.0f;
  356. float b2 = 1.0f - b1; // -2*tCube+3*tSqr;
  357. float b3 = tCube-2*tSqr+t;
  358. float b4 = tCube-tSqr;
  359. VectorScale_PS3( p1, b1, output );
  360. VectorMA_PS3( output, b2, p2, output );
  361. VectorMA_PS3( output, b3, d1, output );
  362. VectorMA_PS3( output, b4, d2, output );
  363. }
  364. float Hermite_Spline_PS3( float p1, float p2, float d1, float d2, float t )
  365. {
  366. float output;
  367. float tSqr = t*t;
  368. float tCube = t*tSqr;
  369. float b1 = 2.0f*tCube-3.0f*tSqr+1.0f;
  370. float b2 = 1.0f - b1; // -2*tCube+3*tSqr;
  371. float b3 = tCube-2*tSqr+t;
  372. float b4 = tCube-tSqr;
  373. output = p1 * b1;
  374. output += p2 * b2;
  375. output += d1 * b3;
  376. output += d2 * b4;
  377. return output;
  378. }
  379. void Hermite_SplineBasis_PS3( float t, float basis[4] )
  380. {
  381. float tSqr = t*t;
  382. float tCube = t*tSqr;
  383. basis[0] = 2.0f*tCube-3.0f*tSqr+1.0f;
  384. basis[1] = 1.0f - basis[0]; // -2*tCube+3*tSqr;
  385. basis[2] = tCube-2*tSqr+t;
  386. basis[3] = tCube-tSqr;
  387. }
  388. //#pragma optimize( "g", off )
  389. void Hermite_Spline_PS3( const Vector &p0, const Vector &p1, const Vector &p2, float t, Vector& output )
  390. {
  391. Vector e10, e21;
  392. VectorSubtract_PS3( p1, p0, e10 );
  393. VectorSubtract_PS3( p2, p1, e21 );
  394. Hermite_Spline_PS3( p1, p2, e10, e21, t, output );
  395. }
  396. //#pragma optimize( "", on )
  397. float Hermite_Spline_PS3( float p0, float p1, float p2, float t )
  398. {
  399. return Hermite_Spline_PS3( p1, p2, p1 - p0, p2 - p1, t );
  400. }
  401. void Hermite_Spline_PS3( const Quaternion &q0, const Quaternion &q1, const Quaternion &q2, float t, Quaternion &output )
  402. {
  403. // cheap, hacked version of quaternions
  404. Quaternion q0a;
  405. Quaternion q1a;
  406. QuaternionAlign_PS3( q2, q0, q0a );
  407. QuaternionAlign_PS3( q2, q1, q1a );
  408. output.x = Hermite_Spline_PS3( q0a.x, q1a.x, q2.x, t );
  409. output.y = Hermite_Spline_PS3( q0a.y, q1a.y, q2.y, t );
  410. output.z = Hermite_Spline_PS3( q0a.z, q1a.z, q2.z, t );
  411. output.w = Hermite_Spline_PS3( q0a.w, q1a.w, q2.w, t );
  412. QuaternionNormalize_PS3( output );
  413. }
  414. //-----------------------------------------------------------------------------
  415. // Purpose: Converts a quaternion into engine angles
  416. // Input : *quaternion - q3 + q0.i + q1.j + q2.k
  417. // *outAngles - PITCH, YAW, ROLL
  418. //-----------------------------------------------------------------------------
  419. void QuaternionAngles_PS3( const Quaternion &q, RadianEuler &angles )
  420. {
  421. Assert( s_bMathlibInitialized );
  422. Assert( q.IsValid() );
  423. // FIXME: doing it this way calculates too much data, needs to do an optimized version...
  424. matrix3x4_t matrix;
  425. QuaternionMatrix_PS3( q, matrix );
  426. MatrixAngles_PS3( matrix, angles );
  427. Assert( angles.IsValid() );
  428. }
  429. #endif
// some assumptions made about alignment here

// vec_perm selector constants: each 4-byte group of indices picks one 32-bit
// output lane. X0/Y0/Z0/W0 select lanes from the first source vector (bytes
// 0x00-0x0F); X1/Y1/Z1/W1 select from the second (bytes 0x10-0x1F).
#define _VEC_SWIZZLE_Y0X0X0Y0 (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07 }
#define _VEC_SWIZZLE_Y0Y0Z0Z0 (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x04,0x05,0x06,0x07, 0x08,0x09,0x0A,0x0B, 0x08,0x09,0x0A,0x0B }
#define _VEC_SWIZZLE_Z0W0W0W0 (__vector unsigned char) { 0x08,0x09,0x0A,0x0B, 0x0C,0x0D,0x0E,0x0F, 0x0C,0x0D,0x0E,0x0F, 0x0C,0x0D,0x0E,0x0F }
#define _VEC_SWIZZLE_Z0Z0Y0X0 (__vector unsigned char) { 0x08,0x09,0x0A,0x0B, 0x08,0x09,0x0A,0x0B, 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03 }
#define _VEC_SWIZZLE_X0Y1Z1W1 (__vector unsigned char) { 0x00,0x01,0x02,0x03, 0x14,0x15,0x16,0x17, 0x18,0x19,0x1A,0x1B, 0x1C,0x1D,0x1E,0x1F }
#define _VEC_SWIZZLE_X0Z1X0Z1 (__vector unsigned char) { 0x00,0x01,0x02,0x03, 0x18,0x19,0x1A,0x1B, 0x00,0x01,0x02,0x03, 0x18,0x19,0x1A,0x1B }
#define _VEC_SWIZZLE_X0Y0X1Y1 (__vector unsigned char) { 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x10,0x11,0x12,0x13, 0x14,0x15,0x16,0x17 }
#define _VEC_SWIZZLE_X0Y0Z0X1 (__vector unsigned char) { 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x08,0x09,0x0A,0x0B, 0x10,0x11,0x12,0x13 }
#define _VEC_SWIZZLE_Y0X0W0Y1 (__vector unsigned char) { 0x04,0x05,0x06,0x07, 0x00,0x01,0x02,0x03, 0x0C,0x0D,0x0E,0x0F, 0x14,0x15,0x16,0x17 }
#define _VEC_SWIZZLE_X0Y0Z0Z1 (__vector unsigned char) { 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07, 0x08,0x09,0x0A,0x0B, 0x18,0x19,0x1A,0x1B }
#define _VEC_SWIZZLE_Z1W0Z1W0 (__vector unsigned char) { 0x18,0x19,0x1A,0x1B, 0x0C,0x0D,0x0E,0x0F, 0x18,0x19,0x1A,0x1B, 0x0C,0x0D,0x0E,0x0F }
#define _VEC_SWIZZLE_Z0W1Z0W1 (__vector unsigned char) { 0x08,0x09,0x0A,0x0B, 0x1C,0x1D,0x1E,0x1F, 0x08,0x09,0x0A,0x0B, 0x1C,0x1D,0x1E,0x1F }

// float constant (1,0,0,0) used to form the "1 - ..." diagonal terms
#define _VEC_ONEZEROZEROZERO (__vector float) { 1.0f, 0.0f, 0.0f, 0.0f }

// per-lane IEEE sign-bit masks; XORing with one of these negates the lanes
// whose mask is 0x80000000
#define _VEC_ZEROSIGNSIGNZERO (__vector unsigned int) { 0x0, 0x80000000, 0x80000000, 0x0 }
#define _VEC_ZEROSIGNSIGNSIGN (__vector unsigned int) { 0x0, 0x80000000, 0x80000000, 0x80000000 }
#define _VEC_SIGNZEROSIGNSIGN (__vector unsigned int) { 0x80000000, 0x0, 0x80000000, 0x80000000 }
#define _VEC_SIGNSIGNZEROSIGN (__vector unsigned int) { 0x80000000, 0x80000000, 0x0, 0x80000000 }
#define _VEC_SIGNZEROZEROZERO (__vector unsigned int) { 0x80000000, 0x0, 0x0, 0x0 }
#define _VEC_SIGNSIGNZEROZERO (__vector unsigned int) { 0x80000000, 0x80000000, 0x0, 0x0 }
#define _VEC_ZEROZEROZEROSIGN (__vector unsigned int) { 0x0, 0x0, 0x0, 0x80000000 }

#if defined(__SPU__)
// cyclic dependency workaround (redefinition here) - TODO: remove
const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
#endif
//-----------------------------------------------------------------------------
// Build an aligned 3x4 bone matrix from a quaternion rotation and a position.
// The rows are assembled with vec_perm swizzles from products of q and 2*q;
// the lane-by-lane register contents are tracked in the trailing comments.
//-----------------------------------------------------------------------------
void QuaternionMatrix_PS3( const Quaternion &q, const Vector &pos, matrix3x4a_t& matrix )
{
	fltx4 v0, v1, v2, v3, v4, v5, v6, v7;

	v0 = LoadUnalignedSIMD( &q );						// x, y, z, w (q)
	v6 = LoadUnalignedSIMD( &pos );						// px, py, pz, pw

	v1 = AddSIMD( v0, v0 );								// 2x, 2y, 2z, 2w

	v2 = vec_perm( v0, v0, _VEC_SWIZZLE_Y0X0X0Y0 );		// y, x, x, y
	v3 = vec_perm( v1, v1, _VEC_SWIZZLE_Y0Y0Z0Z0 );		// 2y, 2y, 2z, 2z
	v2 = MulSIMD( v2, v3 );								// 2yy, 2xy, 2xz, 2yz

	v4 = vec_perm( v0, v0, _VEC_SWIZZLE_Z0W0W0W0 );		// z, w, w, w
	v5 = vec_perm( v1, v1, _VEC_SWIZZLE_Z0Z0Y0X0 );		// 2z, 2z, 2y, 2x
	v4 = MulSIMD( v4, v5 );								// 2zz, 2zw, 2yw, 2xw

	v0 = MulSIMD( v0, v1 );								// 2xx, 2yy, 2zz, 2ww
	v0 = vec_perm( v0, v1, _VEC_SWIZZLE_X0Y1Z1W1 );		// 2xx, 2y, 2z, 2w

	// last two elements of third row
	v7 = SubSIMD( _VEC_ONEZEROZEROZERO, v0 );			// 1-2xx, --, --, --
	v7 = SubSIMD( v7, v2 );								// 1-2xx-2yy, --, --, --
	v7 = vec_perm( v7, v6, _VEC_SWIZZLE_X0Z1X0Z1 );		// 1-2xx-2yy, pz, --, --

	// first row
	// 1-2yy-2zz, 2xy-2zw, 2xz+2yw, px
	v2 = vec_xor( v2, (fltx4)_VEC_SIGNZEROZEROZERO );	// -2yy, 2xy, 2xz, 2yz
	v4 = vec_xor( v4, (fltx4)_VEC_SIGNSIGNZEROSIGN );	// -2zz, -2zw, 2yw, -2xw
	v4 = AddSIMD( v4, _VEC_ONEZEROZEROZERO );			// 1-2zz, -2zw, 2yw, -2xw
	v3 = AddSIMD( v4, v2 );								// 1-2zz-2yy, 2xy-2zw, 2xz+2yw, 2yz-2xw
	StoreAlignedSIMD( matrix[0], vec_perm( v3, v6, _VEC_SWIZZLE_X0Y0Z0X1 ) );	// 1-2zz-2yy, 2xy-2zw, 2xz+2yw, px

	// second row
	// 2xy+2wz, 1-2xx-2zz, 2yz-2xw, py
	v2 = vec_perm( v0, v2, _VEC_SWIZZLE_X0Y1Z1W1 );		// 2xx, 2xy, 2xz, 2yz
	v2 = vec_xor( v2, (fltx4)_VEC_SIGNZEROZEROZERO );	// -2xx, 2xy, 2xz, 2yz
	v4 = vec_xor( v4, (fltx4)_VEC_ZEROSIGNSIGNZERO );	// 1-2zz, 2zw, -2yw, -2xw
	v3 = AddSIMD( v4, v2 );								// 1-2xx-2zz, 2xy+2zw, 2xz-2yw, 2yz-2xw
	StoreAlignedSIMD( matrix[1], vec_perm( v3, v6, _VEC_SWIZZLE_Y0X0W0Y1 ) );	// 2xy+2zw, 1-2xx-2zz, 2yz-2xw, py

	// third row
	// 2xz-2yw, 2yz+2xw, 1-2xx-2yy, pz
	v2 = SubSIMD( v2, v4 );								// 2zz-1-2xx, 2xy-2zw, 2xz+2yw, 2yz+2xw
	v3 = vec_perm( v3, v2, _VEC_SWIZZLE_Z0W1Z0W1 );		// 2xz-2yw, 2yz+2xw, --, --
	v3 = vec_perm( v3, v7, _VEC_SWIZZLE_X0Y0X1Y1 );		// 2xz-2yw, 2yz+2xw, 1-2xx-2yy, pz
	StoreAlignedSIMD( matrix[2], v3 );					// 2xz-2yw, 2yz+2xw, 1-2xx-2yy, pz
}
  494. void QuaternionAlign_PS3( const Quaternion &p, const Quaternion &q, QuaternionAligned &qt )
  495. {
  496. fltx4 p1, q1, qt1;
  497. p1 = LoadUnalignedSIMD( &p );
  498. q1 = LoadUnalignedSIMD( &q );
  499. qt1 = QuaternionAlignSIMD( p1, q1 );
  500. StoreAlignedSIMD( (QuaternionAligned *)&qt, qt1 );
  501. AssertFatal( qt.IsValid() );
  502. }
  503. void QuaternionSlerp_PS3( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
  504. {
  505. fltx4 p1, q1, qt1;
  506. p1 = LoadUnalignedSIMD( &p );
  507. q1 = LoadUnalignedSIMD( &q );
  508. qt1 = QuaternionSlerpSIMD( p1, q1, t );
  509. StoreUnalignedSIMD( qt.Base(), qt1 );
  510. AssertFatal( qt.IsValid() );
  511. }
  512. void QuaternionSlerpNoAlign_PS3( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
  513. {
  514. fltx4 p1, q1, qt1;
  515. p1 = LoadUnalignedSIMD( &p );
  516. q1 = LoadUnalignedSIMD( &q );
  517. qt1 = QuaternionSlerpNoAlignSIMD( p1, q1, t );
  518. StoreUnalignedSIMD( qt.Base(), qt1 );
  519. AssertFatal( qt.IsValid() );
  520. }
//-----------------------------------------------------------------------------
// Normalize q in place; a zero quaternion is left untouched.
// NOTE(review): the returned value is the *squared* length (Dot4 of q with
// itself), whereas the scalar mathlib QuaternionNormalize returns the
// length - confirm no caller relies on the magnitude of this return.
//-----------------------------------------------------------------------------
float QuaternionNormalize_PS3( Quaternion &q )
{
	fltx4 q1, radius, result;
	bi32x4 mask;

	q1 = LoadUnalignedSIMD( &q );
	radius = Dot4SIMD( q1, q1 );				// |q|^2 in every lane
	mask = CmpEqSIMD( radius, Four_Zeros );		// all ones iff radius = 0
	result = ReciprocalSqrtSIMD( radius );		// 1/|q|
	result = MulSIMD( result, q1 );
	result = MaskedAssign( mask, q1, result );	// if radius was 0, just return q
	StoreUnalignedSIMD( q.Base(), result );

	AssertFatal( q.IsValid() );

	return GetComponentSIMD( radius, 0 );
}
  535. void QuaternionBlend_PS3( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
  536. {
  537. fltx4 psimd, qsimd, qtsimd;
  538. psimd = LoadUnalignedSIMD( p.Base() );
  539. qsimd = LoadUnalignedSIMD( q.Base() );
  540. qtsimd = QuaternionBlendSIMD( psimd, qsimd, t );
  541. StoreUnalignedSIMD( qt.Base(), qtsimd );
  542. AssertFatal( qt.IsValid() );
  543. }
  544. void QuaternionBlendNoAlign_PS3( const Quaternion &p, const Quaternion &q, float t, Quaternion &qt )
  545. {
  546. fltx4 p1, q1, qt1;
  547. p1 = LoadUnalignedSIMD( &p );
  548. q1 = LoadUnalignedSIMD( &q );
  549. qt1 = QuaternionBlendNoAlignSIMD( p1, q1, t );
  550. StoreUnalignedSIMD( qt.Base(), qt1 );
  551. AssertFatal( qt.IsValid() );
  552. }
  553. void QuaternionIdentityBlend_PS3( const Quaternion &p, float t, Quaternion &qt )
  554. {
  555. fltx4 p1, t1, tw, sclp, qt1;
  556. p1 = LoadUnalignedSIMD( &p );
  557. t1 = ReplicateX4( t );
  558. sclp = SubSIMD( _VEC_ONEF, t1 );
  559. qt1 = MulSIMD( p1, sclp );
  560. tw = _VEC_ZEROF;
  561. tw = SetWSIMD( tw, t1 );
  562. tw = XorSIMD( tw, AndSIMD( qt1, (fltx4)_VEC_ZEROZEROZEROSIGN ) );
  563. qt1 = AddSIMD( qt1, tw );
  564. QuaternionNormalizeSIMD( qt1 );
  565. StoreUnalignedSIMD( qt.Base(), qt1 );
  566. AssertFatal( qt.IsValid() );
  567. }
  568. void QuaternionScale_PS3( const Quaternion &p, float t, Quaternion &q )
  569. {
  570. fltx4 p1, q1;
  571. p1 = LoadUnalignedSIMD( &p );
  572. q1 = QuaternionScaleSIMD( p1, t );
  573. StoreUnalignedSIMD( q.Base(), q1 );
  574. AssertFatal( q.IsValid() );
  575. }
  576. void QuaternionAdd_PS3( const Quaternion &p, const Quaternion &q, Quaternion &qt )
  577. {
  578. fltx4 p1, q1, qt1;
  579. fltx4 q2;
  580. p1 = LoadUnalignedSIMD( &p );
  581. q1 = LoadUnalignedSIMD( &q );
  582. q2 = QuaternionAlignSIMD( p1, q1 );
  583. qt1 = AddSIMD( p1, q2 );
  584. StoreUnalignedSIMD( qt.Base(), qt1 );
  585. AssertFatal( qt.IsValid() );
  586. }
  587. float QuaternionDotProduct_PS3( const Quaternion &p, const Quaternion &q )
  588. {
  589. fltx4 p1, q1, qt1;
  590. p1 = LoadUnalignedSIMD( &p );
  591. q1 = LoadUnalignedSIMD( &q );
  592. qt1 = Dot4SIMD( p1, q1 );
  593. #if !defined(__SPU__)
  594. QuaternionAligned qt;
  595. StoreAlignedSIMD( qt.Base(), qt1 );
  596. AssertFatal( qt.IsValid() );
  597. #endif
  598. return GetComponentSIMD( qt1, 0 );
  599. }
  600. void QuaternionMult_PS3( const Quaternion &p, const Quaternion &q, Quaternion &qt )
  601. {
  602. fltx4 p1, q1, qt1;
  603. p1 = LoadUnalignedSIMD( &p );
  604. q1 = LoadUnalignedSIMD( &q );
  605. qt1 = QuaternionMultSIMD( p1, q1 );
  606. StoreUnalignedSIMD( qt.Base(), qt1 );
  607. AssertFatal( qt.IsValid() );
  608. }
  609. //-----------------------------------------------------------------------------
  610. // Purpose: build boneToWorld transforms for a specific bone
  611. //-----------------------------------------------------------------------------
  612. void BuildBoneChain_PS3(
  613. const int *pBoneParent,
  614. const matrix3x4a_t &rootxform,
  615. const BoneVector pos[],
  616. const BoneQuaternion q[],
  617. int iBone,
  618. matrix3x4a_t *pBoneToWorld )
  619. {
  620. CBoneBitList_PS3 boneComputed;
  621. BuildBoneChainPartial_PS3( pBoneParent, rootxform, pos, q, iBone, pBoneToWorld, boneComputed, -1 );
  622. return;
  623. }
  624. void BuildBoneChain_PS3(
  625. const int *pBoneParent,
  626. const matrix3x4a_t &rootxform,
  627. const BoneVector pos[],
  628. const BoneQuaternion q[],
  629. int iBone,
  630. matrix3x4a_t *pBoneToWorld,
  631. CBoneBitList_PS3 &boneComputed )
  632. {
  633. BuildBoneChainPartial_PS3( pBoneParent, rootxform, pos, q, iBone, pBoneToWorld, boneComputed, -1 );
  634. }
//-----------------------------------------------------------------------------
// Build the bone-to-world transform for iBone, recursively building any
// uncomputed ancestors first. Recursion stops at a bone with no parent (-1)
// or at iRoot; there the supplied rootxform is used as the parent space.
// boneComputed memoizes finished bones so shared ancestors are built once.
//-----------------------------------------------------------------------------
void BuildBoneChainPartial_PS3(
	const int *pBoneParent,
	const matrix3x4a_t &rootxform,
	const BoneVector pos[],
	const BoneQuaternion q[],
	int iBone,
	matrix3x4a_t *pBoneToWorld,
	CBoneBitList_PS3 &boneComputed,
	int iRoot )
{
	if ( boneComputed.IsBoneMarked( iBone ) )
		return;		// already built by an earlier call

	// this bone's local (parent-relative) transform
	matrix3x4a_t bonematrix;
	QuaternionMatrix_PS3( q[ iBone ], pos[ iBone ], bonematrix );

	int parent = pBoneParent[ iBone ];
	if( parent == -1 || iBone == iRoot )
	{
		// chain root: parent space is the supplied root transform
		ConcatTransforms_Aligned_PS3( rootxform, bonematrix, pBoneToWorld[ iBone ] );
	}
	else
	{
		// evil recursive!!! ensure the parent is built, then append ours
		BuildBoneChainPartial_PS3( pBoneParent, rootxform, pos, q, parent, pBoneToWorld, boneComputed, iRoot );
		ConcatTransforms_Aligned_PS3( pBoneToWorld[ parent ], bonematrix, pBoneToWorld[ iBone ]);
	}

	boneComputed.MarkBone( iBone );
}
  662. //-----------------------------------------------------------------------------
  663. // Purpose: qt = ( s * p ) * q
  664. //-----------------------------------------------------------------------------
  665. void QuaternionSM_PS3( float s, const Quaternion &p, const Quaternion &q, Quaternion &qt )
  666. {
  667. // Quaternion p1, q1;
  668. //
  669. // QuaternionScale_PS3( p, s, p1 );
  670. // QuaternionMult_PS3( p1, q, q1 );
  671. // QuaternionNormalize_PS3( q1 );
  672. // qt[0] = q1[0];
  673. // qt[1] = q1[1];
  674. // qt[2] = q1[2];
  675. // qt[3] = q1[3];
  676. fltx4 p1, q1, qt1;
  677. p1 = LoadUnalignedSIMD( &p );
  678. q1 = LoadUnalignedSIMD( &q );
  679. p1 = QuaternionScaleSIMD( p1, s );
  680. q1 = QuaternionMultSIMD( p1, q1 );
  681. qt1 = QuaternionNormalizeSIMD( q1 );
  682. StoreUnalignedSIMD( qt.Base(), qt1 );
  683. }
  684. //-----------------------------------------------------------------------------
  685. // Purpose: qt = p * ( s * q )
  686. //-----------------------------------------------------------------------------
  687. void QuaternionMA_PS3( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt )
  688. {
  689. // Quaternion p1, q1;
  690. //
  691. // QuaternionScale_PS3( q, s, q1 );
  692. // QuaternionMult_PS3( p, q1, p1 );
  693. // QuaternionNormalize_PS3( p1 );
  694. // qt[0] = p1[0];
  695. // qt[1] = p1[1];
  696. // qt[2] = p1[2];
  697. // qt[3] = p1[3];
  698. fltx4 p1, q1, qt1;
  699. q1 = LoadUnalignedSIMD( &q );
  700. p1 = LoadUnalignedSIMD( &p );
  701. q1 = QuaternionScaleSIMD( q1, s );
  702. p1 = QuaternionMultSIMD( p1, q1 );
  703. qt1 = QuaternionNormalizeSIMD( p1 );
  704. StoreUnalignedSIMD( qt.Base(), qt1 );
  705. }
  706. //-----------------------------------------------------------------------------
  707. // Purpose: qt = p + s * q
  708. //-----------------------------------------------------------------------------
  709. void QuaternionAccumulate_PS3( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt )
  710. {
  711. // Quaternion q2;
  712. // QuaternionAlign_PS3( p, q, q2 );
  713. //
  714. // qt[0] = p[0] + s * q2[0];
  715. // qt[1] = p[1] + s * q2[1];
  716. // qt[2] = p[2] + s * q2[2];
  717. // qt[3] = p[3] + s * q2[3];
  718. fltx4 p1, s1, q1, qt1;
  719. p1 = LoadUnalignedSIMD( &p );
  720. q1 = LoadUnalignedSIMD( &q );
  721. s1 = ReplicateX4( s );
  722. qt1 = QuaternionAlignSIMD( p1, q1 );
  723. qt1 = MaddSIMD( qt1, s1, p1 );
  724. StoreUnalignedSIMD( qt.Base(), qt1 );
  725. }
//-----------------------------------------------------------------------------
// Kick off DMA fetches of the sequence's bone map (optional) and bone weight
// (required) arrays into SPU local store. Arrays are sized by the largest
// bone count among the job and its referenced blend anims. The transfers are
// tagged DMATAG_ANIM_SYNC_BONEMAPWEIGHT; the caller must sync on that tag
// before dereferencing the returned pointers.
//-----------------------------------------------------------------------------
void GetBoneMapBoneWeight_SPU( bonejob_SPU *pBonejob, accumposeentry_SPU *pPoseEntry, int *&pLS_boneMap, float *&pLS_boneWeight )
{
	// worst-case bone count across the base job and all referenced anims
	int maxAnimBones = pBonejob->numBones;
	for( int lp = 0; lp < MAX_BLENDANIMS; lp++ )
	{
		if( pPoseEntry->animIndices[lp] != -1 )
		{
			maxAnimBones = MAX( maxAnimBones, pPoseEntry->anims[ pPoseEntry->animIndices[lp] ].animstudiohdr_numbones );
		}
	}

	// bone map is optional - only fetch when the sequence group provides one
	if( pPoseEntry->pEA_seqgroup_boneMap )
	{
		pLS_boneMap = (int *)SPUmemcpy_UnalignedGet_MustSync( pLS_boneMap, (uint32)pPoseEntry->pEA_seqgroup_boneMap, sizeof(int) * maxAnimBones, DMATAG_ANIM_SYNC_BONEMAPWEIGHT );
	}

	// bone weights are required
	Assert( pPoseEntry->pEA_seqdesc_boneWeight );
	pLS_boneWeight = (float *)SPUmemcpy_UnalignedGet_MustSync( pLS_boneWeight, (uint32)pPoseEntry->pEA_seqdesc_boneWeight, sizeof(float) * maxAnimBones, DMATAG_ANIM_SYNC_BONEMAPWEIGHT );
}
//-----------------------------------------------------------------------------
// Purpose: blend together in world space q1,pos1 with q2,pos2. Return result in q1,pos1.
// 0 returns q1, pos1. 1 returns q2, pos2
//
// For each masked-in bone, both poses are flattened to bone-to-world
// transforms, slerped in world space, then the result is back-solved into
// the bone's local space relative to its (already blended) parent.
// Scratch matrices live on the SPU local-store stack and are popped LIFO
// before returning.
//-----------------------------------------------------------------------------
void WorldSpaceSlerp_SPU(
	bonejob_SPU* pSPUJob,
	accumposeentry_SPU *pPoseEntry,
	BoneQuaternion *q1,
	BoneVector *pos1,
	const BoneQuaternion *q2,
	const BoneVector *pos2,
	const int *boneMap,
	const float *boneWeight,
	float s,
	int boneMask )
{
	SNPROF_ANIM("WorldSpaceSlerp_SPU");

	int i, j;
	float s1; // weight of parent for q2, pos2
	float s2; // weight for q2, pos2

	// make fake root transform
	matrix3x4a_t rootXform;
	SetIdentityMatrix_PS3( rootXform );

	// Scratch bone-to-world matrices for the q2/pos2 pose (src), the
	// q1/pos1 pose (dest), and the blended target pose. All three are
	// pushed on the LS stack and must be popped in reverse order below.
	matrix3x4a_t *srcBoneToWorld = (matrix3x4a_t *)PushLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );
	CBoneBitList_PS3 srcBoneComputed;

	matrix3x4a_t *destBoneToWorld = (matrix3x4a_t *)PushLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );
	CBoneBitList_PS3 destBoneComputed;

	matrix3x4a_t *targetBoneToWorld = (matrix3x4a_t *)PushLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );

	// get bonemap and boneweights
	for( i = 0; i < pSPUJob->numBones; i++ )
	{
		// skip unused bones
		if ( !( pSPUJob->boneFlags[ i ] & boneMask ) )
		{
			continue;
		}

		// n = parent bone index, or -1 for a root bone.
		int n = pSPUJob->boneParent[ i ];
		s1 = 0.0f;
		if( pPoseEntry->pEA_seqgroup_boneMap )
		{
			// Remap through the sequence group's bone map; unmapped
			// bones (j < 0) contribute nothing.
			j = boneMap[ i ];
			if( j >= 0 )
			{
				s2 = s * boneWeight[ j ]; // blend in based on this bones weight
				if( n != -1 )
				{
					s1 = s * boneWeight[ boneMap[ n ] ];
				}
			}
			else
			{
				s2 = 0.0f;
			}
		}
		else
		{
			s2 = s * boneWeight[ i ]; // blend in based on this bones weight
			if (n != -1)
			{
				s1 = s * boneWeight[ n ];
			}
		}

		if( s1 == 1.0f && s2 == 1.0f )
		{
			// Bone and its parent are both fully weighted: plain copy,
			// no world-space math needed.
			pos1[i] = pos2[i];
			q1[i] = q2[i];
		}
		else if( s2 > 0.0f )
		{
			BoneQuaternion srcQ, destQ;
			BoneVector srcPos, destPos;
			BoneQuaternion targetQ;
			BoneVector targetPos; // NOTE(review): declared but never used below
			BoneVector tmp;

			// Flatten both poses up the parent chain to bone-to-world,
			// then slerp the world-space orientations by s2. The target
			// keeps the dest pose's world translation.
			BuildBoneChain_PS3( pSPUJob->boneParent, rootXform, pos1, q1, i, destBoneToWorld, destBoneComputed );
			BuildBoneChain_PS3( pSPUJob->boneParent, rootXform, pos2, q2, i, srcBoneToWorld, srcBoneComputed );
			MatrixAngles_PS3( destBoneToWorld[i], destQ, destPos );
			MatrixAngles_PS3( srcBoneToWorld[i], srcQ, srcPos );
			QuaternionSlerp_PS3( destQ, srcQ, s2, targetQ );
			AngleMatrix_PS3( targetQ, destPos, targetBoneToWorld[i] );

			// back solve
			if( n == -1 )
			{
				// Root bone: world space is local space.
				MatrixAngles_PS3( targetBoneToWorld[i], q1[i], tmp );
			}
			else
			{
				// Express the blended world transform relative to the
				// parent's blended world transform to recover local q1.
				matrix3x4a_t worldToBone;
				MatrixInvert_PS3( targetBoneToWorld[n], worldToBone );

				matrix3x4a_t local;
				ConcatTransforms_Aligned_PS3( worldToBone, targetBoneToWorld[i], local );
				MatrixAngles_PS3( local, q1[i], tmp );

				// blend bone lengths (local space)
				pos1[i] = Lerp_PS3( s2, pos1[i], pos2[i] );
			}
		}
	}

	// Pop the three scratch matrix arrays (LIFO).
	PopLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );
	PopLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );
	PopLSStack( sizeof(matrix3x4a_t) * pSPUJob->maxBones );
}
//-----------------------------------------------------------------------------
// Purpose: blend together q1,pos1 with q2,pos2. Return result in q1,pos1.
// 0 returns q1, pos1. 1 returns q2, pos2
//
// assumes p, q arrays aligned
//
// Paths, in order:
//   - STUDIO_WORLD sequences delegate to WorldSpaceSlerp_SPU.
//   - A per-bone weight table pS2[] is built on the LS stack.
//   - STUDIO_DELTA sequences take an additive (multiply/accumulate) path,
//     four bones at a time plus a scalar straggler loop, then return early.
//   - Otherwise bones are corralled into "needs aligned slerp" and
//     "fixed-alignment slerp" lists, each processed four-wide plus
//     scalar stragglers.
// All LS stack allocations are popped LIFO on every exit path.
//-----------------------------------------------------------------------------
void SlerpBones_SPU(
	bonejob_SPU* pSPUJob,
	accumposeentry_SPU *pPoseEntry,
	BoneQuaternion *q1,
	BoneVector *pos1,
	const BoneQuaternion *q2,
	const BoneVector *pos2,
	const int *boneMap,
	const float *boneWeight,
	float s,
	int boneMask )
{
	SNPROF_ANIM( "SlerpBones_SPU" );

	// Assert 16-byte alignment of in and out arrays.
	// AssertMsg(
	// ((reinterpret_cast<unsigned int>(q1) & 0x0F)==0) &&
	// ((reinterpret_cast<unsigned int>(q2) & 0x0F)==0) ,
	// "Input arrays to SlerpBones are not aligned! Catastrophe is inevitable.\n");

	// Clamp the blend fraction to (0, 1]; zero or negative means "keep q1/pos1".
	if( s <= 0.0f )
		return;
	if( s > 1.0f )
	{
		s = 1.0f;
	}

	// World-space sequences blend in world space instead of parent-local space.
	if( pPoseEntry->seqdesc_flags & STUDIO_WORLD )
	{
		WorldSpaceSlerp_SPU( pSPUJob, pPoseEntry, q1, pos1, q2, pos2, boneMap, boneWeight, s, boneMask );
		return;
	}

	int i;

	// Build weightlist for all bones: pS2[i] is the effective blend weight for
	// bone i (0 for masked-out or unmapped bones).
	int nBoneCount = pSPUJob->numBones;
	float *pS2 = (float *)PushLSStack( sizeof(float) * nBoneCount );

	// Hoist the bone-map branch outside of the inner loop for speed
	// (even correctly predicted branches are an eight cycle latency).
	if( pPoseEntry->pEA_seqgroup_boneMap )
	{
		for( i = 0; i < nBoneCount; i++ )
		{
			// skip unused bones
			if( !( pSPUJob->boneFlags[ i ] & boneMask ) ||
				boneMap[ i ] < 0 )
			{
				pS2[i] = 0.0f;
			}
			else
			{
				// boneMap[i] is not a float, don't be lured by the siren call of fcmp
				pS2[i] = s * boneWeight[ boneMap[ i ] ];
			}
		}
	}
	else // no bone map
	{
		for( i = 0; i < nBoneCount; i++ )
		{
			// skip unused bones
			if( !( pSPUJob->boneFlags[ i ] & boneMask ) )
			{
				pS2[i] = 0.0f;
			}
			else
			{
				pS2[i] = s * boneWeight[ i ]; // blend in based on this bones weight
			}
		}
	}

	float weight;
	int nBoneCountRoundedFour = ( nBoneCount ) & (~(3));

	if( pPoseEntry->seqdesc_flags & STUDIO_DELTA )
	{
		// Additive animation: q2 is multiplied into q1 (pre or post depending
		// on STUDIO_POST) and pos2 is accumulated, scaled by the bone weight.
		// Do as many as we can four at a time, then take care of stragglers.
		for( i = 0; i < nBoneCountRoundedFour; i+=4 )
		{
			fltx4 weightfour = LoadAlignedSIMD(pS2+i); // four weights

			FourQuaternions q1four, q2four;
			FourQuaternions result;
			q1four.LoadAndSwizzleAligned(q1+i); // four quaternions
			q2four.LoadAndSwizzleAligned(q2+i); // four quaternions

			if( pPoseEntry->seqdesc_flags & STUDIO_POST )
			{
				// result = q1 * ( weight * q2 )
				result = q1four.MulAc(weightfour, q2four);
			}
			else
			{
				// result = ( s * q1 ) * q2
				result = q2four.ScaleMul(weightfour, q1four);
			}

			// mask out unused channels, replacing them with original data
			{
				bi32x4 tinyScales = CmpLeSIMD( weightfour, Four_Zeros );
				result.x = MaskedAssign(tinyScales, q1four.x, result.x);
				result.y = MaskedAssign(tinyScales, q1four.y, result.y);
				result.z = MaskedAssign(tinyScales, q1four.z, result.z);
				result.w = MaskedAssign(tinyScales, q1four.w, result.w);
			}
			result.SwizzleAndStoreAlignedMasked(q1+i, CmpGtSIMD(weightfour,Four_Zeros) );

			// Accumulate positions: pos1 += weight * pos2, but only commit
			// lanes whose weight is > 0 (keep the original otherwise).
			fltx4 originalpos1simd[4], pos1simd[4], pos2simd[4];
			originalpos1simd[0] = pos1simd[0] = LoadAlignedSIMD(pos1[i+0].Base());
			originalpos1simd[1] = pos1simd[1] = LoadAlignedSIMD(pos1[i+1].Base());
			originalpos1simd[2] = pos1simd[2] = LoadAlignedSIMD(pos1[i+2].Base());
			originalpos1simd[3] = pos1simd[3] = LoadAlignedSIMD(pos1[i+3].Base());
			pos2simd[0] = LoadAlignedSIMD(pos2[i+0].Base());
			pos2simd[1] = LoadAlignedSIMD(pos2[i+1].Base());
			pos2simd[2] = LoadAlignedSIMD(pos2[i+2].Base());
			pos2simd[3] = LoadAlignedSIMD(pos2[i+3].Base());

			fltx4 splatweights[4] = { SplatXSIMD(weightfour),
									  SplatYSIMD(weightfour),
									  SplatZSIMD(weightfour),
									  SplatWSIMD(weightfour) };
			fltx4 Zero = Four_Zeros;

			// Each splatweights[k] is reused as the per-bone write mask
			// after the madd consumes it.
			pos1simd[0] = MaddSIMD(pos2simd[0], splatweights[0], pos1simd[0] );
			splatweights[0] = ( fltx4 ) CmpGtSIMD(splatweights[0], Zero);
			pos1simd[1] = MaddSIMD(pos2simd[1], splatweights[1], pos1simd[1] );
			splatweights[1] = ( fltx4 ) CmpGtSIMD(splatweights[1], Zero);
			pos1simd[2] = MaddSIMD(pos2simd[2], splatweights[2], pos1simd[2] );
			splatweights[2] = ( fltx4 ) CmpGtSIMD(splatweights[2], Zero);
			pos1simd[3] = MaddSIMD(pos2simd[3], splatweights[3], pos1simd[3] );
			splatweights[3] = ( fltx4 ) CmpGtSIMD(splatweights[3], Zero);

			// mask out unweighted bones
			StoreAlignedSIMD( pos1[i + 0].Base(), MaskedAssign( ( bi32x4 ) splatweights[0], pos1simd[0], originalpos1simd[0] ) );
			StoreAlignedSIMD( pos1[i + 1].Base(), MaskedAssign( ( bi32x4 ) splatweights[1], pos1simd[1], originalpos1simd[1] ) );
			StoreAlignedSIMD( pos1[i + 2].Base(), MaskedAssign( ( bi32x4 ) splatweights[2], pos1simd[2], originalpos1simd[2] ) );
			StoreAlignedSIMD( pos1[i + 3].Base(), MaskedAssign( ( bi32x4 ) splatweights[3], pos1simd[3], originalpos1simd[3] ) );
		}

		// take care of stragglers (last nBoneCount % 4 bones), scalar path
		for( ; i < nBoneCount; i++ )
		{
			weight = pS2[i];
			if ( weight <= 0.0f )
				continue;

			if ( pPoseEntry->seqdesc_flags & STUDIO_POST )
			{
				QuaternionMA_PS3( q1[i], weight, q2[i], q1[i] );
				// FIXME: are these correct?
				pos1[i][0] = pos1[i][0] + pos2[i][0] * weight;
				pos1[i][1] = pos1[i][1] + pos2[i][1] * weight;
				pos1[i][2] = pos1[i][2] + pos2[i][2] * weight;
			}
			else
			{
				QuaternionSM_PS3( weight, q2[i], q1[i], q1[i] );
				// FIXME: are these correct?
				pos1[i][0] = pos1[i][0] + pos2[i][0] * weight;
				pos1[i][1] = pos1[i][1] + pos2[i][1] * weight;
				pos1[i][2] = pos1[i][2] + pos2[i][2] * weight;
			}
		}

		// pS2 — pop before the additive early-out.
		PopLSStack( sizeof(float) * nBoneCount );
		return;
	}

	//// SLERP PHASE
	// Some bones need to be slerped with alignment.
	// Others do not.
	// Some need to be ignored altogether.
	// Build arrays indicating which are which.
	// This is the corral approach. Another approach
	// would be to compute both the aligned and unaligned
	// slerps of each bone in the first pass through the
	// array, and then do a masked selection of each
	// based on the masks. However there really isn't
	// a convenient way to turn the int flags that
	// specify which approach to take, into fltx4 masks.
	int *aBonesSlerpAlign = (int *)PushLSStack( sizeof(int) * nBoneCount );
	float *aBonesSlerpAlignWeights = (float *)PushLSStack( sizeof(float) * nBoneCount );
	int *aBonesSlerpNoAlign = (int *)PushLSStack( sizeof(int) * nBoneCount );
	float *aBonesSlerpNoAlignWeights = (float *)PushLSStack( sizeof(float) * nBoneCount );
	int numBonesSlerpAlign = 0;
	int numBonesSlerpNoAlign = 0;

	// sweep forward through the array and determine where to corral each bone.
	// Weight 1 bones are copied outright; weight 0 bones are skipped, so both
	// corral lists contain only strictly-positive weights.
	for( i = 0 ; i < nBoneCount ; ++i )
	{
		float weight = pS2[i]; // NOTE(review): shadows the outer 'weight' local
		if( weight == 1.0f )
		{
			q1[i] = q2[i];
			pos1[i] = pos2[i];
		}
		else if( weight > 0.0f ) // ignore small bones
		{
			if( pSPUJob->boneFlags[ i ] & BONE_FIXED_ALIGNMENT )
			{
				aBonesSlerpNoAlign[ numBonesSlerpNoAlign ] = i;
				aBonesSlerpNoAlignWeights[ numBonesSlerpNoAlign ] = weight;
				++numBonesSlerpNoAlign;
			}
			else
			{
				aBonesSlerpAlign[ numBonesSlerpAlign ] = i;
				aBonesSlerpAlignWeights[ numBonesSlerpAlign ] = weight;
				++numBonesSlerpAlign;
			}
		}
	}

	// okay, compute all the aligned, and all the unaligned bones, four at
	// a time if possible.
	const fltx4 One = Four_Ones;

	/////////////////
	// // // Aligned!
	nBoneCountRoundedFour = ( numBonesSlerpAlign ) & ~3;
	for( i = 0 ; i < nBoneCountRoundedFour ; i+=4 )
	{
		fltx4 weights = LoadAlignedSIMD( aBonesSlerpAlignWeights+i );
		fltx4 oneMinusWeight = SubSIMD(One, weights);

		// position component:
		// pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight;
		fltx4 pos1simd[4];
		fltx4 pos2simd[4];
		pos1simd[0] = LoadAlignedSIMD(pos1[aBonesSlerpAlign[i+0]].Base());
		pos1simd[1] = LoadAlignedSIMD(pos1[aBonesSlerpAlign[i+1]].Base());
		pos1simd[2] = LoadAlignedSIMD(pos1[aBonesSlerpAlign[i+2]].Base());
		pos1simd[3] = LoadAlignedSIMD(pos1[aBonesSlerpAlign[i+3]].Base());
		pos2simd[0] = LoadAlignedSIMD(pos2[aBonesSlerpAlign[i+0]].Base());
		pos2simd[1] = LoadAlignedSIMD(pos2[aBonesSlerpAlign[i+1]].Base());
		pos2simd[2] = LoadAlignedSIMD(pos2[aBonesSlerpAlign[i+2]].Base());
		pos2simd[3] = LoadAlignedSIMD(pos2[aBonesSlerpAlign[i+3]].Base());
		pos1simd[0] = MulSIMD( SplatXSIMD(oneMinusWeight) , pos1simd[0] );
		pos1simd[1] = MulSIMD( SplatYSIMD(oneMinusWeight) , pos1simd[1] );
		pos1simd[2] = MulSIMD( SplatZSIMD(oneMinusWeight) , pos1simd[2] );
		pos1simd[3] = MulSIMD( SplatWSIMD(oneMinusWeight) , pos1simd[3] );

		fltx4 posWriteMasks[4]; // don't overwrite where there was zero weight
		// NOTE(review): posWriteMasks is computed but never consumed below —
		// the stores are unmasked. Harmless here because the corral loop only
		// admits weights > 0, so every mask would be all-ones anyway.
		{
			fltx4 splatweights[4];
			fltx4 Zero = Four_Zeros;
			splatweights[0] = SplatXSIMD(weights);
			splatweights[1] = SplatYSIMD(weights);
			splatweights[2] = SplatZSIMD(weights);
			splatweights[3] = SplatWSIMD(weights);
			pos1simd[0] = MaddSIMD( splatweights[0] , pos2simd[0], pos1simd[0] );
			posWriteMasks[0] = ( fltx4 ) CmpGtSIMD(splatweights[0], Zero);
			pos1simd[1] = MaddSIMD( splatweights[1] , pos2simd[1], pos1simd[1] );
			posWriteMasks[1] = ( fltx4 ) CmpGtSIMD(splatweights[1], Zero);
			pos1simd[2] = MaddSIMD( splatweights[2] , pos2simd[2], pos1simd[2] );
			posWriteMasks[2] = ( fltx4 ) CmpGtSIMD(splatweights[2], Zero);
			pos1simd[3] = MaddSIMD( splatweights[3] , pos2simd[3], pos1simd[3] );
			posWriteMasks[3] = ( fltx4 ) CmpGtSIMD(splatweights[3], Zero);
		}

		FourQuaternions q1four, q2four, result;
		q1four.LoadAndSwizzleAligned( q1 + aBonesSlerpAlign[i+0],
									  q1 + aBonesSlerpAlign[i+1],
									  q1 + aBonesSlerpAlign[i+2],
									  q1 + aBonesSlerpAlign[i+3] );
#if 0
		// FIXME: the SIMD slerp doesn't handle quaternions that have opposite signs
		q2four.LoadAndSwizzleAligned( q2 + aBonesSlerpAlign[i+0],
									  q2 + aBonesSlerpAlign[i+1],
									  q2 + aBonesSlerpAlign[i+2],
									  q2 + aBonesSlerpAlign[i+3] );
		result = q2four.Slerp(q1four, oneMinusWeight);
#else
		// force the quaternions to be the same sign (< 180 degree separation)
		BoneQuaternion q20, q21, q22, q23;
		QuaternionAlign_PS3( q1[aBonesSlerpAlign[i+0]], q2[aBonesSlerpAlign[i+0]], q20 );
		QuaternionAlign_PS3( q1[aBonesSlerpAlign[i+1]], q2[aBonesSlerpAlign[i+1]], q21 );
		QuaternionAlign_PS3( q1[aBonesSlerpAlign[i+2]], q2[aBonesSlerpAlign[i+2]], q22 );
		QuaternionAlign_PS3( q1[aBonesSlerpAlign[i+3]], q2[aBonesSlerpAlign[i+3]], q23 );
		q2four.LoadAndSwizzleAligned( &q20, &q21, &q22, &q23 );
		result = q2four.SlerpNoAlign(q1four, oneMinusWeight);
#endif
		result.SwizzleAndStoreAligned( q1 + aBonesSlerpAlign[i+0],
									   q1 + aBonesSlerpAlign[i+1],
									   q1 + aBonesSlerpAlign[i+2],
									   q1 + aBonesSlerpAlign[i+3] );
		StoreAlignedSIMD( pos1[aBonesSlerpAlign[i+0]].Base(), pos1simd[0] );
		StoreAlignedSIMD( pos1[aBonesSlerpAlign[i+1]].Base(), pos1simd[1] );
		StoreAlignedSIMD( pos1[aBonesSlerpAlign[i+2]].Base(), pos1simd[2] );
		StoreAlignedSIMD( pos1[aBonesSlerpAlign[i+3]].Base(), pos1simd[3] );
	}

	// handle stragglers (scalar slerp with alignment)
	for( ; i < numBonesSlerpAlign ; ++i )
	{
		BoneQuaternion q3;
		weight = aBonesSlerpAlignWeights[i];
		int k = aBonesSlerpAlign[i];
		float s1 = 1.0 - weight;
		QuaternionSlerp_PS3( q2[k], q1[k], s1, q3 );
		q1[k][0] = q3[0];
		q1[k][1] = q3[1];
		q1[k][2] = q3[2];
		q1[k][3] = q3[3];
		pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight;
		pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight;
		pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight;
	}

	///////////////////
	// // // Unaligned! (BONE_FIXED_ALIGNMENT bones: slerp without sign-flip)
	nBoneCountRoundedFour = (numBonesSlerpNoAlign) & ~3;
	for( i = 0 ; i < nBoneCountRoundedFour ; i+=4 )
	{
		fltx4 weights = LoadAlignedSIMD( aBonesSlerpNoAlignWeights+i );
		fltx4 oneMinusWeight = SubSIMD(One, weights);

		// position component:
		// pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * weight;
		fltx4 pos1simd[4];
		fltx4 pos2simd[4];
		pos1simd[0] = LoadAlignedSIMD(pos1[aBonesSlerpNoAlign[i+0]].Base());
		pos1simd[1] = LoadAlignedSIMD(pos1[aBonesSlerpNoAlign[i+1]].Base());
		pos1simd[2] = LoadAlignedSIMD(pos1[aBonesSlerpNoAlign[i+2]].Base());
		pos1simd[3] = LoadAlignedSIMD(pos1[aBonesSlerpNoAlign[i+3]].Base());
		pos2simd[0] = LoadAlignedSIMD(pos2[aBonesSlerpNoAlign[i+0]].Base());
		pos2simd[1] = LoadAlignedSIMD(pos2[aBonesSlerpNoAlign[i+1]].Base());
		pos2simd[2] = LoadAlignedSIMD(pos2[aBonesSlerpNoAlign[i+2]].Base());
		pos2simd[3] = LoadAlignedSIMD(pos2[aBonesSlerpNoAlign[i+3]].Base());
		pos1simd[0] = MulSIMD( SplatXSIMD(oneMinusWeight) , pos1simd[0] );
		pos1simd[1] = MulSIMD( SplatYSIMD(oneMinusWeight) , pos1simd[1] );
		pos1simd[2] = MulSIMD( SplatZSIMD(oneMinusWeight) , pos1simd[2] );
		pos1simd[3] = MulSIMD( SplatWSIMD(oneMinusWeight) , pos1simd[3] );
		pos1simd[0] = MaddSIMD( SplatXSIMD(weights) , pos2simd[0], pos1simd[0] );
		pos1simd[1] = MaddSIMD( SplatYSIMD(weights) , pos2simd[1], pos1simd[1] );
		pos1simd[2] = MaddSIMD( SplatZSIMD(weights) , pos2simd[2], pos1simd[2] );
		pos1simd[3] = MaddSIMD( SplatWSIMD(weights) , pos2simd[3], pos1simd[3] );

		FourQuaternions q1four, q2four, result;
		q1four.LoadAndSwizzleAligned( q1 + aBonesSlerpNoAlign[i+0],
									  q1 + aBonesSlerpNoAlign[i+1],
									  q1 + aBonesSlerpNoAlign[i+2],
									  q1 + aBonesSlerpNoAlign[i+3] );
		q2four.LoadAndSwizzleAligned( q2 + aBonesSlerpNoAlign[i+0],
									  q2 + aBonesSlerpNoAlign[i+1],
									  q2 + aBonesSlerpNoAlign[i+2],
									  q2 + aBonesSlerpNoAlign[i+3] );
		result = q2four.SlerpNoAlign(q1four, oneMinusWeight);
		result.SwizzleAndStoreAligned( q1 + aBonesSlerpNoAlign[i+0],
									   q1 + aBonesSlerpNoAlign[i+1],
									   q1 + aBonesSlerpNoAlign[i+2],
									   q1 + aBonesSlerpNoAlign[i+3] );
		StoreAlignedSIMD(pos1[aBonesSlerpNoAlign[i+0]].Base(), pos1simd[0]);
		StoreAlignedSIMD(pos1[aBonesSlerpNoAlign[i+1]].Base(), pos1simd[1]);
		StoreAlignedSIMD(pos1[aBonesSlerpNoAlign[i+2]].Base(), pos1simd[2]);
		StoreAlignedSIMD(pos1[aBonesSlerpNoAlign[i+3]].Base(), pos1simd[3]);
	}

	// handle stragglers (scalar slerp without alignment)
	for( ; i < numBonesSlerpNoAlign ; ++i )
	{
		weight = aBonesSlerpNoAlignWeights[ i ];
		int k = aBonesSlerpNoAlign[ i ];
		float s1 = 1.0 - weight;
		BoneQuaternion q3;
		QuaternionSlerpNoAlign_PS3( q2[ k ], q1[ k ], s1, q3 );
		q1[k][0] = q3[0];
		q1[k][1] = q3[1];
		q1[k][2] = q3[2];
		q1[k][3] = q3[3];
		pos1[k][0] = pos1[k][0] * s1 + pos2[k][0] * weight;
		pos1[k][1] = pos1[k][1] * s1 + pos2[k][1] * weight;
		pos1[k][2] = pos1[k][2] * s1 + pos2[k][2] * weight;
	}

	// Pop all LS stack allocations in reverse order of the pushes above.
	// aBonesSlerpNoAlignWeights
	PopLSStack( sizeof(float) * nBoneCount );
	// aBonesSlerpNoAlign
	PopLSStack( sizeof(int) * nBoneCount );
	// aBonesSlerpAlignWeights
	PopLSStack( sizeof(float) * nBoneCount );
	// aBonesSlerpAlign
	PopLSStack( sizeof(int) * nBoneCount );
	// pS2
	PopLSStack( sizeof(float) * nBoneCount );
}
// Compile-time log2 metafunction. The primary template is intentionally
// empty so that only values with an explicit specialization compile;
// using an unsupported N is a build error rather than a silent wrong answer.
template <int N>
struct GetLog2_t
{};
// 0x00100000 == 1 << 20, so kLog2 = 20.
template<>
struct GetLog2_t<0x00100000>
{
	enum {kLog2 = 20};
};
  1316. //---------------------------------------------------------------------
  1317. // Make sure quaternions are within 180 degrees of one another, if not, reverse q
  1318. //---------------------------------------------------------------------
  1319. FORCEINLINE fltx4 BoneQuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
  1320. {
  1321. // decide if one of the quaternions is backwards
  1322. bi32x4 cmp = CmpLtSIMD( Dot4SIMD(p,q), Four_Zeros );
  1323. fltx4 result = MaskedAssign( cmp, NegSIMD(q), q );
  1324. return result;
  1325. }
  1326. // SSE + X360 implementation
  1327. FORCEINLINE fltx4 BoneQuaternionNormalizeSIMD( const fltx4 &q )
  1328. {
  1329. fltx4 radius, result;
  1330. bi32x4 mask;
  1331. radius = Dot4SIMD( q, q );
  1332. mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
  1333. result = ReciprocalSqrtSIMD( radius );
  1334. result = MulSIMD( result, q );
  1335. return MaskedAssign( mask, q, result ); // if radius was 0, just return q
  1336. }
  1337. //-----------------------------------------------------------------------------
  1338. // Purpose: Inter-animation blend. Assumes both types are identical.
  1339. // blend together q1,pos1 with q2,pos2. Return result in q1,pos1.
  1340. // 0 returns q1, pos1. 1 returns q2, pos2
  1341. //-----------------------------------------------------------------------------
  1342. void BlendBones_PS3(
  1343. const bonejob_SPU *pBonejob,
  1344. const accumposeentry_SPU *pPoseEntry,
  1345. BoneQuaternion *q1,
  1346. BoneVector *pos1,
  1347. const int *boneMap,
  1348. const float *boneWeight,
  1349. const BoneQuaternion *q2,
  1350. const BoneVector *pos2,
  1351. float s,
  1352. int boneMask )
  1353. {
  1354. SNPROF_ANIM("BlendBones_PS3");
  1355. int i, j;
  1356. Quaternion q3;
  1357. if( s <= 0.0f )
  1358. {
  1359. Assert(0); // shouldn't have been called
  1360. return;
  1361. }
  1362. else if( s >= 1.0f )
  1363. {
  1364. //CMiniProfilerGuard mpguard(&g_lmp_BlendBones1, pStudioHdr->numbones());
  1365. Assert(0); // shouldn't have been called
  1366. for (i = 0; i < pBonejob->numBones; i++)
  1367. {
  1368. // skip unused bones
  1369. if( !( pBonejob->boneFlags[i] & boneMask) )
  1370. {
  1371. continue;
  1372. }
  1373. if( pPoseEntry->pEA_seqgroup_boneMap )
  1374. {
  1375. j = boneMap[i];
  1376. }
  1377. else
  1378. {
  1379. j = i;
  1380. }
  1381. if( j >= 0 && boneWeight[j] > 0.0f )
  1382. {
  1383. q1[i] = q2[i];
  1384. pos1[i] = pos2[i];
  1385. }
  1386. }
  1387. return;
  1388. }
  1389. float s2 = s;
  1390. float s1 = 1.0f - s2;
  1391. //CMiniProfilerGuard mpguard(&g_lmp_BlendBones2,pStudioHdr->numbones()); // 130-180 ticks without profilers; 167-190 ticks with all profilers on
  1392. int nMode = 2;//g_cv_BlendBonesMode.GetInt();
  1393. #ifndef DEDICATED
  1394. if(nMode)
  1395. {
  1396. const int numBones = pBonejob->numBones;
  1397. const int *RESTRICT pBonePseudoWeight = (int*)boneWeight;//(int*)seqdesc.pBoneweight(0); // we'll treat floats as ints to check for > 0.0
  1398. // int *RESTRICT pActiveBones = (int*)stackalloc(numBones * sizeof(int) * 2), *RESTRICT pActiveBonesEnd = pActiveBones;
  1399. int *pActiveBones = (int *)PushLSStack( numBones * sizeof(int) * 2 );
  1400. int *pActiveBonesEnd = pActiveBones;
  1401. {
  1402. // BONE_PROFILE_LOOP(BlendBoneLoop2a,numBones); // 20 ticks straight; 12-14 ticks 4 at a time; 14-19 ticks 8 at a time (compiler generated code)
  1403. i = 0;
  1404. #ifdef _X360 // on PC, this is slower
  1405. for(; i+3 < numBones; i+=4)
  1406. {
  1407. int isBoneActiveA = pStudioHdr->boneFlags(i ) & boneMask;
  1408. int isBoneActiveB = pStudioHdr->boneFlags(i+1) & boneMask;
  1409. int isBoneActiveC = pStudioHdr->boneFlags(i+2) & boneMask;
  1410. int isBoneActiveD = pStudioHdr->boneFlags(i+3) & boneMask;
  1411. isBoneActiveA = isBoneActiveA | -isBoneActiveA; // the high bit is now 1 iff the flags check
  1412. isBoneActiveB = isBoneActiveB | -isBoneActiveB; // the high bit is now 1 iff the flags check
  1413. isBoneActiveC = isBoneActiveC | -isBoneActiveC; // the high bit is now 1 iff the flags check
  1414. isBoneActiveD = isBoneActiveD | -isBoneActiveD; // the high bit is now 1 iff the flags check
  1415. isBoneActiveA = _rotl(isBoneActiveA,1) & 1; // now it's either 0 or 1
  1416. isBoneActiveB = _rotl(isBoneActiveB,1) & 1; // now it's either 0 or 1
  1417. isBoneActiveC = _rotl(isBoneActiveC,1) & 1; // now it's either 0 or 1
  1418. isBoneActiveD = _rotl(isBoneActiveD,1) & 1; // now it's either 0 or 1
  1419. *pActiveBonesEnd = i+0;
  1420. pActiveBonesEnd += isBoneActiveA;
  1421. *pActiveBonesEnd = i+1;
  1422. pActiveBonesEnd += isBoneActiveB;
  1423. *pActiveBonesEnd = i+2;
  1424. pActiveBonesEnd += isBoneActiveC;
  1425. *pActiveBonesEnd = i+3;
  1426. pActiveBonesEnd += isBoneActiveD;
  1427. }
  1428. #endif
  1429. for(; i < numBones; ++i)
  1430. {
  1431. *pActiveBonesEnd = i;
  1432. int isBoneActive = pBonejob->boneFlags[i] & boneMask;
  1433. isBoneActive = isBoneActive | -isBoneActive; // the high bit is now 1 iff the flags check
  1434. isBoneActive = _rotl(isBoneActive,1) & 1; // now it's either 0 or 1
  1435. pActiveBonesEnd += isBoneActive;
  1436. }
  1437. }
  1438. // now we have a list of bones whose flags & mask != 0
  1439. // we need to create the bone map
  1440. if( pPoseEntry->pEA_seqgroup_boneMap )// if( pSeqGroup )
  1441. {
  1442. int *pEnd = pActiveBones;
  1443. {
  1444. // BONE_PROFILE_LOOP(BlendBoneLoop2b,pActiveBonesEnd - pActiveBones);//21-25 straight; 16-18 4 at a time;
  1445. int *RESTRICT pActiveBone = pActiveBones;
  1446. #ifdef _X360 // on PC, this is slower
  1447. for(; pActiveBone + 3 < pActiveBonesEnd; pActiveBone += 4)
  1448. {
  1449. int nActiveBoneA = pActiveBone[0];
  1450. int nActiveBoneB = pActiveBone[1];
  1451. int nActiveBoneC = pActiveBone[2];
  1452. int nActiveBoneD = pActiveBone[3];
  1453. int nMappedBoneA = pSeqGroup->boneMap[nActiveBoneA];
  1454. int nMappedBoneB = pSeqGroup->boneMap[nActiveBoneB];
  1455. int nMappedBoneC = pSeqGroup->boneMap[nActiveBoneC];
  1456. int nMappedBoneD = pSeqGroup->boneMap[nActiveBoneD];
  1457. pEnd[numBones] = nMappedBoneA;
  1458. *pEnd = nActiveBoneA;
  1459. pEnd += _rotl(~nMappedBoneA,1) & 1; // if nMappedBone < 0, don't advance the end
  1460. pEnd[numBones] = nMappedBoneB;
  1461. *pEnd = nActiveBoneB;
  1462. pEnd += _rotl(~nMappedBoneB,1) & 1; // if nMappedBone < 0, don't advance the end
  1463. pEnd[numBones] = nMappedBoneC;
  1464. *pEnd = nActiveBoneC;
  1465. pEnd += _rotl(~nMappedBoneC,1) & 1; // if nMappedBone < 0, don't advance the end
  1466. pEnd[numBones] = nMappedBoneD;
  1467. *pEnd = nActiveBoneD;
  1468. pEnd += _rotl(~nMappedBoneD,1) & 1; // if nMappedBone < 0, don't advance the end
  1469. }
  1470. #endif
  1471. for(; pActiveBone < pActiveBonesEnd; ++pActiveBone)
  1472. {
  1473. int nActiveBone = *pActiveBone;
  1474. int nMappedBone = boneMap[ nActiveBone ];
  1475. pEnd[ numBones ] = nMappedBone;
  1476. *pEnd = nActiveBone;
  1477. pEnd += _rotl(~nMappedBone,1) & 1; // if nMappedBone < 0, don't advance the end
  1478. }
  1479. }
  1480. pActiveBonesEnd = pEnd; // the new end of the array of active bones, with negatively-mapped bones taken out
  1481. // now get rid of non-positively-weighted bones
  1482. pEnd = pActiveBones;
  1483. {
  1484. // BONE_PROFILE_LOOP(BlendBoneLoop2c,pActiveBonesEnd - pActiveBones);//18-23 straight; 14-17 ticks 4 at a time
  1485. int *RESTRICT pActiveBone = pActiveBones;
  1486. #ifdef _X360 // on PC, this is slower
  1487. int *RESTRICT pMappedBone = pActiveBones+numBones;
  1488. for(; pActiveBone+3 < pActiveBonesEnd; pActiveBone += 4, pMappedBone += 4)
  1489. {
  1490. int nActiveBoneA = pActiveBone[0];
  1491. int nActiveBoneB = pActiveBone[1];
  1492. int nActiveBoneC = pActiveBone[2];
  1493. int nActiveBoneD = pActiveBone[3];
  1494. int nMappedBoneA = pMappedBone[0];
  1495. int nMappedBoneB = pMappedBone[1];
  1496. int nMappedBoneC = pMappedBone[2];
  1497. int nMappedBoneD = pMappedBone[3];
  1498. int pseudoWeightA = pBonePseudoWeight[nMappedBoneA];
  1499. int pseudoWeightB = pBonePseudoWeight[nMappedBoneB];
  1500. int pseudoWeightC = pBonePseudoWeight[nMappedBoneC];
  1501. int pseudoWeightD = pBonePseudoWeight[nMappedBoneD];
  1502. *pEnd = nActiveBoneA;
  1503. pEnd += _rotl(-pseudoWeightA, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1504. *pEnd = nActiveBoneB;
  1505. pEnd += _rotl(-pseudoWeightB, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1506. *pEnd = nActiveBoneC;
  1507. pEnd += _rotl(-pseudoWeightC, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1508. *pEnd = nActiveBoneD;
  1509. pEnd += _rotl(-pseudoWeightD, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1510. }
  1511. #endif
  1512. for(; pActiveBone < pActiveBonesEnd; ++pActiveBone)
  1513. {
  1514. int nActiveBone = *pActiveBone;
  1515. int nMappedBone = pActiveBone[numBones];
  1516. int pseudoWeight = pBonePseudoWeight[nMappedBone];
  1517. *pEnd = nActiveBone;
  1518. pEnd += _rotl(-pseudoWeight, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1519. }
  1520. }
  1521. pActiveBonesEnd = pEnd;
  1522. }
  1523. else
  1524. {
  1525. // one mapping stage off
  1526. // now get rid of non-positively-weighted bones
  1527. int *pEnd = pActiveBones;
  1528. // {BONE_PROFILE_LOOP(BlendBoneLoop2d,pActiveBonesEnd-pActiveBones);//20-50
  1529. for(int *RESTRICT pActiveBone = pActiveBones; pActiveBone < pActiveBonesEnd; ++pActiveBone)
  1530. {
  1531. int nActiveBone = *pActiveBone;
  1532. int pseudoWeight = pBonePseudoWeight[nActiveBone];
  1533. *pEnd = nActiveBone;
  1534. pEnd += _rotl(-pseudoWeight, 1) & 1; // pseudoWeight must be strictly positive to advance and let this bone stay
  1535. }//}
  1536. pActiveBonesEnd = pEnd;
  1537. }
  1538. enum
  1539. {
  1540. nBoneFixedAlignmentShift = GetLog2_t<BONE_FIXED_ALIGNMENT>::kLog2
  1541. };
  1542. // NOTE: When merging back to main, enable this code because Fixed-Alignment is not used in L4D, but may be used in main
  1543. fltx4 scale1 = ReplicateX4( s1 );
  1544. fltx4 scale2 = SubSIMD( Four_Ones, scale1 );
  1545. //fltx4 maskW = LoadAlignedSIMD( (const float *)(g_SIMD_ComponentMask[3]) );
  1546. // pass through all active bones to blend them; those that need it are already aligned
  1547. {
  1548. // 120-155 ticks 4 horizontal at a time; 130 ticks with 1 dot quaternion alignment
  1549. //
  1550. // BONE_PROFILE_LOOP(BlendBoneLoop2g,pActiveBonesEnd-pActiveBones);
  1551. const int *RESTRICT p = pActiveBones, *RESTRICT pNext;
  1552. #if 0//ndef _X360
  1553. // swizzled (vertical) 4 at a time processing
  1554. for(; (pNext = p+4) < pActiveBonesEnd; p = pNext)
  1555. {
  1556. int nBoneA = p[0], nBoneB = p[1], nBoneC = p[2], nBoneD = p[3];
  1557. BoneQuaternion *RESTRICT pq1A = &q1[nBoneA];
  1558. BoneQuaternion *RESTRICT pq1B = &q1[nBoneB];
  1559. BoneQuaternion *RESTRICT pq1C = &q1[nBoneC];
  1560. BoneQuaternion *RESTRICT pq1D = &q1[nBoneD];
  1561. const BoneQuaternion *RESTRICT pq2A = &q2[nBoneA];
  1562. const BoneQuaternion *RESTRICT pq2B = &q2[nBoneB];
  1563. const BoneQuaternion *RESTRICT pq2C = &q2[nBoneC];
  1564. const BoneQuaternion *RESTRICT pq2D = &q2[nBoneD];
  1565. float *pp1A = pos1[nBoneA].Base();
  1566. float *pp1B = pos1[nBoneB].Base();
  1567. float *pp1C = pos1[nBoneC].Base();
  1568. float *pp1D = pos1[nBoneD].Base();
  1569. const float *pp2A = pos2[nBoneA].Base();
  1570. const float *pp2B = pos2[nBoneB].Base();
  1571. const float *pp2C = pos2[nBoneC].Base();
  1572. const float *pp2D = pos2[nBoneD].Base();
  1573. FourQuaternions four4q1, four4q2;
  1574. four4q1.LoadAndSwizzleAligned(pq1A,pq1B,pq1C,pq1D);
  1575. four4q2.LoadAndSwizzleAligned(pq2A,pq2B,pq2C,pq2D);
  1576. FourVectors four4Pos1, four4Pos2;
  1577. four4Pos1.LoadAndSwizzleUnaligned(pp1A,pp1B,pp1C,pp1D);
  1578. four4Pos2.LoadAndSwizzleUnaligned(pp2A,pp2B,pp2C,pp2D);
  1579. four4q1 = QuaternionAlign(four4q2, four4q1);
  1580. FourQuaternions four4Blended = QuaternionNormalize(Madd( four4q1, scale1, Mul( four4q2 , scale2 )));
  1581. // now blend the linear parts
  1582. FourVectors f4PosBlended = Madd(four4Pos1, scale1, Mul(four4Pos2, scale2));
  1583. f4PosBlended.TransposeOntoUnaligned3(*(fltx4*)pp1A, *(fltx4*)pp1B, *(fltx4*)pp1C, *(fltx4*)pp1D);
  1584. four4Blended.SwizzleAndStoreAligned(pq1A,pq1B,pq1C,pq1D);
  1585. }
  1586. #else
  1587. // horizontal 4 at a time processing
  1588. for(; (pNext = p+4) < pActiveBonesEnd; p = pNext)
  1589. {
  1590. int nBoneA = p[0], nBoneB = p[1], nBoneC = p[2], nBoneD = p[3];
  1591. float *RESTRICT pq1A = q1[nBoneA].Base(), *pp1A = pos1[nBoneA].Base();
  1592. float *RESTRICT pq1B = q1[nBoneB].Base(), *pp1B = pos1[nBoneB].Base();
  1593. float *RESTRICT pq1C = q1[nBoneC].Base(), *pp1C = pos1[nBoneC].Base();
  1594. float *RESTRICT pq1D = q1[nBoneD].Base(), *pp1D = pos1[nBoneD].Base();
  1595. const float *RESTRICT pq2A = q2[nBoneA].Base(), *pp2A = pos2[nBoneA].Base();
  1596. const float *RESTRICT pq2B = q2[nBoneB].Base(), *pp2B = pos2[nBoneB].Base();
  1597. const float *RESTRICT pq2C = q2[nBoneC].Base(), *pp2C = pos2[nBoneC].Base();
  1598. const float *RESTRICT pq2D = q2[nBoneD].Base(), *pp2D = pos2[nBoneD].Base();
  1599. fltx4 f4q1A = LoadAlignedSIMD(pq1A), f4q2A = LoadAlignedSIMD(pq2A);
  1600. fltx4 f4q1B = LoadAlignedSIMD(pq1B), f4q2B = LoadAlignedSIMD(pq2B);
  1601. fltx4 f4q1C = LoadAlignedSIMD(pq1C), f4q2C = LoadAlignedSIMD(pq2C);
  1602. fltx4 f4q1D = LoadAlignedSIMD(pq1D), f4q2D = LoadAlignedSIMD(pq2D);
  1603. //ALIGN fltx4 f4Pos1A = LoadUnaligned3SIMD(pp1A), f4Pos2A = LoadUnaligned3SIMD(pp2A);
  1604. fltx4 f4Pos1A = LoadAlignedSIMD(pp1A), f4Pos2A = LoadAlignedSIMD(pp2A);
  1605. //ALIGN fltx4 f4Pos1B = LoadUnaligned3SIMD(pp1B), f4Pos2B = LoadUnaligned3SIMD(pp2B);
  1606. fltx4 f4Pos1B = LoadAlignedSIMD(pp1B), f4Pos2B = LoadAlignedSIMD(pp2B);
  1607. //ALIGN fltx4 f4Pos1C = LoadUnaligned3SIMD(pp1C), f4Pos2C = LoadUnaligned3SIMD(pp2C);
  1608. fltx4 f4Pos1C = LoadAlignedSIMD(pp1C), f4Pos2C = LoadAlignedSIMD(pp2C);
  1609. //ALIGN fltx4 f4Pos1D = LoadUnaligned3SIMD(pp1D), f4Pos2D = LoadUnaligned3SIMD(pp2D);
  1610. fltx4 f4Pos1D = LoadAlignedSIMD(pp1D), f4Pos2D = LoadAlignedSIMD(pp2D);
  1611. f4q1A = BoneQuaternionAlignSIMD(f4q2A, f4q1A);
  1612. f4q1B = BoneQuaternionAlignSIMD(f4q2B, f4q1B);
  1613. f4q1C = BoneQuaternionAlignSIMD(f4q2C, f4q1C);
  1614. f4q1D = BoneQuaternionAlignSIMD(f4q2D, f4q1D);
  1615. fltx4 f4BlendedA = MulSIMD( scale2, f4q2A );
  1616. fltx4 f4BlendedB = MulSIMD( scale2, f4q2B );
  1617. fltx4 f4BlendedC = MulSIMD( scale2, f4q2C );
  1618. fltx4 f4BlendedD = MulSIMD( scale2, f4q2D );
  1619. f4BlendedA = MaddSIMD( scale1, f4q1A, f4BlendedA );
  1620. f4BlendedB = MaddSIMD( scale1, f4q1B, f4BlendedB );
  1621. f4BlendedC = MaddSIMD( scale1, f4q1C, f4BlendedC );
  1622. f4BlendedD = MaddSIMD( scale1, f4q1D, f4BlendedD );
  1623. f4BlendedA = BoneQuaternionNormalizeSIMD(f4BlendedA);
  1624. f4BlendedB = BoneQuaternionNormalizeSIMD(f4BlendedB);
  1625. f4BlendedC = BoneQuaternionNormalizeSIMD(f4BlendedC);
  1626. f4BlendedD = BoneQuaternionNormalizeSIMD(f4BlendedD);
  1627. // now blend the linear parts
  1628. fltx4 f4PosBlendedA = MaddSIMD(scale1, f4Pos1A, MulSIMD(scale2,f4Pos2A));
  1629. fltx4 f4PosBlendedB = MaddSIMD(scale1, f4Pos1B, MulSIMD(scale2,f4Pos2B));
  1630. fltx4 f4PosBlendedC = MaddSIMD(scale1, f4Pos1C, MulSIMD(scale2,f4Pos2C));
  1631. fltx4 f4PosBlendedD = MaddSIMD(scale1, f4Pos1D, MulSIMD(scale2,f4Pos2D));
  1632. //f4PosBlended = MaskedAssign(maskW, f4Pos1, f4PosBlended);
  1633. StoreAlignedSIMD(pq1A,f4BlendedA);
  1634. //ALIGN StoreUnaligned3SIMD(pp1A, f4PosBlendedA);
  1635. StoreAlignedSIMD(pp1A, f4PosBlendedA);
  1636. StoreAlignedSIMD(pq1B,f4BlendedB);
  1637. //ALIGN StoreUnaligned3SIMD(pp1B, f4PosBlendedB);
  1638. StoreAlignedSIMD(pp1B, f4PosBlendedB);
  1639. StoreAlignedSIMD(pq1C,f4BlendedC);
  1640. //ALIGN StoreUnaligned3SIMD(pp1C, f4PosBlendedC);
  1641. StoreAlignedSIMD(pp1C, f4PosBlendedC);
  1642. StoreAlignedSIMD(pq1D,f4BlendedD);
  1643. //ALIGN StoreUnaligned3SIMD(pp1D, f4PosBlendedD);
  1644. StoreAlignedSIMD(pp1D, f4PosBlendedD);
  1645. }
  1646. #endif
  1647. for(; p < pActiveBonesEnd; ++p)
  1648. {
  1649. int nBone = *p;
  1650. float *RESTRICT pq1 = q1[nBone].Base(), *RESTRICT pp1 = pos1[nBone].Base();
  1651. const float *RESTRICT pq2 = q2[nBone].Base(), *RESTRICT pp2 = pos2[nBone].Base();
  1652. fltx4 f4q1 = LoadAlignedSIMD(pq1), f4q2 = LoadAlignedSIMD(pq2);
  1653. //ALIGN fltx4 f4Pos1 = LoadUnaligned3SIMD(pp1), f4Pos2 = LoadUnaligned3SIMD(pp2);
  1654. fltx4 f4Pos1 = LoadAlignedSIMD(pp1), f4Pos2 = LoadAlignedSIMD(pp2);
  1655. f4q1 = BoneQuaternionAlignSIMD(f4q2, f4q1);
  1656. fltx4 f4Blended = MulSIMD( scale2, f4q2 );
  1657. f4Blended = MaddSIMD( scale1, f4q1, f4Blended );
  1658. f4Blended = BoneQuaternionNormalizeSIMD(f4Blended);
  1659. // now blend the linear parts
  1660. fltx4 f4PosBlended = MaddSIMD(scale1, f4Pos1, MulSIMD(scale2,f4Pos2));
  1661. //f4PosBlended = MaskedAssign(maskW, f4Pos1, f4PosBlended);
  1662. StoreAlignedSIMD(pq1,f4Blended);
  1663. //ALIGN StoreUnaligned3SIMD(pp1, f4PosBlended);
  1664. StoreAlignedSIMD(pp1, f4PosBlended);
  1665. }
  1666. }
  1667. PopLSStack( numBones * sizeof(int) * 2 );
  1668. }
  1669. else
  1670. #endif // POSIX
  1671. {
  1672. // 360-400 ticks per loop pass
  1673. // there are usually 40-100 bones on average in a frame
  1674. for( i = 0; i < pBonejob->numBones; i++ )
  1675. {
  1676. // skip unused bones
  1677. if( !( pBonejob->boneFlags[i] & boneMask) )
  1678. {
  1679. continue;
  1680. }
  1681. if( pPoseEntry->pEA_seqgroup_boneMap )
  1682. {
  1683. j = boneMap[i];
  1684. }
  1685. else
  1686. {
  1687. j = i;
  1688. }
  1689. if( j >= 0 && boneWeight[j] > 0.0f )
  1690. {
  1691. if( pBonejob->boneFlags[i] & BONE_FIXED_ALIGNMENT)
  1692. {
  1693. QuaternionBlendNoAlign_PS3( q2[i], q1[i], s1, q3 );
  1694. }
  1695. else
  1696. {
  1697. QuaternionBlend_PS3( q2[i], q1[i], s1, q3 );
  1698. }
  1699. q1[i][0] = q3[0];
  1700. q1[i][1] = q3[1];
  1701. q1[i][2] = q3[2];
  1702. q1[i][3] = q3[3];
  1703. pos1[i][0] = pos1[i][0] * s1 + pos2[i][0] * s2;
  1704. pos1[i][1] = pos1[i][1] * s1 + pos2[i][1] * s2;
  1705. pos1[i][2] = pos1[i][2] * s1 + pos2[i][2] * s2;
  1706. }
  1707. }
  1708. }
  1709. }
  1710. //-----------------------------------------------------------------------------
  1711. // Purpose: Scale a set of bones. Must be of type delta
  1712. //-----------------------------------------------------------------------------
  1713. void ScaleBones_PS3(
  1714. const bonejob_SPU *pBonejob,
  1715. const accumposeentry_SPU *pPoseEntry,
  1716. BoneQuaternion *q1,
  1717. BoneVector *pos1,
  1718. const int *boneMap,
  1719. const float *boneWeight,
  1720. float s,
  1721. int boneMask )
  1722. {
  1723. SNPROF_ANIM("ScaleBones_PS3");
  1724. int i, j;
  1725. Quaternion q3;
  1726. float s2 = s;
  1727. float s1 = 1.0f - s2;
  1728. for (i = 0; i < pBonejob->numBones; i++)
  1729. {
  1730. // skip unused bones
  1731. if( !( pBonejob->boneFlags[i] & boneMask) )
  1732. {
  1733. continue;
  1734. }
  1735. if( pPoseEntry->pEA_seqgroup_boneMap )
  1736. {
  1737. j = boneMap[i];
  1738. }
  1739. else
  1740. {
  1741. j = i;
  1742. }
  1743. if( j >= 0 && boneWeight[j] > 0.0f )
  1744. {
  1745. QuaternionIdentityBlend_PS3( q1[i], s1, q1[i] );
  1746. VectorScale_PS3( pos1[i], s2, pos1[i] );
  1747. }
  1748. }
  1749. }
  1750. // temp - debugging DMA's
  1751. // NOINLINE void *SPUmemcpy_UnalignedGet( void *ls, uint32 ea, uint32_t size )
  1752. // {
  1753. // void *aligned_ls;
  1754. //
  1755. // aligned_ls = (void *)((uint32)ls | (ea & 0xf)); // + 0xf in case ls not 16B aligned
  1756. //
  1757. // #if defined(__SPU__)
  1758. // //Msg("GET ls:0x%x, ea:0x%x, size:%d\n", (uint32_t)aligned_ls, ea, size);
  1759. // // SPU
  1760. // cellDmaUnalignedGet( aligned_ls, ea, size, DMATAG_ANIM, 0, 0 );
  1761. // cellDmaWaitTagStatusAny( 1 << DMATAG_ANIM );
  1762. // #else
  1763. // // PPU
  1764. // memcpy( aligned_ls, (void *)ea, size );
  1765. // #endif
  1766. //
  1767. //
  1768. // return aligned_ls;
  1769. // }
  1770. //
  1771. //
  1772. // NOINLINE void SPUmemcpy_UnalignedPut( void *ls, uint32 ea, uint32_t size )
  1773. // {
  1774. // #if defined(__SPU__)
  1775. // //Msg("PUT ls:0x%x, ea:0x%x, size:%d\n", (uint32_t)ls, ea, size);
  1776. // // SPU
  1777. // cellDmaUnalignedPut( ls, ea, size, DMATAG_ANIM, 0, 0 );
  1778. // cellDmaWaitTagStatusAny( 1 << DMATAG_ANIM );
  1779. // #else
  1780. // Assert(((uint32)ls&0xf) == ea&0xf);
  1781. //
  1782. // // PPU
  1783. // memcpy( (void *)ea, ls, size );
  1784. // #endif
  1785. // }