Counter Strike : Global Offensive Source Code

  1. //===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
  2. //
  3. // Purpose:
  4. //
  5. //===========================================================================//
  6. #if defined(__SPU__)
  7. #include "platform.h"
  8. #include "basetypes.h"
  9. #include "mathlib/mathlib.h"
  10. #include "mathlib/math_pfns.h"
  11. // #include "mathlib/fltx4.h"
  12. #include "ps3/spu_job_shared.h"
  13. #endif
  14. #include "mathlib/ssemath.h"
  15. #include "mathlib/ssequaternion.h"
  16. #include "mathlib/compressed_vector.h"
  17. // NOTE: This has to be the last file included!
  18. #include "tier0/memdbgon.h"
  19. #if !defined(__SPU__)
  20. const fltx4 Four_PointFives={0.5,0.5,0.5,0.5};
  21. #ifndef _X360
  22. const fltx4 Four_Zeros={0.0,0.0,0.0,0.0};
  23. const fltx4 Four_Ones={1.0,1.0,1.0,1.0};
  24. #endif
  25. const fltx4 Four_Twos={2.0,2.0,2.0,2.0};
  26. const fltx4 Four_Threes={3.0,3.0,3.0,3.0};
  27. const fltx4 Four_Fours={4.0,4.0,4.0,4.0};
  28. const fltx4 Four_Origin={0,0,0,1};
  29. const fltx4 Four_NegativeOnes={-1,-1,-1,-1};
  30. const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) };
  31. const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) };
  32. const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) };
  33. const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) };
  34. const fltx4 Four_Thirds={ 0.33333333, 0.33333333, 0.33333333, 0.33333333 };
  35. const fltx4 Four_TwoThirds={ 0.66666666, 0.66666666, 0.66666666, 0.66666666 };
  36. const fltx4 Four_Point225s={ .225, .225, .225, .225 };
  37. const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
  38. const fltx4 Four_DegToRad= { ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f)), ((float)(M_PI_F / 180.f))};
  39. const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  40. const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
  41. const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };
  42. const fltx4 Four_LinearToGammaCoefficients_A = { -3.7295, -3.7295, -3.7295, -3.7295 };
  43. const fltx4 Four_LinearToGammaCoefficients_B = { 8.9635, 8.9635, 8.9635, 8.9635 };
  44. const fltx4 Four_LinearToGammaCoefficients_C = { -7.7397, -7.7397, -7.7397, -7.7397 };
  45. const fltx4 Four_LinearToGammaCoefficients_D = {3.443, 3.443, 3.443, 3.443 };
  46. const fltx4 Four_LinearToGammaCoefficients_E = { 0.048, 0.048, 0.048, 0.048 };
  47. const fltx4 Four_GammaToLinearCoefficients_A = { .1731, .1731, .1731, .1731 };
  48. const fltx4 Four_GammaToLinearCoefficients_B = { .8717, .8717, .8717, .8717 };
  49. const fltx4 Four_GammaToLinearCoefficients_C = { -.0452, -.0452, -.0452, -.0452 };
  50. const fltx4 Four_GammaToLinearCoefficients_D = { .0012, .0012, .0012, .0012 };
  51. const fltx4 g_QuatMultRowSign[4] =
  52. {
  53. { 1.0f, 1.0f, -1.0f, 1.0f },
  54. { -1.0f, 1.0f, 1.0f, 1.0f },
  55. { 1.0f, -1.0f, 1.0f, 1.0f },
  56. { -1.0f, -1.0f, -1.0f, 1.0f }
  57. };
  58. #endif
  59. const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
  60. const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  61. const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe };
  62. const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
  63. const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0
  64. const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4
  65. const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =
  66. {
  67. { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF }
  68. };
  69. const fltx4 g_SIMD_Identity[4] =
  70. {
  71. { 1.0, 0, 0, 0 }, { 0, 1.0, 0, 0 }, { 0, 0, 1.0, 0 }, { 0, 0, 0, 1.0 }
  72. };
  73. const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =
  74. {
  75. { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
  76. { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },
  77. { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
  78. { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 },
  79. };
  80. const int32 ALIGN16 g_SIMD_EveryOtherMask[4] = { 0, ~0, 0, ~0 };
  81. #ifdef PLATFORM_PPC
  82. /// Passed as a parameter to vslh, shuffles the z component of a quat48 stored in the zw words left by one bit.
  83. const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[] = {
  84. 0x00, 0x00, // x word
  85. 0x00, 0x00, // y word
  86. 0x00, 0x01, // z word
  87. 0x00, 0x00 }; // w word
  88. // this permutes uint16's x,y,z packed in the most significant four halfwords of a fltx4
  89. // so that each gets its own word in the output. expected use is __vperm( XX, Four_Threes, permute )
  90. // -- that way each int is represented as 3.0 + n * 2^-22 , which we can pull into the
  91. // appropriate range with a single madd!
  92. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16] =
  93. {
  94. 16, 17, 0, 1, // word one: 00XX
  95. 16, 17, 2, 3, // word two: 00YY
  96. 16, 17, 4, 5, // word three: 00ZZ
  97. 16, 17, 6, 7 // word four: 00WW
  98. };
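// Illustrative scalar sketch of the bit-splice these permutes rely on (helper name is
// hypothetical, and memcpy needs <cstring>): splicing the high halfword of 3.0f (bit
// pattern 0x40400000) above a packed 16-bit value n yields 0x4040nnnn, i.e. exactly
// 3.0f + n * 2^-22 for n = 0..65535, which the single madd described above can rescale.
/*
static float SpliceQuat48Component( uint16 n )
{
	uint32 bits = 0x40400000u | n;		// high halfword of 3.0f over the packed 16-bit value
	float f;
	memcpy( &f, &bits, sizeof( f ) );	// reinterpret the spliced bit pattern as a float
	return f;							// == 3.0f + n * 2^-22
}
*/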
  99. // the other permutes are a little trickier. note: I'm defining them out of order.
  100. // 2 and 5 blend together prior results, rather than a source with 3.0f
  101. // out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
  102. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16] =
  103. {
  104. 16, 17, 6, 7, // word one: 00XX
  105. 16, 17, 8, 9, // word two: 00YY
  106. 16, 17, 10, 11, // word three: 00ZZ
  107. 16, 17, 12, 13 // word four: 00WW
  108. };
  109. // out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2
  110. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16] =
  111. {
  112. 16, 17, 2, 3,
  113. 16, 17, 4, 5,
  114. 16, 17, 6, 7,
  115. 16, 17, 0, 1
  116. };
  117. // out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5
  118. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16] =
  119. {
  120. 16, 17, 8, 9,
  121. 16, 17, 10, 11,
  122. 16, 17, 12, 13,
  123. 16, 17, 14, 15
  124. };
  125. // out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
  126. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16] =
  127. {
  128. 16, 17, 4, 5, // word one
  129. 16, 17, 6, 7, // word two
  130. 16, 17, 8, 9, // word three
  131. 16, 17, 10, 11 // word four (garbage)
  132. };
  133. // out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
  134. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16] =
  135. {
  136. 16, 17, 10, 11, // word one
  137. 16, 17, 12, 13, // word two
  138. 16, 17, 14, 15, // word three
  139. 16, 17, 16, 17 // word four (garbage)
  140. };
  141. // these last two are tricky because we mix old output with source input. we get the 3.0f
  142. // from the old output.
  143. // out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____
  144. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16] =
  145. {
  146. 16, 17, 12, 13, // 3.x2
  147. 16, 17, 14, 15, // 3.y2
  148. 16, 17, 30, 31, // 3.z2 (from out3)
  149. 16, 17, 16, 17
  150. };
  151. // out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(g_SIMD_Quat48_Unpack_Permute5) ) // __x5__y5__z5____
  152. const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16] =
  153. {
  154. 16, 17, 30, 31, // 3.x5 (from out4)
  155. 16, 17, 0, 1, // 3.y5
  156. 16, 17, 2, 3, // 3.z5
  157. 16, 17, 16, 17 // garbage
  158. };
  159. // magic constants that we use to convert the unpacked q48 components from 3 + n * 2^-22 (where n = 0 .. 65535)
  160. // to -1.0 .. 1.0
  161. #define UnpackMul16s ( (1 << 22) / 32767.5 )
  162. #define UnpackAdd16s ( ( -UnpackMul16s * 3.0 ) - 1 )
  163. // we put the constants all into one word to save a little memory bandwidth
  164. // but otherwise it would look like this:
  165. // static const fltx4 vUpkMul = { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
  166. // static const fltx4 vUpkAdd = { UnpackAdd16s , UnpackAdd16s , UnpackAdd16s , UnpackAdd16s };
  167. const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants = { UnpackMul16s , UnpackAdd16s, 0, 0 };
  168. #undef UnpackMul16s
  169. #undef UnpackAdd16s
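// Illustrative scalar sketch of the math these constants implement (helper name is
// hypothetical): a component spliced to 3.0f + n * 2^-22 is mapped onto -1.0 .. 1.0.
/*
static float UnpackQuat48Component( uint16 n )
{
	const float mul = (float)( ( 1 << 22 ) / 32767.5 );	// UnpackMul16s
	const float add = ( -mul * 3.0f ) - 1.0f;				// UnpackAdd16s
	float spliced = 3.0f + n * ( 1.0f / ( 1 << 22 ) );		// value produced by the permutes above
	return spliced * mul + add;								// n = 0 -> -1.0f, n = 65535 -> +1.0f
}
*/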
  170. #endif
  171. // FUNCTIONS
  172. // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE
  173. // Generally speaking, you want to make sure SIMD math functions
  174. // are inlined, because that gives the compiler much more latitude
  175. // in instruction scheduling. It's not that the overhead of calling
  176. // the function is particularly great; rather, many of the SIMD
  177. // opcodes have long latencies, and if you have a sequence of
  178. // several dependent ones inside a function call, the latencies
  179. // stack up to create a big penalty. If the function is inlined,
  180. // the compiler can interleave its operations with ones from the
  181. // caller to better hide those latencies. Finally, on the 360,
  182. // putting parameters or return values on the stack, and then
  183. // reading them back within the next forty cycles, is a very
  184. // severe penalty. So, as much as possible, you want to leave your
  185. // data on the registers.
  186. // That said, there are certain occasions where it is appropriate
  187. // to call into functions -- particularly for very large blocks
  188. // of code that will spill most of the registers anyway. Unless your
  189. // function is more than one screen long, yours is probably not one
  190. // of those occasions.
  191. #if !defined(__SPU__)
  192. /// You can use this to rotate a long array of FourVectors all by the same
  193. /// matrix. The first parameter is the head of the array. The second is the
  194. /// number of vectors to rotate. The third is the matrix.
  195. void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  196. {
  197. Assert(numVectors > 0);
  198. if ( numVectors == 0 )
  199. return;
  200. // Splat out each of the entries in the matrix to a fltx4. Do this
  201. // in the order that we will need them, to hide latency. I'm
  202. // avoiding making an array of them, so that they'll remain in
  203. // registers.
  204. fltx4 matSplat00, matSplat01, matSplat02,
  205. matSplat10, matSplat11, matSplat12,
  206. matSplat20, matSplat21, matSplat22;
  207. {
  208. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  209. // often unaligned. The w components will be the transpose row of
  210. // the matrix, but we don't really care about that.
  211. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  212. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  213. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  214. matSplat00 = SplatXSIMD(matCol0);
  215. matSplat01 = SplatYSIMD(matCol0);
  216. matSplat02 = SplatZSIMD(matCol0);
  217. matSplat10 = SplatXSIMD(matCol1);
  218. matSplat11 = SplatYSIMD(matCol1);
  219. matSplat12 = SplatZSIMD(matCol1);
  220. matSplat20 = SplatXSIMD(matCol2);
  221. matSplat21 = SplatYSIMD(matCol2);
  222. matSplat22 = SplatZSIMD(matCol2);
  223. }
  224. #if defined(_X360) || defined(_PS3)
  225. // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
  226. // and simplify prefetching. Named variables are deliberately used instead of arrays to
  227. // ensure that the variables live on the registers instead of the stack (stack load/store
  228. // is a serious penalty on 360). Nb: for prefetching to be most efficient here, the
  229. // loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is
  230. // 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line.
  231. // That way you can fetch the next 3 cache lines while you work on these three.
  232. // If you do go this route, be sure to disassemble and make sure it doesn't spill
  233. // registers to stack as you do this; the cost of that will be excessive. Unroll the loop
  234. // a little and just live with the fact that you'll be doing a couple of redundant dbcts
  235. // (they don't cost you anything). Be aware that all three cores share L2 and it can only
  236. // have eight cache lines fetching at a time.
  237. fltx4 outX0, outY0, outZ0; // bank one of outputs
  238. fltx4 outX1, outY1, outZ1; // bank two of outputs
  239. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  240. // rather than madds. (Empirically determined by timing.)
  241. const FourVectors * stop = pVectors + numVectors;
  242. FourVectors * RESTRICT pVectNext;
  243. // prime the pump.
  244. if (numVectors & 0x01)
  245. {
  246. // odd number of vectors to process
  247. // prime the 1 group of registers
  248. pVectNext = pVectors++;
  249. outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) );
  250. outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) );
  251. outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) );
  252. }
  253. else
  254. {
  255. // even number of total vectors to process;
  256. // prime the zero group and jump into the middle of the loop
  257. outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) );
  258. outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) );
  259. outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) );
  260. goto EVEN_CASE;
  261. }
  262. // perform an even number of iterations through this loop.
  263. while (pVectors < stop)
  264. {
  265. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  266. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  267. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  268. pVectNext->x = outX1;
  269. pVectNext->y = outY1;
  270. pVectNext->z = outZ1;
  271. EVEN_CASE:
  272. pVectNext = pVectors+1;
  273. outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) );
  274. outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) );
  275. outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) );
  276. pVectors->x = outX0;
  277. pVectors->y = outY0;
  278. pVectors->z = outZ0;
  279. pVectors += 2;
  280. }
  281. // flush the last round of output
  282. pVectNext->x = outX1;
  283. pVectNext->y = outY1;
  284. pVectNext->z = outZ1;
  285. #else
  286. // PC does not benefit from the unroll/scheduling above
  287. fltx4 outX0, outY0, outZ0; // bank one of outputs
  288. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  289. // rather than madds. (Empirically determined by timing.)
  290. const FourVectors * stop = pVectors + numVectors;
  291. // perform an even number of iterations through this loop.
  292. while (pVectors < stop)
  293. {
  294. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  295. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  296. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  297. pVectors->x = outX0;
  298. pVectors->y = outY0;
  299. pVectors->z = outZ0;
  300. pVectors++;
  301. }
  302. #endif
  303. }
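// Illustrative usage sketch (caller, buffer and count names are hypothetical):
// rotate a contiguous array of FourVectors in place by a single matrix.
/*
void RotateFlockDirections( FourVectors *pDirs, unsigned int nGroups, const matrix3x4_t &orient )
{
	// each FourVectors packs four xyz vectors, so this touches 4 * nGroups points
	FourVectors::RotateManyBy( pDirs, nGroups, orient );
}
*/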
  304. // Get the closest point from P to the (infinite) line through vLineA and vLineB and
  305. // calculate the shortest distance from P to the line.
  306. // If you pass in a value for t, it will tell you the t for (A + (B-A)t) to get the closest point.
  307. // If the closest point lies on the segment between A and B, then 0 <= t <= 1.
  308. void FourVectors::CalcClosestPointOnLineSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vClosest, fltx4 *outT)
  309. {
  310. FourVectors vDir;
  311. fltx4 t = CalcClosestPointToLineTSIMD( P, vLineA, vLineB, vDir );
  312. if ( outT ) *outT = t;
  313. vClosest = vDir;
  314. vClosest *= t;
  315. vClosest += vLineA;
  316. }
  317. fltx4 FourVectors::CalcClosestPointToLineTSIMD( const FourVectors &P, const FourVectors &vLineA, const FourVectors &vLineB, FourVectors &vDir )
  318. {
  319. Assert( s_bMathlibInitialized );
  320. vDir = vLineB;
  321. vDir -= vLineA;
  322. fltx4 div = vDir * vDir;
  323. bi32x4 Mask;
  324. fltx4 Compare = ReplicateX4( 0.00001f );
  325. fltx4 result;
  326. Mask = CmpLtSIMD( div, Compare );
  327. result = DivSIMD( SubSIMD( vDir * P, vDir * vLineA ), div );
  328. result = MaskedAssign( Mask, Four_Zeros, result );
  329. return result;
  330. }
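// Illustrative usage sketch (names are hypothetical, and the CmpGeSIMD/CmpLeSIMD/AndSIMD
// overloads for bi32x4 are assumed): find the closest point on the AB line for four query
// points at once and test whether each projection falls on the segment (0 <= t <= 1).
/*
void ClosestOnSegmentExample( const FourVectors &points, const FourVectors &A, const FourVectors &B )
{
	FourVectors vClosest;
	fltx4 t;
	FourVectors::CalcClosestPointOnLineSIMD( points, A, B, vClosest, &t );
	bi32x4 onSegment = AndSIMD( CmpGeSIMD( t, Four_Zeros ), CmpLeSIMD( t, Four_Ones ) );	// all-ones lanes lie within A..B
}
*/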
  331. void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  332. {
  333. Assert(numVectors > 0);
  334. if ( numVectors == 0 )
  335. return;
  336. // Splat out each of the entries in the matrix to a fltx4. Do this
  337. // in the order that we will need them, to hide latency. I'm
  338. // avoiding making an array of them, so that they'll remain in
  339. // registers.
  340. fltx4 matSplat00, matSplat01, matSplat02,
  341. matSplat10, matSplat11, matSplat12,
  342. matSplat20, matSplat21, matSplat22;
  343. {
  344. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  345. // often unaligned. The w components will be the transpose row of
  346. // the matrix, but we don't really care about that.
  347. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  348. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  349. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  350. matSplat00 = SplatXSIMD(matCol0);
  351. matSplat01 = SplatYSIMD(matCol0);
  352. matSplat02 = SplatZSIMD(matCol0);
  353. matSplat10 = SplatXSIMD(matCol1);
  354. matSplat11 = SplatYSIMD(matCol1);
  355. matSplat12 = SplatZSIMD(matCol1);
  356. matSplat20 = SplatXSIMD(matCol2);
  357. matSplat21 = SplatYSIMD(matCol2);
  358. matSplat22 = SplatZSIMD(matCol2);
  359. }
  360. #if defined(_X360) || defined(_PS3)
  361. // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
  362. // and simplify prefetching. Named variables are deliberately used instead of arrays to
  363. // ensure that the variables live on the registers instead of the stack (stack load/store
  364. // is a serious penalty on 360). Nb: for prefetching to be most efficient here, the
  365. // loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is
  366. // 48 bytes long, 48 * 8 = 384, its least common multiple with the 128-byte cache line.
  367. // That way you can fetch the next 3 cache lines while you work on these three.
  368. // If you do go this route, be sure to disassemble and make sure it doesn't spill
  369. // registers to stack as you do this; the cost of that will be excessive. Unroll the loop
  370. // a little and just live with the fact that you'll be doing a couple of redundant dbcts
  371. // (they don't cost you anything). Be aware that all three cores share L2 and it can only
  372. // have eight cache lines fetching at a time.
  373. fltx4 outX0, outY0, outZ0; // bank one of outputs
  374. fltx4 outX1, outY1, outZ1; // bank two of outputs
  375. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  376. // rather than madds. (Empirically determined by timing.)
  377. const FourVectors * stop = pVectors + numVectors;
  378. FourVectors * RESTRICT pVectNext;
  379. FourVectors * RESTRICT pOutNext;
  380. // prime the pump.
  381. if (numVectors & 0x01)
  382. {
  383. // odd number of vectors to process
  384. // prime the 1 group of registers
  385. pVectNext = pVectors++;
  386. pOutNext = pOut++;
  387. outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) );
  388. outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) );
  389. outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) );
  390. }
  391. else
  392. {
  393. // even number of total vectors to process;
  394. // prime the zero group and jump into the middle of the loop
  395. outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) );
  396. outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) );
  397. outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) );
  398. goto EVEN_CASE;
  399. }
  400. // perform an even number of iterations through this loop.
  401. while (pVectors < stop)
  402. {
  403. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  404. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  405. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  406. pOutNext->x = outX1;
  407. pOutNext->y = outY1;
  408. pOutNext->z = outZ1;
  409. EVEN_CASE:
  410. pVectNext = pVectors+1;
  411. pOutNext = pOut+1;
  412. outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) );
  413. outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) );
  414. outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) );
  415. pOut->x = outX0;
  416. pOut->y = outY0;
  417. pOut->z = outZ0;
  418. pVectors += 2;
  419. pOut += 2;
  420. }
  421. // flush the last round of output
  422. pVectNext->x = outX1;
  423. pVectNext->y = outY1;
  424. pVectNext->z = outZ1;
  425. #else
  426. // PC does not benefit from the unroll/scheduling above
  427. fltx4 outX0, outY0, outZ0; // bank one of outputs
  428. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  429. // rather than madds. (Empirically determined by timing.)
  430. const FourVectors * stop = pVectors + numVectors;
  431. // perform an even number of iterations through this loop.
  432. while (pVectors < stop)
  433. {
  434. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  435. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  436. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  437. pOut->x = outX0;
  438. pOut->y = outY0;
  439. pOut->z = outZ0;
  440. pVectors++;
  441. pOut++;
  442. }
  443. #endif
  444. }
  445. #ifdef _X360
  446. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
  447. void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  448. {
  449. Assert(numVectors > 0);
  450. if ( numVectors == 0 )
  451. return;
  452. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  453. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  454. // Splat out each of the entries in the matrix to a fltx4. Do this
  455. // in the order that we will need them, to hide latency. I'm
  456. // avoiding making an array of them, so that they'll remain in
  457. // registers.
  458. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  459. matSplat10, matSplat11, matSplat12, matSplat13,
  460. matSplat20, matSplat21, matSplat22, matSplat23;
  461. {
  462. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  463. // often unaligned. The w components will be the transpose row of
  464. // the matrix.
  465. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  466. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  467. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  468. matSplat00 = SplatXSIMD(matCol0);
  469. matSplat01 = SplatYSIMD(matCol0);
  470. matSplat02 = SplatZSIMD(matCol0);
  471. matSplat03 = SplatWSIMD(matCol0);
  472. matSplat10 = SplatXSIMD(matCol1);
  473. matSplat11 = SplatYSIMD(matCol1);
  474. matSplat12 = SplatZSIMD(matCol1);
  475. matSplat13 = SplatWSIMD(matCol1);
  476. matSplat20 = SplatXSIMD(matCol2);
  477. matSplat21 = SplatYSIMD(matCol2);
  478. matSplat22 = SplatZSIMD(matCol2);
  479. matSplat23 = SplatWSIMD(matCol2);
  480. }
  481. // this macro defines how to compute a specific row from an input and certain splat columns
  482. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  483. #define WRITE(term, reg, toptr) toptr->term = reg
  484. // define result groups (we're going to have an eight-way unroll)
  485. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  486. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  487. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  488. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  489. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  490. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  491. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  492. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  493. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  494. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  495. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  496. /*
  497. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  498. res0X = MulSIMD( (invec)->y, matSplat01 );
  499. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  500. // stage 2 -- 3 clocks for xyz
  501. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  502. // stage 3 -- 3 clocks for xyz
  503. res0X = AddSIMD(res0X, res0Temp);
  504. */
  505. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  506. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  507. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  508. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  509. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  510. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  511. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  512. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  513. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  514. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  515. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  516. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  517. FourVectors * RESTRICT inData = pVectors;
  518. FourVectors * RESTRICT outData = pOut;
  519. const FourVectors * const RESTRICT STOP = pVectors + numVectors;
  520. // Use techniques of loop scheduling to eliminate data hazards; process
  521. // eight groups simultaneously so that we never have any operations stalling
  522. // waiting for data.
  523. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  524. // that it does all of its loads, then all computation, then writes everything
  525. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  526. // stage 3, and write, then throughput could be higher (probably by about 50%).
  527. while (inData < STOP)
  528. {
  529. // start prefetching the three cache lines
  530. // we'll hit two iterations from now
  531. __dcbt( sizeof(FourVectors) * 16, inData );
  532. __dcbt( sizeof(FourVectors) * 16 + 128, inData );
  533. __dcbt( sizeof(FourVectors) * 16 + 256, inData );
  534. // synchro
  535. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  536. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  537. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  538. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  539. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  540. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  541. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  542. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  543. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  544. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  545. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  546. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  547. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  548. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  549. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  550. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  551. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  552. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  553. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  554. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  555. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  556. WRITE_GROUP( outData + 0, res0 );
  557. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  558. WRITE_GROUP( outData + 1, res1 );
  559. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  560. WRITE_GROUP( outData + 2, res2 );
  561. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  562. WRITE_GROUP( outData + 3, res3 );
  563. WRITE_GROUP( outData + 4, res4 );
  564. WRITE_GROUP( outData + 5, res5 );
  565. WRITE_GROUP( outData + 6, res6 );
  566. WRITE_GROUP( outData + 7, res7 );
  567. inData += 8;
  568. outData += 8;
  569. }
  570. #undef COMPUTE
  571. #undef WRITE
  572. #undef COMPUTE_STAGE1_ROW
  573. #undef COMPUTE_STAGE2_ROW
  574. #undef COMPUTE_STAGE3_ROW
  575. #undef COMPUTE_STAGE1_GROUP
  576. #undef COMPUTE_STAGE2_GROUP
  577. #undef COMPUTE_STAGE3_GROUP
  578. #undef COMPUTE_GROUP
  579. #undef WRITE_GROUP
  580. }
  581. #ifdef _X360
  582. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version
  583. // to call when starting on a 128-byte-aligned address.
  584. void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  585. {
  586. /* If this has changed, you will need to change all the prefetches, *
  587. * and groups of eight are no longer the ideal unit for iterating *
  588. * on many vectors. */
  589. COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ;
  590. Assert(numVectors > 0);
  591. if ( numVectors == 0 )
  592. return;
  593. AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!");
  594. // Assert alignment
  595. AssertMsg( ( ( reinterpret_cast<uint32>( pVectors ) & 127 ) == 0) &&
  596. ( ( reinterpret_cast<uint32>(pOut) & 127 ) == 0),
  597. "FourVectors_Transform..aligned called with non-128-byte-aligned buffers." );
  598. // Assert non overlap
  599. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  600. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  601. // Here's the plan. 8 four-vecs = 3 cache lines exactly. It takes about 400 cycles to process a group
  602. // of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch
  603. // iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush
  604. // the cache lines since we are sure to write into them. Because we're reading and fetching two ahead,
  605. // we want to stop two away from the last iteration.
  606. // No matter what, we will need to prefetch the first two groups of eight of input (that's the
  607. // first six cache lines)
  608. __dcbt( 0, pVectors );
  609. __dcbt( 128, pVectors );
  610. __dcbt( 256, pVectors );
  611. __dcbt( 384, pVectors );
  612. __dcbt( 512, pVectors );
  613. __dcbt( 640, pVectors );
  614. // Splat out each of the entries in the matrix to a fltx4. Do this
  615. // in the order that we will need them, to hide latency. I'm
  616. // avoiding making an array of them, so that they'll remain in
  617. // registers.
  618. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  619. matSplat10, matSplat11, matSplat12, matSplat13,
  620. matSplat20, matSplat21, matSplat22, matSplat23;
  621. {
  622. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  623. // often unaligned. The w components will be the transpose row of
  624. // the matrix.
  625. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  626. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  627. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  628. matSplat00 = SplatXSIMD(matCol0);
  629. matSplat01 = SplatYSIMD(matCol0);
  630. matSplat02 = SplatZSIMD(matCol0);
  631. matSplat03 = SplatWSIMD(matCol0);
  632. matSplat10 = SplatXSIMD(matCol1);
  633. matSplat11 = SplatYSIMD(matCol1);
  634. matSplat12 = SplatZSIMD(matCol1);
  635. matSplat13 = SplatWSIMD(matCol1);
  636. matSplat20 = SplatXSIMD(matCol2);
  637. matSplat21 = SplatYSIMD(matCol2);
  638. matSplat22 = SplatZSIMD(matCol2);
  639. matSplat23 = SplatWSIMD(matCol2);
  640. }
  641. // this macro defines how to compute a specific row from an input and certain splat columns
  642. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  643. #define WRITE(term, reg, toptr) toptr->term = reg
  644. // define result groups (we're going to have an eight-way unroll)
  645. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  646. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  647. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  648. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  649. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  650. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  651. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  652. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  653. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  654. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  655. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  656. /*
  657. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  658. res0X = MulSIMD( (invec)->y, matSplat01 );
  659. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  660. // stage 2 -- 3 clocks for xyz
  661. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  662. // stage 3 -- 3 clocks for xyz
  663. res0X = AddSIMD(res0X, res0Temp);
  664. */
  665. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  666. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  667. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  668. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  669. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  670. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  671. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  672. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  673. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  674. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  675. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  676. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  677. // Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero.
  678. FourVectors * RESTRICT inData = pVectors;
  679. FourVectors * RESTRICT outData = pOut;
  680. const FourVectors * RESTRICT STOP;
  681. if (numVectors > 16)
  682. {
  683. STOP = pVectors + numVectors - 16;
  684. // flush the first two blocks we'll write into
  685. __dcbz128( 0, outData );
  686. __dcbz128( 128, outData );
  687. __dcbz128( 256, outData );
  688. while (inData < STOP)
  689. {
  690. // start prefetching the three cache lines
  691. // we'll hit two iterations from now
  692. __dcbt( sizeof(FourVectors) * 16, inData );
  693. __dcbt( sizeof(FourVectors) * 16 + 128, inData );
  694. __dcbt( sizeof(FourVectors) * 16 + 256, inData );
  695. // synchro
  696. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  697. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  698. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  699. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  700. // pre-zero the three cache lines we'll overwrite
  701. // in the next iteration
  702. __dcbz128( 384, outData );
  703. __dcbz128( 512, outData );
  704. __dcbz128( 640, outData );
  705. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  706. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  707. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  708. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  709. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  710. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  711. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  712. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  713. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  714. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  715. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  716. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  717. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  718. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  719. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  720. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  721. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  722. WRITE_GROUP( outData + 0, res0 );
  723. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  724. WRITE_GROUP( outData + 1, res1 );
  725. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  726. WRITE_GROUP( outData + 2, res2 );
  727. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  728. WRITE_GROUP( outData + 3, res3 );
  729. WRITE_GROUP( outData + 4, res4 );
  730. WRITE_GROUP( outData + 5, res5 );
  731. WRITE_GROUP( outData + 6, res6 );
  732. WRITE_GROUP( outData + 7, res7 );
  733. inData += 8;
  734. outData += 8;
  735. }
  736. }
  737. else if (numVectors == 16)
  738. {
  739. // zero out the exactly six cache lines we will write into
  740. __dcbz128( 0, outData );
  741. __dcbz128( 128, outData );
  742. __dcbz128( 256, outData );
  743. __dcbz128( 384, outData );
  744. __dcbz128( 512, outData );
  745. __dcbz128( 640, outData );
  746. }
  747. else if (numVectors == 8)
  748. {
  749. // zero out the exactly three cache lines we will write into
  750. __dcbz128( 0, outData );
  751. __dcbz128( 128, outData );
  752. __dcbz128( 256, outData );
  753. }
  754. else
  755. {
  756. AssertMsg(false, "Can't happen!");
  757. }
  758. // deal with the ultimate two groups (or, if we were fed
  759. // less than 16 groups, the whole shebang)
  760. STOP = pVectors + numVectors;
  761. // Use techniques of loop scheduling to eliminate data hazards; process
  762. // eight groups simultaneously so that we never have any operations stalling
  763. // waiting for data.
  764. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  765. // that it does all of its loads, then all computation, then writes everything
  766. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  767. // stage 3, and write, then throughput could be higher (probably by about 50%).
  768. while (inData < STOP)
  769. {
  770. // synchro
  771. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  772. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  773. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  774. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  775. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  776. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  777. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  778. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  779. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  780. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  781. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  782. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  783. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  784. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  785. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  786. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  787. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  788. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  789. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  790. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  791. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  792. WRITE_GROUP( outData + 0, res0 );
  793. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  794. WRITE_GROUP( outData + 1, res1 );
  795. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  796. WRITE_GROUP( outData + 2, res2 );
  797. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  798. WRITE_GROUP( outData + 3, res3 );
  799. WRITE_GROUP( outData + 4, res4 );
  800. WRITE_GROUP( outData + 5, res5 );
  801. WRITE_GROUP( outData + 6, res6 );
  802. WRITE_GROUP( outData + 7, res7 );
  803. inData += 8;
  804. outData += 8;
  805. }
  806. #undef COMPUTE
  807. #undef WRITE
  808. #undef COMPUTE_STAGE1_ROW
  809. #undef COMPUTE_STAGE2_ROW
  810. #undef COMPUTE_STAGE3_ROW
  811. #undef COMPUTE_STAGE1_GROUP
  812. #undef COMPUTE_STAGE2_GROUP
  813. #undef COMPUTE_STAGE3_GROUP
  814. #undef COMPUTE_GROUP
  815. #undef WRITE_GROUP
  816. }
  817. #endif
  818. // Transform a long array of FourVectors by a given matrix.
  819. void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  820. {
  821. Assert(numVectors > 0);
  822. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  823. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  824. #ifdef _X360
  825. // The really fast version of this function likes to operate on blocks of eight. So, chug through
  826. // groups of eight, then deal with any leftovers.
  827. int numVectorsRoundedToNearestEight = numVectors & (~0x07);
  828. if (numVectors >= 8)
  829. {
  830. // aligned?
  831. if ((reinterpret_cast<unsigned int>(pVectors) & 127) == 0 && (reinterpret_cast<unsigned int>(pOut) & 127) == 0)
  832. {
  833. FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
  834. }
  835. else
  836. {
  837. FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
  838. }
  839. numVectors -= numVectorsRoundedToNearestEight;
  840. pVectors += numVectorsRoundedToNearestEight;
  841. pOut += numVectorsRoundedToNearestEight;
  842. }
  843. #endif
  844. // any left over?
  845. if (numVectors > 0)
  846. {
  847. // Splat out each of the entries in the matrix to a fltx4. Do this
  848. // in the order that we will need them, to hide latency. I'm
  849. // avoiding making an array of them, so that they'll remain in
  850. // registers.
  851. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  852. matSplat10, matSplat11, matSplat12, matSplat13,
  853. matSplat20, matSplat21, matSplat22, matSplat23;
  854. {
  855. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  856. // often unaligned. The w components will be the transpose row of
  857. // the matrix.
  858. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  859. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  860. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  861. matSplat00 = SplatXSIMD(matCol0);
  862. matSplat01 = SplatYSIMD(matCol0);
  863. matSplat02 = SplatZSIMD(matCol0);
  864. matSplat03 = SplatWSIMD(matCol0);
  865. matSplat10 = SplatXSIMD(matCol1);
  866. matSplat11 = SplatYSIMD(matCol1);
  867. matSplat12 = SplatZSIMD(matCol1);
  868. matSplat13 = SplatWSIMD(matCol1);
  869. matSplat20 = SplatXSIMD(matCol2);
  870. matSplat21 = SplatYSIMD(matCol2);
  871. matSplat22 = SplatZSIMD(matCol2);
  872. matSplat23 = SplatWSIMD(matCol2);
  873. }
  874. do
  875. {
  876. // Trust in the compiler to schedule these operations correctly:
  877. pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
  878. pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
  879. pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
  880. ++pOut;
  881. ++pVectors;
  882. --numVectors;
  883. } while(numVectors > 0);
  884. }
  885. }
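// Illustrative usage sketch (function and buffer names are hypothetical): transform one
// buffer into a separate, non-overlapping output buffer; the leftover loop above handles
// counts that are not a multiple of eight.
/*
void TransformParticleBlock( FourVectors *pSrc, FourVectors *pDst, unsigned int nGroups, const matrix3x4_t &xform )
{
	// pSrc and pDst must not overlap (enforced by the AssertMsg at the top of TransformManyBy)
	FourVectors::TransformManyBy( pSrc, nGroups, xform, pDst );
}
*/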
  886. #ifdef _X360
  887. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
  888. static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  889. {
  890. Assert(numVectors > 0);
  891. if ( numVectors == 0 )
  892. return;
  893. // Prefetch line 1 and 2
  894. __dcbt(0,pVectors);
  895. __dcbt(128,pVectors);
  896. // Splat out each of the entries in the matrix to a fltx4. Do this
  897. // in the order that we will need them, to hide latency. I'm
  898. // avoiding making an array of them, so that they'll remain in
  899. // registers.
  900. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  901. matSplat10, matSplat11, matSplat12, matSplat13,
  902. matSplat20, matSplat21, matSplat22, matSplat23;
  903. {
  904. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  905. // often unaligned. The w components will be the transpose row of
  906. // the matrix.
  907. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  908. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  909. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  910. matSplat00 = SplatXSIMD(matCol0);
  911. matSplat01 = SplatYSIMD(matCol0);
  912. matSplat02 = SplatZSIMD(matCol0);
  913. matSplat03 = SplatWSIMD(matCol0);
  914. matSplat10 = SplatXSIMD(matCol1);
  915. matSplat11 = SplatYSIMD(matCol1);
  916. matSplat12 = SplatZSIMD(matCol1);
  917. matSplat13 = SplatWSIMD(matCol1);
  918. matSplat20 = SplatXSIMD(matCol2);
  919. matSplat21 = SplatYSIMD(matCol2);
  920. matSplat22 = SplatZSIMD(matCol2);
  921. matSplat23 = SplatWSIMD(matCol2);
  922. }
  923. // this macro defines how to compute a specific row from an input and certain splat columns
  924. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  925. #define WRITE(term, reg, toptr) toptr->term = reg
  926. // define result groups (we're going to have an eight-way unroll)
  927. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  928. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  929. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  930. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  931. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  932. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  933. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  934. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  935. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  936. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  937. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  938. /*
  939. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  940. res0X = MulSIMD( (invec)->y, matSplat01 );
  941. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  942. // stage 2 -- 3 clocks for xyz
  943. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  944. // stage 3 -- 3 clocks for xyz
  945. res0X = AddSIMD(res0X, res0Temp);
  946. */
  947. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  948. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  949. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  950. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  951. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  952. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  953. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  954. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  955. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  956. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  957. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  958. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  959. const FourVectors * const RESTRICT STOP = pVectors + numVectors;
  960. // Use techniques of loop scheduling to eliminate data hazards; process
  961. // eight groups simultaneously so that we never have any operations stalling
  962. // waiting for data.
  963. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  964. // that it does all of its loads, then all computation, then writes everything
  965. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  966. // stage 3, and write, then throughput could be higher (probably by about 50%).
  967. while (pVectors < STOP)
  968. {
  969. // start prefetching the three cache lines
  970. // we'll hit two iterations from now
  971. __dcbt( sizeof(FourVectors) * 16, pVectors );
  972. __dcbt( sizeof(FourVectors) * 16 + 128, pVectors );
  973. __dcbt( sizeof(FourVectors) * 16 + 256, pVectors );
  974. // synchro
  975. COMPUTE_STAGE1_GROUP(res0, pVectors + 0);
  976. COMPUTE_STAGE1_GROUP(res1, pVectors + 1);
  977. COMPUTE_STAGE1_GROUP(res2, pVectors + 2);
  978. COMPUTE_STAGE1_GROUP(res3, pVectors + 3);
  979. COMPUTE_STAGE2_GROUP(res0, pVectors + 0);
  980. COMPUTE_STAGE1_GROUP(res4, pVectors + 4);
  981. COMPUTE_STAGE2_GROUP(res1, pVectors + 1);
  982. COMPUTE_STAGE1_GROUP(res5, pVectors + 5);
  983. COMPUTE_STAGE2_GROUP(res2, pVectors + 2);
  984. COMPUTE_STAGE1_GROUP(res6, pVectors + 6);
  985. COMPUTE_STAGE2_GROUP(res3, pVectors + 3);
  986. COMPUTE_STAGE1_GROUP(res7, pVectors + 7);
  987. COMPUTE_STAGE3_GROUP(res0, pVectors + 0);
  988. COMPUTE_STAGE2_GROUP(res4, pVectors + 4);
  989. COMPUTE_STAGE3_GROUP(res1, pVectors + 1);
  990. COMPUTE_STAGE2_GROUP(res5, pVectors + 5);
  991. COMPUTE_STAGE3_GROUP(res2, pVectors + 2);
  992. COMPUTE_STAGE2_GROUP(res6, pVectors + 6);
  993. COMPUTE_STAGE3_GROUP(res3, pVectors + 3);
  994. COMPUTE_STAGE2_GROUP(res7, pVectors + 7);
  995. COMPUTE_STAGE3_GROUP(res4, pVectors + 4);
  996. WRITE_GROUP( pVectors + 0, res0 );
  997. COMPUTE_STAGE3_GROUP(res5, pVectors + 5);
  998. WRITE_GROUP( pVectors + 1, res1 );
  999. COMPUTE_STAGE3_GROUP(res6, pVectors + 6);
  1000. WRITE_GROUP( pVectors + 2, res2 );
  1001. COMPUTE_STAGE3_GROUP(res7, pVectors + 7);
  1002. WRITE_GROUP( pVectors + 3, res3 );
  1003. WRITE_GROUP( pVectors + 4, res4 );
  1004. WRITE_GROUP( pVectors + 5, res5 );
  1005. WRITE_GROUP( pVectors + 6, res6 );
  1006. WRITE_GROUP( pVectors + 7, res7 );
  1007. pVectors += 8;
  1008. }
  1009. #undef COMPUTE
  1010. #undef WRITE
  1011. #undef COMPUTE_STAGE1_ROW
  1012. #undef COMPUTE_STAGE2_ROW
  1013. #undef COMPUTE_STAGE3_ROW
  1014. #undef COMPUTE_STAGE1_GROUP
  1015. #undef COMPUTE_STAGE2_GROUP
  1016. #undef COMPUTE_STAGE3_GROUP
  1017. #undef COMPUTE_GROUP
  1018. #undef WRITE_GROUP
  1019. }
  1020. #endif
  1021. // In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors
  1022. // to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed
  1023. // to alias. (Simply un-restricting the pointers results in very poor scheduling.)
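// A minimal usage sketch (illustrative names only; assumes TransformManyBy is
// exposed as a static member in ssemath.h, as the definition below suggests):
//
//   FourVectors packedVerts[NUM_GROUPS];  // each element carries four (x,y,z) vectors
//   matrix3x4_t bone;                     // rotation + translation
//   FourVectors::TransformManyBy( packedVerts, NUM_GROUPS, bone );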
  1024. void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  1025. {
  1026. Assert(numVectors > 0);
  1027. #ifdef _X360
  1028. // The really fast version of this function likes to operate on blocks of eight. So, chug through
  1029. // groups of eight, then deal with any leftovers.
  1030. int numVectorsRoundedToNearestEight = numVectors & (~0x07);
  1031. if (numVectors >= 8)
  1032. {
  1033. FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix);
  1034. numVectors -= numVectorsRoundedToNearestEight;
  1035. pVectors += numVectorsRoundedToNearestEight;
  1036. }
  1037. #endif
  1038. // any left over?
  1039. if (numVectors > 0)
  1040. {
  1041. // Splat out each of the entries in the matrix to a fltx4. Do this
  1042. // in the order that we will need them, to hide latency. I'm
  1043. // avoiding making an array of them, so that they'll remain in
  1044. // registers.
  1045. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  1046. matSplat10, matSplat11, matSplat12, matSplat13,
  1047. matSplat20, matSplat21, matSplat22, matSplat23;
  1048. {
  1049. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  1050. // often unaligned. The w component of each loaded row holds that
  1051. // row's translation term (the fourth column of the matrix).
  1052. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  1053. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  1054. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  1055. matSplat00 = SplatXSIMD(matCol0);
  1056. matSplat01 = SplatYSIMD(matCol0);
  1057. matSplat02 = SplatZSIMD(matCol0);
  1058. matSplat03 = SplatWSIMD(matCol0);
  1059. matSplat10 = SplatXSIMD(matCol1);
  1060. matSplat11 = SplatYSIMD(matCol1);
  1061. matSplat12 = SplatZSIMD(matCol1);
  1062. matSplat13 = SplatWSIMD(matCol1);
  1063. matSplat20 = SplatXSIMD(matCol2);
  1064. matSplat21 = SplatYSIMD(matCol2);
  1065. matSplat22 = SplatZSIMD(matCol2);
  1066. matSplat23 = SplatWSIMD(matCol2);
  1067. }
  1068. do
  1069. {
  1070. fltx4 resultX, resultY, resultZ;
  1071. // Trust in the compiler to schedule these operations correctly:
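// Each result lane is dot( row_i, (x, y, z, 1) ): start from the splatted
// translation term and chain multiply-adds for the x, y and z products.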
  1072. resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
  1073. resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
  1074. resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
  1075. pVectors->x = resultX;
  1076. pVectors->y = resultY;
  1077. pVectors->z = resultZ;
  1078. ++pVectors;
  1079. --numVectors;
  1080. } while(numVectors > 0);
  1081. }
  1082. }
  1083. #endif
  1084. // Transform many (horizontal) points in-place by a 3x4 matrix,
  1085. // here already loaded onto three fltx4 registers but not transposed.
  1086. // The points must be stored as 16-byte aligned. They are points
  1087. // and not vectors because we assume the w-component to be 1.
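// A minimal usage sketch (illustrative names only): split the matrix into row
// registers first, then hand the whole point array to this routine.
//
//   fltx4 row0 = LoadUnalignedSIMD( xform[0] );
//   fltx4 row1 = LoadUnalignedSIMD( xform[1] );
//   fltx4 row2 = LoadUnalignedSIMD( xform[2] );
//   TransformManyPointsBy( pPoints, numPoints, row0, row1, row2 ); // pPoints: VectorAligned*, 16-byte aligned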
  1088. #ifdef _X360
  1089. void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
  1090. {
  1091. /**************************************************
  1092. * Here is an elaborate and carefully scheduled *
  1093. * algorithm nicked from xboxmath.inl and hacked *
  1094. * up for 3x4 matrices. *
  1095. **************************************************/
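// Structure: a prologue transforms single points up to the first 128-byte
// (cache line) boundary, the main loop then consumes whole groups of four
// cache lines (32 points, eight per inner pass) while prefetching the group
// after it, and an epilogue finishes whatever is left past the last full group.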
  1096. COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned must be 16 bytes, same as XMFLOAT4
  1097. XMVECTOR R0[8], R1[8], R2[8];
  1098. XMVECTOR vIn[8];
  1099. // C_ASSERT(UnrollCount == 8);
  1100. // C_ASSERT(sizeof(XMFLOAT4) == 16);
  1101. Assert(pVectors);
  1102. Assert(((UINT_PTR)pVectors & 15) == 0); // assert 16-byte alignment; the vector loads/stores below assume it
  1103. UINT GroupIndex;
  1104. VectorAligned * RESTRICT vCurrent = pVectors;
  1105. // sentinel pointers
  1106. VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd;
  1107. {
  1108. // cook up the pointers from integer math. Necessary because otherwise we take
  1109. // load-hit-store (LHS) stalls all over the place. (Odd that this doesn't happen to the xbox math.)
  1110. UINT_PTR InputVector = (UINT_PTR)pVectors;
  1111. UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
  1112. // compute the first cache-line (128-byte) aligned address at or after the start, and the end of the last whole group of four cache lines
  1113. UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
  1114. UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));
  1115. vStreamEnd = (VectorAligned *)InputStreamEnd;
  1116. vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase;
  1117. vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd;
  1118. }
  1119. __dcbt(0, vStreamGroupBase);
  1120. __dcbt(XM_CACHE_LINE_SIZE, vStreamGroupBase);
  1121. __dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
  1122. __dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);
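// Prologue: handle leading points one at a time until vCurrent reaches the
// cache-line-aligned group base. __vmsum4fp replicates the 4-way dot product
// across all four lanes, and __stvewx stores only the 32-bit lane selected by
// the target address, so x, y and z are written back individually without
// touching the pad word of the VectorAligned.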
  1123. while (vCurrent < vStreamGroupBase)
  1124. {
  1125. fltx4 vec = __lvx(vCurrent->Base(), 0);
  1126. R0[0] = __vmsum4fp(vec, mRow0);
  1127. R1[0] = __vmsum4fp(vec, mRow1);
  1128. R2[0] = __vmsum4fp(vec, mRow2);
  1129. __stvewx(R0[0], vCurrent->Base(), 0);
  1130. __stvewx(R1[0], vCurrent->Base(), 4);
  1131. __stvewx(R2[0], vCurrent->Base(), 8);
  1132. vCurrent++;
  1133. }
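// Main loop: eight points per inner pass -- issue all eight loads first, then
// all 24 dot products, then all 24 element stores, so no store has to wait on
// a load that is still in flight.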
  1134. while (vCurrent < vStreamGroupEnd)
  1135. {
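// prefetch the four cache lines of the group after this one
// (4..7 cache lines ahead of the current read pointer)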
  1136. __dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
  1137. __dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
  1138. __dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
  1139. __dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);
  1140. for (GroupIndex = 0; GroupIndex < 4; GroupIndex++)
  1141. {
  1142. // all kinds of load-hit-store (LHS) stalls on this pointer. Why?
  1143. VectorAligned* OutputVector = vCurrent;
  1144. vIn[0] = __lvx(vCurrent->Base(), 0);
  1145. vCurrent++;
  1146. vIn[1] = __lvx(vCurrent->Base(), 0);
  1147. vCurrent++;
  1148. vIn[2] = __lvx(vCurrent->Base(), 0);
  1149. vCurrent++;
  1150. vIn[3] = __lvx(vCurrent->Base(), 0);
  1151. vCurrent++;
  1152. vIn[4] = __lvx(vCurrent->Base(), 0);
  1153. vCurrent++;
  1154. vIn[5] = __lvx(vCurrent->Base(), 0);
  1155. vCurrent++;
  1156. vIn[6] = __lvx(vCurrent->Base(), 0);
  1157. vCurrent++;
  1158. vIn[7] = __lvx(vCurrent->Base(), 0);
  1159. vCurrent++;
  1160. R0[0] = __vmsum4fp(vIn[0], mRow0);
  1161. R1[0] = __vmsum4fp(vIn[0], mRow1);
  1162. R2[0] = __vmsum4fp(vIn[0], mRow2);
  1163. R0[1] = __vmsum4fp(vIn[1], mRow0);
  1164. R1[1] = __vmsum4fp(vIn[1], mRow1);
  1165. R2[1] = __vmsum4fp(vIn[1], mRow2);
  1166. R0[2] = __vmsum4fp(vIn[2], mRow0);
  1167. R1[2] = __vmsum4fp(vIn[2], mRow1);
  1168. R2[2] = __vmsum4fp(vIn[2], mRow2);
  1169. R0[3] = __vmsum4fp(vIn[3], mRow0);
  1170. R1[3] = __vmsum4fp(vIn[3], mRow1);
  1171. R2[3] = __vmsum4fp(vIn[3], mRow2);
  1172. R0[4] = __vmsum4fp(vIn[4], mRow0);
  1173. R1[4] = __vmsum4fp(vIn[4], mRow1);
  1174. R2[4] = __vmsum4fp(vIn[4], mRow2);
  1175. R0[5] = __vmsum4fp(vIn[5], mRow0);
  1176. R1[5] = __vmsum4fp(vIn[5], mRow1);
  1177. R2[5] = __vmsum4fp(vIn[5], mRow2);
  1178. R0[6] = __vmsum4fp(vIn[6], mRow0);
  1179. R1[6] = __vmsum4fp(vIn[6], mRow1);
  1180. R2[6] = __vmsum4fp(vIn[6], mRow2);
  1181. R0[7] = __vmsum4fp(vIn[7], mRow0);
  1182. R1[7] = __vmsum4fp(vIn[7], mRow1);
  1183. R2[7] = __vmsum4fp(vIn[7], mRow2);
  1184. __stvewx(R0[0], OutputVector, 0);
  1185. __stvewx(R1[0], OutputVector, 4);
  1186. __stvewx(R2[0], OutputVector, 8);
  1187. OutputVector++;
  1188. __stvewx(R0[1], OutputVector, 0);
  1189. __stvewx(R1[1], OutputVector, 4);
  1190. __stvewx(R2[1], OutputVector, 8);
  1191. OutputVector++;
  1192. __stvewx(R0[2], OutputVector, 0);
  1193. __stvewx(R1[2], OutputVector, 4);
  1194. __stvewx(R2[2], OutputVector, 8);
  1195. OutputVector++;
  1196. __stvewx(R0[3], OutputVector, 0);
  1197. __stvewx(R1[3], OutputVector, 4);
  1198. __stvewx(R2[3], OutputVector, 8);
  1199. OutputVector++;
  1200. __stvewx(R0[4], OutputVector, 0);
  1201. __stvewx(R1[4], OutputVector, 4);
  1202. __stvewx(R2[4], OutputVector, 8);
  1203. OutputVector++;
  1204. __stvewx(R0[5], OutputVector, 0);
  1205. __stvewx(R1[5], OutputVector, 4);
  1206. __stvewx(R2[5], OutputVector, 8);
  1207. OutputVector++;
  1208. __stvewx(R0[6], OutputVector, 0);
  1209. __stvewx(R1[6], OutputVector, 4);
  1210. __stvewx(R2[6], OutputVector, 8);
  1211. OutputVector++;
  1212. __stvewx(R0[7], OutputVector, 0);
  1213. __stvewx(R1[7], OutputVector, 4);
  1214. __stvewx(R2[7], OutputVector, 8);
  1215. OutputVector++;
  1216. }
  1217. }
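// Epilogue: transform whatever is left past the last whole group of four
// cache lines, one point at a time.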
  1218. while (vCurrent < vStreamEnd)
  1219. {
  1220. vIn[0] = __lvx(vCurrent->Base(), 0);
  1221. R0[0] = __vmsum4fp(vIn[0], mRow0);
  1222. R1[0] = __vmsum4fp(vIn[0], mRow1);
  1223. R2[0] = __vmsum4fp(vIn[0], mRow2);
  1224. __stvewx(R0[0], vCurrent->Base(), 0);
  1225. __stvewx(R1[0], vCurrent->Base(), 4);
  1226. __stvewx(R2[0], vCurrent->Base(), 8);
  1227. vCurrent++;
  1228. }
  1229. }
  1230. #endif // _X360
  1231. #endif // #if !defined(__SPU__)