Team Fortress 2 Source Code as of 22/4/2020


  1. //========= Copyright Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose:
  4. //
  5. //===========================================================================//
  6. #include "mathlib/ssemath.h"
  7. #include "mathlib/ssequaternion.h"
  8. const fltx4 Four_PointFives={0.5,0.5,0.5,0.5};
  9. #ifndef _X360
  10. const fltx4 Four_Zeros={0.0,0.0,0.0,0.0};
  11. const fltx4 Four_Ones={1.0,1.0,1.0,1.0};
  12. #endif
  13. const fltx4 Four_Twos={2.0,2.0,2.0,2.0};
  14. const fltx4 Four_Threes={3.0,3.0,3.0,3.0};
  15. const fltx4 Four_Fours={4.0,4.0,4.0,4.0};
  16. const fltx4 Four_Origin={0,0,0,1};
  17. const fltx4 Four_NegativeOnes={-1,-1,-1,-1};
  18. const fltx4 Four_2ToThe21s={ (float) (1<<21), (float) (1<<21), (float) (1<<21), (float)(1<<21) };
  19. const fltx4 Four_2ToThe22s={ (float) (1<<22), (float) (1<<22), (float) (1<<22), (float)(1<<22) };
  20. const fltx4 Four_2ToThe23s={ (float) (1<<23), (float) (1<<23), (float) (1<<23), (float)(1<<23) };
  21. const fltx4 Four_2ToThe24s={ (float) (1<<24), (float) (1<<24), (float) (1<<24), (float)(1<<24) };
  22. const fltx4 Four_Point225s={ .225, .225, .225, .225 };
  23. const fltx4 Four_Epsilons={FLT_EPSILON,FLT_EPSILON,FLT_EPSILON,FLT_EPSILON};
  24. const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  25. const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
  26. const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };
  27. const fltx4 g_QuatMultRowSign[4] =
  28. {
  29. { 1.0f, 1.0f, -1.0f, 1.0f },
  30. { -1.0f, 1.0f, 1.0f, 1.0f },
  31. { 1.0f, -1.0f, 1.0f, 1.0f },
  32. { -1.0f, -1.0f, -1.0f, 1.0f }
  33. };
  34. const uint32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff};
  35. const uint32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
  36. const uint32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { 0xfffffffe, 0xfffffffe, 0xfffffffe, 0xfffffffe };
  37. const uint32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
  38. const uint32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; // ~0,~0,~0,~0
  39. const uint32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4
  40. const uint32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =
  41. {
  42. { 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF }
  43. };
  44. const uint32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =
  45. {
  46. { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
  47. { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 },
  48. { 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 },
  49. { 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 },
  50. };
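// A minimal usage sketch for the mask tables above (the helper name below is
// hypothetical, not part of this library): a mask row is loaded into a register
// and ANDed against a value to zero out the lanes that fall past the end of a
// partially filled group of four floats, using the LoadAlignedSIMD/AndSIMD
// helpers declared in ssemath.h.
static inline fltx4 SketchMaskPartialTail( const fltx4 &value, int nFloatsInLastGroup )
{
	// Row (n & 3) of g_SIMD_SkipTailMask keeps that many leading lanes; row 0 keeps all four.
	fltx4 keepMask = LoadAlignedSIMD( reinterpret_cast<const float *>( g_SIMD_SkipTailMask[ nFloatsInLastGroup & 3 ] ) );
	return AndSIMD( value, keepMask );	// masked-off lanes become 0.0f
}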
  51. // FUNCTIONS
  52. // NOTE: WHY YOU **DO NOT** WANT TO PUT FUNCTIONS HERE
  53. // Generally speaking, you want to make sure SIMD math functions
  54. // are inlined, because that gives the compiler much more latitude
  55. // in instruction scheduling. It's not that the overhead of calling
  56. // the function is particularly great; rather, many of the SIMD
  57. // opcodes have long latencies, and if you have a sequence of
  58. // several dependent ones inside a function call, the latencies
  59. // stack up to create a big penalty. If the function is inlined,
  60. // the compiler can interleave its operations with ones from the
  61. // caller to better hide those latencies. Finally, on the 360,
  62. // putting parameters or return values on the stack, and then
  63. // reading them back within the next forty cycles, is a very
  64. // severe penalty. So, as much as possible, you want to leave your
  65. // data on the registers.
  66. // That said, there are certain occasions where it is appropriate
  67. // to call into functions -- particularly for very large blocks
  68. // of code that will spill most of the registers anyway. Unless your
  69. // function is more than one screen long, yours is probably not one
  70. // of those occasions.
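// A minimal sketch of the pattern the note above argues for (illustrative only;
// in real use a helper like this belongs in ssemath.h, not in this file): keep
// short SIMD helpers FORCEINLINE so the compiler can interleave their
// long-latency multiplies with the caller's own work.
FORCEINLINE fltx4 SketchLerpSIMD( const fltx4 &a, const fltx4 &b, const fltx4 &t )
{
	// a + t * (b - a): one subtract plus one multiply-add; far too small to be
	// worth an out-of-line call and the scheduling barrier it would create.
	return MaddSIMD( t, SubSIMD( b, a ), a );
}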
  71. /// You can use this to rotate a long array of FourVectors all by the same
  72. /// matrix. The first parameter is the head of the array. The second is the
  73. /// number of vectors to rotate. The third is the matrix.
  74. void FourVectors::RotateManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  75. {
  76. Assert(numVectors > 0);
  77. if ( numVectors == 0 )
  78. return;
  79. // Splat out each of the entries in the matrix to a fltx4. Do this
  80. // in the order that we will need them, to hide latency. I'm
  81. // avoiding making an array of them, so that they'll remain in
  82. // registers.
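// (For reference: SplatXSIMD( [x y z w] ) broadcasts the first lane to give
// [x x x x]; SplatYSIMD/SplatZSIMD/SplatWSIMD do the same for the other lanes.)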
  83. fltx4 matSplat00, matSplat01, matSplat02,
  84. matSplat10, matSplat11, matSplat12,
  85. matSplat20, matSplat21, matSplat22;
  86. {
  87. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  88. // often unaligned. The w components will be the transpose row of
  89. // the matrix, but we don't really care about that.
  90. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  91. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  92. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  93. matSplat00 = SplatXSIMD(matCol0);
  94. matSplat01 = SplatYSIMD(matCol0);
  95. matSplat02 = SplatZSIMD(matCol0);
  96. matSplat10 = SplatXSIMD(matCol1);
  97. matSplat11 = SplatYSIMD(matCol1);
  98. matSplat12 = SplatZSIMD(matCol1);
  99. matSplat20 = SplatXSIMD(matCol2);
  100. matSplat21 = SplatYSIMD(matCol2);
  101. matSplat22 = SplatZSIMD(matCol2);
  102. }
  103. #ifdef _X360
  104. // Same algorithm as above, but the loop is unrolled to eliminate data hazard latencies
  105. // and simplify prefetching. Named variables are deliberately used instead of arrays to
  106. // ensure that the variables live on the registers instead of the stack (stack load/store
  107. // is a serious penalty on 360). Nb: for prefetching to be most efficient here, the
  108. // loop should be unrolled to 8 FourVectors per iteration; because each FourVectors is
  109. // 48 bytes long, 48 * 8 = 384 bytes, the least common multiple of 48 and the 128-byte cache line.
  110. // That way you can fetch the next 3 cache lines while you work on these three.
  111. // If you do go this route, be sure to disassemble and make sure it doesn't spill
  112. // registers to stack as you do this; the cost of that will be excessive. Unroll the loop
  113. // a little and just live with the fact that you'll be doing a couple of redundant dcbts
  114. // (they don't cost you anything). Be aware that all three cores share L2 and it can only
  115. // have eight cache lines fetching at a time.
  116. fltx4 outX0, outY0, outZ0; // bank one of outputs
  117. fltx4 outX1, outY1, outZ1; // bank two of outputs
  118. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  119. // rather than madds. (Empirically determined by timing.)
  120. const FourVectors * stop = pVectors + numVectors;
  121. FourVectors * RESTRICT pVectNext;
  122. // prime the pump.
  123. if (numVectors & 0x01)
  124. {
  125. // odd number of vectors to process
  126. // prime the 1 group of registers
  127. pVectNext = pVectors++;
  128. outX1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ), MulSIMD( pVectNext->z, matSplat02 ) );
  129. outY1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ), MulSIMD( pVectNext->z, matSplat12 ) );
  130. outZ1 = AddSIMD( AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ), MulSIMD( pVectNext->z, matSplat22 ) );
  131. }
  132. else
  133. {
  134. // even number of total vectors to process;
  135. // prime the zero group and jump into the middle of the loop
  136. outX0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ), MulSIMD( pVectors->z, matSplat02 ) );
  137. outY0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ), MulSIMD( pVectors->z, matSplat12 ) );
  138. outZ0 = AddSIMD( AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ), MulSIMD( pVectors->z, matSplat22 ) );
  139. goto EVEN_CASE;
  140. }
  141. // perform an even number of iterations through this loop.
  142. while (pVectors < stop)
  143. {
  144. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  145. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  146. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  147. pVectNext->x = outX1;
  148. pVectNext->y = outY1;
  149. pVectNext->z = outZ1;
  150. EVEN_CASE:
  151. pVectNext = pVectors+1;
  152. outX1 = MaddSIMD( pVectNext->z, matSplat02, AddSIMD( MulSIMD( pVectNext->x, matSplat00 ), MulSIMD( pVectNext->y, matSplat01 ) ) );
  153. outY1 = MaddSIMD( pVectNext->z, matSplat12, AddSIMD( MulSIMD( pVectNext->x, matSplat10 ), MulSIMD( pVectNext->y, matSplat11 ) ) );
  154. outZ1 = MaddSIMD( pVectNext->z, matSplat22, AddSIMD( MulSIMD( pVectNext->x, matSplat20 ), MulSIMD( pVectNext->y, matSplat21 ) ) );
  155. pVectors->x = outX0;
  156. pVectors->y = outY0;
  157. pVectors->z = outZ0;
  158. pVectors += 2;
  159. }
  160. // flush the last round of output
  161. pVectNext->x = outX1;
  162. pVectNext->y = outY1;
  163. pVectNext->z = outZ1;
  164. #else
  165. // PC does not benefit from the unroll/scheduling above
  166. fltx4 outX0, outY0, outZ0; // bank one of outputs
  167. // Because of instruction latencies and scheduling, it's actually faster to use adds and muls
  168. // rather than madds. (Empirically determined by timing.)
  169. const FourVectors * stop = pVectors + numVectors;
  170. // perform an even number of iterations through this loop.
  171. while (pVectors < stop)
  172. {
  173. outX0 = MaddSIMD( pVectors->z, matSplat02, AddSIMD( MulSIMD( pVectors->x, matSplat00 ), MulSIMD( pVectors->y, matSplat01 ) ) );
  174. outY0 = MaddSIMD( pVectors->z, matSplat12, AddSIMD( MulSIMD( pVectors->x, matSplat10 ), MulSIMD( pVectors->y, matSplat11 ) ) );
  175. outZ0 = MaddSIMD( pVectors->z, matSplat22, AddSIMD( MulSIMD( pVectors->x, matSplat20 ), MulSIMD( pVectors->y, matSplat21 ) ) );
  176. pVectors->x = outX0;
  177. pVectors->y = outY0;
  178. pVectors->z = outZ0;
  179. pVectors++;
  180. }
  181. #endif
  182. }
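// A minimal usage sketch (the wrapper and argument names are hypothetical,
// and it assumes RotateManyBy is the static member declared in ssemath.h):
// rotate a caller-owned SoA batch in place by a single matrix.
static void SketchRotateBatch( FourVectors *pBatch, unsigned int nGroups, const matrix3x4_t &boneToWorld )
{
	// Each FourVectors packs four vectors, so this rotates 4 * nGroups vectors.
	FourVectors::RotateManyBy( pBatch, nGroups, boneToWorld );
}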
  183. #ifdef _X360
  184. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
  185. void FourVectors_TransformManyGroupsOfEightBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  186. {
  187. Assert(numVectors > 0);
  188. if ( numVectors == 0 )
  189. return;
  190. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  191. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  192. // Splat out each of the entries in the matrix to a fltx4. Do this
  193. // in the order that we will need them, to hide latency. I'm
  194. // avoiding making an array of them, so that they'll remain in
  195. // registers.
  196. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  197. matSplat10, matSplat11, matSplat12, matSplat13,
  198. matSplat20, matSplat21, matSplat22, matSplat23;
  199. {
  200. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  201. // often unaligned. The w components will be the transpose row of
  202. // the matrix.
  203. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  204. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  205. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  206. matSplat00 = SplatXSIMD(matCol0);
  207. matSplat01 = SplatYSIMD(matCol0);
  208. matSplat02 = SplatZSIMD(matCol0);
  209. matSplat03 = SplatWSIMD(matCol0);
  210. matSplat10 = SplatXSIMD(matCol1);
  211. matSplat11 = SplatYSIMD(matCol1);
  212. matSplat12 = SplatZSIMD(matCol1);
  213. matSplat13 = SplatWSIMD(matCol1);
  214. matSplat20 = SplatXSIMD(matCol2);
  215. matSplat21 = SplatYSIMD(matCol2);
  216. matSplat22 = SplatZSIMD(matCol2);
  217. matSplat23 = SplatWSIMD(matCol2);
  218. }
  219. // this macro defines how to compute a specific row from an input and certain splat columns
  220. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  221. #define WRITE(term, reg, toptr) toptr->term = reg
  222. // define result groups (we're going to have an eight-way unroll)
  223. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  224. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  225. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  226. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  227. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  228. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  229. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  230. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  231. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  232. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  233. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  234. /*
  235. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  236. res0X = MulSIMD( (invec)->y, matSplat01 );
  237. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  238. // stage 2 -- 3 clocks for xyz
  239. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  240. // stage 3 -- 3 clocks for xyz
  241. res0X = AddSIMD(res0X, res0Temp);
  242. */
  243. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  244. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  245. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  246. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  247. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  248. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  249. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  250. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  251. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  252. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  253. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  254. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  255. FourVectors * RESTRICT inData = pVectors;
  256. FourVectors * RESTRICT outData = pOut;
  257. const FourVectors * const RESTRICT STOP = pVectors + numVectors;
  258. // Use techniques of loop scheduling to eliminate data hazards; process
  259. // eight groups simultaneously so that we never have any operations stalling
  260. // waiting for data.
  261. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  262. // that it does all of its loads, then all computation, then writes everything
  263. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  264. // stage 3, and write, then throughput could be higher (probably by about 50%).
  265. while (inData < STOP)
  266. {
  267. // start prefetching the three cache lines
  268. // we'll hit two iterations from now
  269. __dcbt( sizeof(FourVectors) * 16, inData );
  270. __dcbt( sizeof(FourVectors) * 16 + 128, inData );
  271. __dcbt( sizeof(FourVectors) * 16 + 256, inData );
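// (Worked numbers for the prefetch offsets above, assuming sizeof(FourVectors) == 48:
// one iteration consumes 8 * 48 = 384 bytes, exactly three 128-byte cache lines,
// so the data two iterations ahead starts 16 * 48 = 768 bytes in, and the offsets
// 768, 768 + 128 and 768 + 256 touch exactly those three lines.)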
  272. // synchro
  273. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  274. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  275. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  276. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  277. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  278. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  279. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  280. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  281. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  282. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  283. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  284. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  285. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  286. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  287. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  288. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  289. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  290. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  291. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  292. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  293. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  294. WRITE_GROUP( outData + 0, res0 );
  295. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  296. WRITE_GROUP( outData + 1, res1 );
  297. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  298. WRITE_GROUP( outData + 2, res2 );
  299. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  300. WRITE_GROUP( outData + 3, res3 );
  301. WRITE_GROUP( outData + 4, res4 );
  302. WRITE_GROUP( outData + 5, res5 );
  303. WRITE_GROUP( outData + 6, res6 );
  304. WRITE_GROUP( outData + 7, res7 );
  305. inData += 8;
  306. outData += 8;
  307. }
  308. #undef COMPUTE
  309. #undef WRITE
  310. #undef COMPUTE_STAGE1_ROW
  311. #undef COMPUTE_STAGE2_ROW
  312. #undef COMPUTE_STAGE3_ROW
  313. #undef COMPUTE_STAGE1_GROUP
  314. #undef COMPUTE_STAGE2_GROUP
  315. #undef COMPUTE_STAGE3_GROUP
  316. #undef COMPUTE_GROUP
  317. #undef WRITE_GROUP
  318. }
  319. #ifdef _X360
  320. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently. This is the version
  321. // to call when starting on a 128-byte-aligned address.
  322. void FourVectors_TransformManyGroupsOfEightBy_128byteAligned(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  323. {
  324. /* If this has changed, you will need to change all the prefetches, *
  325. * and groups of eight are no longer the ideal unit for iterating *
  326. * on many vectors. */
  327. COMPILE_TIME_ASSERT( sizeof(FourVectors) == 48 ) ;
  328. Assert(numVectors > 0);
  329. if ( numVectors == 0 )
  330. return;
  331. AssertMsg((numVectors & 0x07) == 0, "FourVectors_TransformManyGroupsOfEight called with numVectors % 8 != 0!");
  332. // Assert alignment
  333. AssertMsg( ( ( reinterpret_cast<uint32>( pVectors ) & 127 ) == 0) &&
  334. ( ( reinterpret_cast<uint32>(pOut) & 127 ) == 0),
  335. "FourVectors_Transform..aligned called with non-128-byte-aligned buffers." );
  336. // Assert non overlap
  337. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  338. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  339. // Here's the plan. 8 four-vecs = 3 cache lines exactly. It takes about 400 cycles to process a group
  340. // of eight, and cache latency is 600 cycles, so we try to prefetch two iterations ahead (eg fetch
  341. // iteration 3 while working on iteration 1). In the case of the output, we can simply zero-flush
  342. // the cache lines since we are sure to write into them. Because we're reading and fetching two ahead,
  343. // we want to stop two away from the last iteration.
  344. // No matter what, we will need to prefetch the first two groups of eight of input (that's the
  345. // first six cache lines)
  346. __dcbt( 0, pVectors );
  347. __dcbt( 128, pVectors );
  348. __dcbt( 256, pVectors );
  349. __dcbt( 384, pVectors );
  350. __dcbt( 512, pVectors );
  351. __dcbt( 640, pVectors );
  352. // Splat out each of the entries in the matrix to a fltx4. Do this
  353. // in the order that we will need them, to hide latency. I'm
  354. // avoiding making an array of them, so that they'll remain in
  355. // registers.
  356. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  357. matSplat10, matSplat11, matSplat12, matSplat13,
  358. matSplat20, matSplat21, matSplat22, matSplat23;
  359. {
  360. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  361. // often unaligned. The w components will be the transpose row of
  362. // the matrix.
  363. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  364. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  365. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  366. matSplat00 = SplatXSIMD(matCol0);
  367. matSplat01 = SplatYSIMD(matCol0);
  368. matSplat02 = SplatZSIMD(matCol0);
  369. matSplat03 = SplatWSIMD(matCol0);
  370. matSplat10 = SplatXSIMD(matCol1);
  371. matSplat11 = SplatYSIMD(matCol1);
  372. matSplat12 = SplatZSIMD(matCol1);
  373. matSplat13 = SplatWSIMD(matCol1);
  374. matSplat20 = SplatXSIMD(matCol2);
  375. matSplat21 = SplatYSIMD(matCol2);
  376. matSplat22 = SplatZSIMD(matCol2);
  377. matSplat23 = SplatWSIMD(matCol2);
  378. }
  379. // this macro defines how to compute a specific row from an input and certain splat columns
  380. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  381. #define WRITE(term, reg, toptr) toptr->term = reg
  382. // define result groups (we're going to have an eight-way unroll)
  383. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  384. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  385. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  386. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  387. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  388. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  389. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  390. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  391. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  392. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  393. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  394. /*
  395. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  396. res0X = MulSIMD( (invec)->y, matSplat01 );
  397. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  398. // stage 2 -- 3 clocks for xyz
  399. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  400. // stage 3 -- 3 clocks for xyz
  401. res0X = AddSIMD(res0X, res0Temp);
  402. */
  403. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  404. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  405. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  406. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  407. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  408. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  409. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  410. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  411. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  412. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  413. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  414. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  415. // Okay. First do all but the last two turns of the crank; we don't want to overshoot with the flush-to-zero.
  416. FourVectors * RESTRICT inData = pVectors;
  417. FourVectors * RESTRICT outData = pOut;
  418. const FourVectors * RESTRICT STOP;
  419. if (numVectors > 16)
  420. {
  421. STOP = pVectors + numVectors - 16;
  422. // flush the first two blocks we'll write into
  423. __dcbz128( 0, outData );
  424. __dcbz128( 128, outData );
  425. __dcbz128( 256, outData );
  426. while (inData < STOP)
  427. {
  428. // start prefetching the three cache lines
  429. // we'll hit two iterations from now
  430. __dcbt( sizeof(FourVectors) * 16, inData );
  431. __dcbt( sizeof(FourVectors) * 16 + 128, inData );
  432. __dcbt( sizeof(FourVectors) * 16 + 256, inData );
  433. // synchro
  434. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  435. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  436. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  437. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  438. // pre-zero the three cache lines we'll overwrite
  439. // in the next iteration
  440. __dcbz128( 384, outData );
  441. __dcbz128( 512, outData );
  442. __dcbz128( 640, outData );
  443. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  444. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  445. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  446. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  447. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  448. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  449. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  450. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  451. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  452. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  453. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  454. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  455. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  456. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  457. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  458. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  459. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  460. WRITE_GROUP( outData + 0, res0 );
  461. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  462. WRITE_GROUP( outData + 1, res1 );
  463. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  464. WRITE_GROUP( outData + 2, res2 );
  465. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  466. WRITE_GROUP( outData + 3, res3 );
  467. WRITE_GROUP( outData + 4, res4 );
  468. WRITE_GROUP( outData + 5, res5 );
  469. WRITE_GROUP( outData + 6, res6 );
  470. WRITE_GROUP( outData + 7, res7 );
  471. inData += 8;
  472. outData += 8;
  473. }
  474. }
  475. else if (numVectors == 16)
  476. {
  477. // zero out the exactly six cache lines we will write into
  478. __dcbz128( 0, outData );
  479. __dcbz128( 128, outData );
  480. __dcbz128( 256, outData );
  481. __dcbz128( 384, outData );
  482. __dcbz128( 512, outData );
  483. __dcbz128( 640, outData );
  484. }
  485. else if (numVectors == 8)
  486. {
  487. // zero out the exactly three cache lines we will write into
  488. __dcbz128( 0, outData );
  489. __dcbz128( 128, outData );
  490. __dcbz128( 256, outData );
  491. }
  492. else
  493. {
  494. AssertMsg(false, "Can't happen!");
  495. }
  496. // deal with the ultimate two groups (or, if we were fed
  497. // less than 16 groups, the whole shebang)
  498. STOP = pVectors + numVectors;
  499. // Use techniques of loop scheduling to eliminate data hazards; process
  500. // eight groups simultaneously so that we never have any operations stalling
  501. // waiting for data.
  502. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  503. // that it does all of its loads, then all computation, then writes everything
  504. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  505. // stage 3, and write, then throughput could be higher (probably by about 50%).
  506. while (inData < STOP)
  507. {
  508. // synchro
  509. COMPUTE_STAGE1_GROUP(res0, inData + 0);
  510. COMPUTE_STAGE1_GROUP(res1, inData + 1);
  511. COMPUTE_STAGE1_GROUP(res2, inData + 2);
  512. COMPUTE_STAGE1_GROUP(res3, inData + 3);
  513. COMPUTE_STAGE2_GROUP(res0, inData + 0);
  514. COMPUTE_STAGE1_GROUP(res4, inData + 4);
  515. COMPUTE_STAGE2_GROUP(res1, inData + 1);
  516. COMPUTE_STAGE1_GROUP(res5, inData + 5);
  517. COMPUTE_STAGE2_GROUP(res2, inData + 2);
  518. COMPUTE_STAGE1_GROUP(res6, inData + 6);
  519. COMPUTE_STAGE2_GROUP(res3, inData + 3);
  520. COMPUTE_STAGE1_GROUP(res7, inData + 7);
  521. COMPUTE_STAGE3_GROUP(res0, inData + 0);
  522. COMPUTE_STAGE2_GROUP(res4, inData + 4);
  523. COMPUTE_STAGE3_GROUP(res1, inData + 1);
  524. COMPUTE_STAGE2_GROUP(res5, inData + 5);
  525. COMPUTE_STAGE3_GROUP(res2, inData + 2);
  526. COMPUTE_STAGE2_GROUP(res6, inData + 6);
  527. COMPUTE_STAGE3_GROUP(res3, inData + 3);
  528. COMPUTE_STAGE2_GROUP(res7, inData + 7);
  529. COMPUTE_STAGE3_GROUP(res4, inData + 4);
  530. WRITE_GROUP( outData + 0, res0 );
  531. COMPUTE_STAGE3_GROUP(res5, inData + 5);
  532. WRITE_GROUP( outData + 1, res1 );
  533. COMPUTE_STAGE3_GROUP(res6, inData + 6);
  534. WRITE_GROUP( outData + 2, res2 );
  535. COMPUTE_STAGE3_GROUP(res7, inData + 7);
  536. WRITE_GROUP( outData + 3, res3 );
  537. WRITE_GROUP( outData + 4, res4 );
  538. WRITE_GROUP( outData + 5, res5 );
  539. WRITE_GROUP( outData + 6, res6 );
  540. WRITE_GROUP( outData + 7, res7 );
  541. inData += 8;
  542. outData += 8;
  543. }
  544. #undef COMPUTE
  545. #undef WRITE
  546. #undef COMPUTE_STAGE1_ROW
  547. #undef COMPUTE_STAGE2_ROW
  548. #undef COMPUTE_STAGE3_ROW
  549. #undef COMPUTE_STAGE1_GROUP
  550. #undef COMPUTE_STAGE2_GROUP
  551. #undef COMPUTE_STAGE3_GROUP
  552. #undef COMPUTE_GROUP
  553. #undef WRITE_GROUP
  554. }
  555. #endif
  556. // Transform a long array of FourVectors by a given matrix.
  557. void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix, FourVectors * RESTRICT pOut )
  558. {
  559. Assert(numVectors > 0);
  560. AssertMsg( (pOut < pVectors && pOut+numVectors <= pVectors) ||
  561. (pOut > pVectors && pVectors+numVectors <= pOut), "FourVectors::TransformManyBy called with overlapping buffer pointers." );
  562. #ifdef _X360
  563. // The really fast version of this function likes to operate on blocks of eight. So, chug through
  564. // groups of eight, then deal with any leftovers.
  565. int numVectorsRoundedToNearestEight = numVectors & (~0x07);
  566. if (numVectors >= 8)
  567. {
  568. // aligned?
  569. if ((reinterpret_cast<unsigned int>(pVectors) & 127) == 0 && (reinterpret_cast<unsigned int>(pOut) & 127) == 0)
  570. {
  571. FourVectors_TransformManyGroupsOfEightBy_128byteAligned(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
  572. }
  573. else
  574. {
  575. FourVectors_TransformManyGroupsOfEightBy(pVectors, numVectorsRoundedToNearestEight, rotationMatrix, pOut);
  576. }
  577. numVectors -= numVectorsRoundedToNearestEight;
  578. pVectors += numVectorsRoundedToNearestEight;
  579. pOut += numVectorsRoundedToNearestEight;
  580. }
  581. #endif
  582. // any left over?
  583. if (numVectors > 0)
  584. {
  585. // Splat out each of the entries in the matrix to a fltx4. Do this
  586. // in the order that we will need them, to hide latency. I'm
  587. // avoiding making an array of them, so that they'll remain in
  588. // registers.
  589. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  590. matSplat10, matSplat11, matSplat12, matSplat13,
  591. matSplat20, matSplat21, matSplat22, matSplat23;
  592. {
  593. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  594. // often unaligned. The w components will be the transpose row of
  595. // the matrix.
  596. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  597. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  598. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  599. matSplat00 = SplatXSIMD(matCol0);
  600. matSplat01 = SplatYSIMD(matCol0);
  601. matSplat02 = SplatZSIMD(matCol0);
  602. matSplat03 = SplatWSIMD(matCol0);
  603. matSplat10 = SplatXSIMD(matCol1);
  604. matSplat11 = SplatYSIMD(matCol1);
  605. matSplat12 = SplatZSIMD(matCol1);
  606. matSplat13 = SplatWSIMD(matCol1);
  607. matSplat20 = SplatXSIMD(matCol2);
  608. matSplat21 = SplatYSIMD(matCol2);
  609. matSplat22 = SplatZSIMD(matCol2);
  610. matSplat23 = SplatWSIMD(matCol2);
  611. }
  612. do
  613. {
  614. // Trust in the compiler to schedule these operations correctly:
  615. pOut->x = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
  616. pOut->y = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
  617. pOut->z = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
  618. ++pOut;
  619. ++pVectors;
  620. --numVectors;
  621. } while(numVectors > 0);
  622. }
  623. }
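// A minimal usage sketch for the overload above (names are hypothetical):
// transform a batch into a separate, non-overlapping output buffer, as the
// overlap assert at the top of the function requires.
static void SketchTransformBatch( const matrix3x4_t &objectToWorld,
	FourVectors * RESTRICT pLocalPos, FourVectors * RESTRICT pWorldPos, unsigned int nGroups )
{
	// pLocalPos and pWorldPos must not alias; RESTRICT promises that to the compiler.
	FourVectors::TransformManyBy( pLocalPos, nGroups, objectToWorld, pWorldPos );
}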
  624. #ifdef _X360
  625. // Loop-scheduled code to process FourVectors in groups of eight quite efficiently.
  626. static void FourVectors_TransformManyGroupsOfEightBy_InPlace(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  627. {
  628. Assert(numVectors > 0);
  629. if ( numVectors == 0 )
  630. return;
  631. // Prefetch line 1 and 2
  632. __dcbt(0,pVectors);
  633. __dcbt(128,pVectors);
  634. // Splat out each of the entries in the matrix to a fltx4. Do this
  635. // in the order that we will need them, to hide latency. I'm
  636. // avoiding making an array of them, so that they'll remain in
  637. // registers.
  638. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  639. matSplat10, matSplat11, matSplat12, matSplat13,
  640. matSplat20, matSplat21, matSplat22, matSplat23;
  641. {
  642. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  643. // often unaligned. The w components will be the transpose row of
  644. // the matrix.
  645. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  646. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  647. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  648. matSplat00 = SplatXSIMD(matCol0);
  649. matSplat01 = SplatYSIMD(matCol0);
  650. matSplat02 = SplatZSIMD(matCol0);
  651. matSplat03 = SplatWSIMD(matCol0);
  652. matSplat10 = SplatXSIMD(matCol1);
  653. matSplat11 = SplatYSIMD(matCol1);
  654. matSplat12 = SplatZSIMD(matCol1);
  655. matSplat13 = SplatWSIMD(matCol1);
  656. matSplat20 = SplatXSIMD(matCol2);
  657. matSplat21 = SplatYSIMD(matCol2);
  658. matSplat22 = SplatZSIMD(matCol2);
  659. matSplat23 = SplatWSIMD(matCol2);
  660. }
  661. // this macro defines how to compute a specific row from an input and certain splat columns
  662. #define COMPUTE(res, invec, xterm, yterm, zterm, transterm) res = AddSIMD( AddSIMD( MulSIMD((invec)->z, zterm), AddSIMD( MulSIMD( (invec)->x, xterm ), MulSIMD( (invec)->y, yterm ) ) ), transterm )
  663. #define WRITE(term, reg, toptr) toptr->term = reg
  664. // define result groups (we're going to have an eight-way unroll)
  665. fltx4 res0X, res0Y, res0Z, res0XTemp, res0YTemp, res0ZTemp; // 48 REGISTERS
  666. fltx4 res1X, res1Y, res1Z, res1XTemp, res1YTemp, res1ZTemp;
  667. fltx4 res2X, res2Y, res2Z, res2XTemp, res2YTemp, res2ZTemp;
  668. fltx4 res3X, res3Y, res3Z, res3XTemp, res3YTemp, res3ZTemp;
  669. fltx4 res4X, res4Y, res4Z, res4XTemp, res4YTemp, res4ZTemp;
  670. fltx4 res5X, res5Y, res5Z, res5XTemp, res5YTemp, res5ZTemp;
  671. fltx4 res6X, res6Y, res6Z, res6XTemp, res6YTemp, res6ZTemp;
  672. fltx4 res7X, res7Y, res7Z, res7XTemp, res7YTemp, res7ZTemp;
  673. // #define FROZ(out,in,offset) COMPUTE((out+offset)->x, (in + offset), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE((out + offset )->y, (in + offset), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE((out + offset)->z, (in + offset), matSplat20, matSplat21, matSplat22, matSplat23)
  674. #define COMPUTE_GROUP(resgroup,dataptr) COMPUTE(resgroup ## X, (dataptr), matSplat00, matSplat01, matSplat02, matSplat03); COMPUTE(resgroup ## Y, (dataptr), matSplat10, matSplat11, matSplat12, matSplat13); COMPUTE(resgroup ## Z, (dataptr), matSplat20, matSplat21, matSplat22, matSplat23)
  675. #define WRITE_GROUP(ptr, resgroup) (ptr)->x = resgroup ## X; (ptr)->y = resgroup ## Y; (ptr)->z = resgroup ## Z
  676. /*
  677. // stage 1 -- 6 ops for xyz, each w 12 cycle latency
  678. res0X = MulSIMD( (invec)->y, matSplat01 );
  679. res0Temp = MaddSIMD((invec)->z, matSplat02, matSplat03);
  680. // stage 2 -- 3 clocks for xyz
  681. res0X = MaddSIMD( (invec)->x, matSplat00, res0X );
  682. // stage 3 -- 3 clocks for xyz
  683. res0X = AddSIMD(res0X, res0Temp);
  684. */
  685. #define COMPUTE_STAGE1_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MulSIMD( (invec)->y, ysplat ); tempvar = MaddSIMD((invec)->z, zsplat, transplat)
  686. #define COMPUTE_STAGE2_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = MaddSIMD( (invec)->x, xsplat, res )
  687. #define COMPUTE_STAGE3_ROW(res, tempvar, invec, xsplat, ysplat, zsplat, transplat) res = AddSIMD(res, tempvar) // frees up the tempvar
  688. #define COMPUTE_STAGE1_GROUP(resgroup, invec) COMPUTE_STAGE1_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  689. COMPUTE_STAGE1_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  690. COMPUTE_STAGE1_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  691. #define COMPUTE_STAGE2_GROUP(resgroup, invec) COMPUTE_STAGE2_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  692. COMPUTE_STAGE2_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  693. COMPUTE_STAGE2_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  694. #define COMPUTE_STAGE3_GROUP(resgroup, invec) COMPUTE_STAGE3_ROW(resgroup ## X, resgroup ## X ## Temp, invec, matSplat00, matSplat01, matSplat02, matSplat03);\
  695. COMPUTE_STAGE3_ROW(resgroup ## Y, resgroup ## Y ## Temp, invec, matSplat10, matSplat11, matSplat12, matSplat13);\
  696. COMPUTE_STAGE3_ROW(resgroup ## Z, resgroup ## Z ## Temp, invec, matSplat20, matSplat21, matSplat22, matSplat23)
  697. const FourVectors * const RESTRICT STOP = pVectors + numVectors;
  698. // Use techniques of loop scheduling to eliminate data hazards; process
  699. // eight groups simultaneously so that we never have any operations stalling
  700. // waiting for data.
  701. // Note: this loop, while pretty fast, could be faster still -- you'll notice
  702. // that it does all of its loads, then all computation, then writes everything
  703. // out. If made truly cyclic, such that every line interleaved a stage 1, stage 2,
  704. // stage 3, and write, then throughput could be higher (probably by about 50%).
  705. while (pVectors < STOP)
  706. {
  707. // start prefetching the three cache lines
  708. // we'll hit two iterations from now
  709. __dcbt( sizeof(FourVectors) * 16, pVectors );
  710. __dcbt( sizeof(FourVectors) * 16 + 128, pVectors );
  711. __dcbt( sizeof(FourVectors) * 16 + 256, pVectors );
  712. // synchro
  713. COMPUTE_STAGE1_GROUP(res0, pVectors + 0);
  714. COMPUTE_STAGE1_GROUP(res1, pVectors + 1);
  715. COMPUTE_STAGE1_GROUP(res2, pVectors + 2);
  716. COMPUTE_STAGE1_GROUP(res3, pVectors + 3);
  717. COMPUTE_STAGE2_GROUP(res0, pVectors + 0);
  718. COMPUTE_STAGE1_GROUP(res4, pVectors + 4);
  719. COMPUTE_STAGE2_GROUP(res1, pVectors + 1);
  720. COMPUTE_STAGE1_GROUP(res5, pVectors + 5);
  721. COMPUTE_STAGE2_GROUP(res2, pVectors + 2);
  722. COMPUTE_STAGE1_GROUP(res6, pVectors + 6);
  723. COMPUTE_STAGE2_GROUP(res3, pVectors + 3);
  724. COMPUTE_STAGE1_GROUP(res7, pVectors + 7);
  725. COMPUTE_STAGE3_GROUP(res0, pVectors + 0);
  726. COMPUTE_STAGE2_GROUP(res4, pVectors + 4);
  727. COMPUTE_STAGE3_GROUP(res1, pVectors + 1);
  728. COMPUTE_STAGE2_GROUP(res5, pVectors + 5);
  729. COMPUTE_STAGE3_GROUP(res2, pVectors + 2);
  730. COMPUTE_STAGE2_GROUP(res6, pVectors + 6);
  731. COMPUTE_STAGE3_GROUP(res3, pVectors + 3);
  732. COMPUTE_STAGE2_GROUP(res7, pVectors + 7);
  733. COMPUTE_STAGE3_GROUP(res4, pVectors + 4);
  734. WRITE_GROUP( pVectors + 0, res0 );
  735. COMPUTE_STAGE3_GROUP(res5, pVectors + 5);
  736. WRITE_GROUP( pVectors + 1, res1 );
  737. COMPUTE_STAGE3_GROUP(res6, pVectors + 6);
  738. WRITE_GROUP( pVectors + 2, res2 );
  739. COMPUTE_STAGE3_GROUP(res7, pVectors + 7);
  740. WRITE_GROUP( pVectors + 3, res3 );
  741. WRITE_GROUP( pVectors + 4, res4 );
  742. WRITE_GROUP( pVectors + 5, res5 );
  743. WRITE_GROUP( pVectors + 6, res6 );
  744. WRITE_GROUP( pVectors + 7, res7 );
  745. pVectors += 8;
  746. }
  747. #undef COMPUTE
  748. #undef WRITE
  749. #undef COMPUTE_STAGE1_ROW
  750. #undef COMPUTE_STAGE2_ROW
  751. #undef COMPUTE_STAGE3_ROW
  752. #undef COMPUTE_STAGE1_GROUP
  753. #undef COMPUTE_STAGE2_GROUP
  754. #undef COMPUTE_STAGE3_GROUP
  755. #undef COMPUTE_GROUP
  756. #undef WRITE_GROUP
  757. }
  758. #endif
  759. // In-place version of above. It's necessary to have this, rather than just allowing pOut and pVectors
  760. // to equal each other, because of the semantics of RESTRICT: pVectors and pOut must not be allowed
  761. // to alias. (Simply un-restricting the pointers results in very poor scheduling.)
  762. void FourVectors::TransformManyBy(FourVectors * RESTRICT pVectors, unsigned int numVectors, const matrix3x4_t& rotationMatrix )
  763. {
  764. Assert(numVectors > 0);
  765. #ifdef _X360
  766. // The really fast version of this function likes to operate on blocks of eight. So, chug through
  767. // groups of eight, then deal with any leftovers.
  768. int numVectorsRoundedToNearestEight = numVectors & (~0x07);
  769. if (numVectors >= 8)
  770. {
  771. FourVectors_TransformManyGroupsOfEightBy_InPlace(pVectors, numVectorsRoundedToNearestEight, rotationMatrix);
  772. numVectors -= numVectorsRoundedToNearestEight;
  773. pVectors += numVectorsRoundedToNearestEight;
  774. }
  775. #endif
  776. // any left over?
  777. if (numVectors > 0)
  778. {
  779. // Splat out each of the entries in the matrix to a fltx4. Do this
  780. // in the order that we will need them, to hide latency. I'm
  781. // avoiding making an array of them, so that they'll remain in
  782. // registers.
  783. fltx4 matSplat00, matSplat01, matSplat02, matSplat03, // TWELVE REGISTERS
  784. matSplat10, matSplat11, matSplat12, matSplat13,
  785. matSplat20, matSplat21, matSplat22, matSplat23;
  786. {
  787. // Load the matrix into local vectors. Sadly, matrix3x4_ts are
  788. // often unaligned. The w components will be the transpose row of
  789. // the matrix.
  790. fltx4 matCol0 = LoadUnalignedSIMD(rotationMatrix[0]);
  791. fltx4 matCol1 = LoadUnalignedSIMD(rotationMatrix[1]);
  792. fltx4 matCol2 = LoadUnalignedSIMD(rotationMatrix[2]);
  793. matSplat00 = SplatXSIMD(matCol0);
  794. matSplat01 = SplatYSIMD(matCol0);
  795. matSplat02 = SplatZSIMD(matCol0);
  796. matSplat03 = SplatWSIMD(matCol0);
  797. matSplat10 = SplatXSIMD(matCol1);
  798. matSplat11 = SplatYSIMD(matCol1);
  799. matSplat12 = SplatZSIMD(matCol1);
  800. matSplat13 = SplatWSIMD(matCol1);
  801. matSplat20 = SplatXSIMD(matCol2);
  802. matSplat21 = SplatYSIMD(matCol2);
  803. matSplat22 = SplatZSIMD(matCol2);
  804. matSplat23 = SplatWSIMD(matCol2);
  805. }
  806. do
  807. {
  808. fltx4 resultX, resultY, resultZ;
  809. // Trust in the compiler to schedule these operations correctly:
  810. resultX = MaddSIMD(pVectors->z, matSplat02, MaddSIMD(pVectors->y, matSplat01, MaddSIMD(pVectors->x, matSplat00, matSplat03)));
  811. resultY = MaddSIMD(pVectors->z, matSplat12, MaddSIMD(pVectors->y, matSplat11, MaddSIMD(pVectors->x, matSplat10, matSplat13)));
  812. resultZ = MaddSIMD(pVectors->z, matSplat22, MaddSIMD(pVectors->y, matSplat21, MaddSIMD(pVectors->x, matSplat20, matSplat23)));
  813. pVectors->x = resultX;
  814. pVectors->y = resultY;
  815. pVectors->z = resultZ;
  816. ++pVectors;
  817. --numVectors;
  818. } while(numVectors > 0);
  819. }
  820. }
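// A minimal sketch of the point made in the comment above this overload (the
// wrapper name is hypothetical): when source and destination are the same
// buffer, call the single-buffer overload; passing the same pointer twice to
// the two-buffer version would break its RESTRICT no-alias contract and trip
// its overlap assert.
static void SketchTransformBatchInPlace( FourVectors *pPositions, unsigned int nGroups, const matrix3x4_t &xform )
{
	// Correct: this overload is written with aliasing input/output in mind.
	FourVectors::TransformManyBy( pPositions, nGroups, xform );
	// Incorrect: FourVectors::TransformManyBy( pPositions, nGroups, xform, pPositions );
}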
  821. #endif
  822. // Transform many (horizontal) points in-place by a 3x4 matrix,
  823. // here already loaded onto three fltx4 registers but not transposed.
  824. // The points must be stored as 16-byte aligned. They are points
  825. // and not vectors because we assume the w-component to be 1.
  826. #ifdef _X360
  827. void TransformManyPointsBy(VectorAligned * RESTRICT pVectors, unsigned int numVectors, FLTX4 mRow0, FLTX4 mRow1, FLTX4 mRow2)
  828. {
  829. /**************************************************
  830. * Here is an elaborate and carefully scheduled *
  831. * algorithm nicked from xboxmath.inl and hacked *
  832. * up for 3x4 matrices. *
  833. **************************************************/
  834. COMPILE_TIME_ASSERT(sizeof(VectorAligned) == sizeof(XMFLOAT4)); // VectorAligned's need to be 16 bytes
  835. XMVECTOR R0[8], R1[8], R2[8];
  836. XMVECTOR vIn[8];
  837. // C_ASSERT(UnrollCount == 8);
  838. // C_ASSERT(sizeof(XMFLOAT4) == 16);
  839. Assert(pVectors);
  840. Assert(((UINT_PTR)pVectors & 3) == 0); // assert alignment
  841. UINT GroupIndex;
  842. VectorAligned * RESTRICT vCurrent = pVectors;
  843. // sentinel pointers
  844. VectorAligned * vStreamEnd, *vStreamGroupBase, *vStreamGroupEnd;
  845. {
  846. // cook up the pointers from integer math. Necessary because otherwise we LHS all over
  847. // the place. (Odd that this doesn't happen to the xbox math.)
  848. UINT_PTR InputVector = (UINT_PTR)pVectors;
  849. UINT_PTR InputStreamEnd = InputVector + numVectors * sizeof(XMFLOAT4);
  850. // compute start and end points on 128-byte alignment
  851. UINT_PTR InputStreamCGroupBase = XMMin(InputVector + (XM_CACHE_LINE_SIZE - 1), InputStreamEnd) & ~(XM_CACHE_LINE_SIZE - 1);
  852. UINT_PTR InputStreamCGroupEnd = InputStreamCGroupBase + ((InputStreamEnd - InputStreamCGroupBase) & ~(4 * XM_CACHE_LINE_SIZE - 1));
  853. vStreamEnd = (VectorAligned *)InputStreamEnd;
  854. vStreamGroupBase = (VectorAligned *)InputStreamCGroupBase;
  855. vStreamGroupEnd = (VectorAligned *)InputStreamCGroupEnd;
  856. }
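// (Worked example of the pointer math above, as a sketch: with XM_CACHE_LINE_SIZE
// of 128, InputStreamCGroupBase rounds the start of the stream up to the next
// 128-byte boundary, clamped so it never passes the end, and InputStreamCGroupEnd
// keeps only whole blocks of 4 cache lines = 512 bytes = 32 points past that base.
// The three loops below then handle the unaligned head, the bulk groups of 32,
// and the tail, in that order.)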
  857. __dcbt(0, vStreamGroupBase);
  858. __dcbt(XM_CACHE_LINE_SIZE, vStreamGroupBase);
  859. __dcbt(XM_CACHE_LINE_SIZE * 2, vStreamGroupBase);
  860. __dcbt(XM_CACHE_LINE_SIZE * 3, vStreamGroupBase);
  861. while (vCurrent < vStreamGroupBase)
  862. {
  863. fltx4 vec = __lvx(vCurrent->Base(), 0);
  864. R0[0] = __vmsum4fp(vec, mRow0);
  865. R1[0] = __vmsum4fp(vec, mRow1);
  866. R2[0] = __vmsum4fp(vec, mRow2);
  867. __stvewx(R0[0], vCurrent->Base(), 0);
  868. __stvewx(R1[0], vCurrent->Base(), 4);
  869. __stvewx(R2[0], vCurrent->Base(), 8);
  870. vCurrent++;
  871. }
  872. while (vCurrent < vStreamGroupEnd)
  873. {
  874. __dcbt(XM_CACHE_LINE_SIZE * 4, vCurrent);
  875. __dcbt(XM_CACHE_LINE_SIZE * 5, vCurrent);
  876. __dcbt(XM_CACHE_LINE_SIZE * 6, vCurrent);
  877. __dcbt(XM_CACHE_LINE_SIZE * 7, vCurrent);
  878. for (GroupIndex = 0; GroupIndex < 4; GroupIndex++)
  879. {
  880. // all kinds of LHS on this pointer. Why?
  881. VectorAligned* OutputVector = vCurrent;
  882. vIn[0] = __lvx(vCurrent->Base(), 0);
  883. vCurrent++;
  884. vIn[1] = __lvx(vCurrent->Base(), 0);
  885. vCurrent++;
  886. vIn[2] = __lvx(vCurrent->Base(), 0);
  887. vCurrent++;
  888. vIn[3] = __lvx(vCurrent->Base(), 0);
  889. vCurrent++;
  890. vIn[4] = __lvx(vCurrent->Base(), 0);
  891. vCurrent++;
  892. vIn[5] = __lvx(vCurrent->Base(), 0);
  893. vCurrent++;
  894. vIn[6] = __lvx(vCurrent->Base(), 0);
  895. vCurrent++;
  896. vIn[7] = __lvx(vCurrent->Base(), 0);
  897. vCurrent++;
  898. R0[0] = __vmsum4fp(vIn[0], mRow0);
  899. R1[0] = __vmsum4fp(vIn[0], mRow1);
  900. R2[0] = __vmsum4fp(vIn[0], mRow2);
  901. R0[1] = __vmsum4fp(vIn[1], mRow0);
  902. R1[1] = __vmsum4fp(vIn[1], mRow1);
  903. R2[1] = __vmsum4fp(vIn[1], mRow2);
  904. R0[2] = __vmsum4fp(vIn[2], mRow0);
  905. R1[2] = __vmsum4fp(vIn[2], mRow1);
  906. R2[2] = __vmsum4fp(vIn[2], mRow2);
  907. R0[3] = __vmsum4fp(vIn[3], mRow0);
  908. R1[3] = __vmsum4fp(vIn[3], mRow1);
  909. R2[3] = __vmsum4fp(vIn[3], mRow2);
  910. R0[4] = __vmsum4fp(vIn[4], mRow0);
  911. R1[4] = __vmsum4fp(vIn[4], mRow1);
  912. R2[4] = __vmsum4fp(vIn[4], mRow2);
  913. R0[5] = __vmsum4fp(vIn[5], mRow0);
  914. R1[5] = __vmsum4fp(vIn[5], mRow1);
  915. R2[5] = __vmsum4fp(vIn[5], mRow2);
  916. R0[6] = __vmsum4fp(vIn[6], mRow0);
  917. R1[6] = __vmsum4fp(vIn[6], mRow1);
  918. R2[6] = __vmsum4fp(vIn[6], mRow2);
  919. R0[7] = __vmsum4fp(vIn[7], mRow0);
  920. R1[7] = __vmsum4fp(vIn[7], mRow1);
  921. R2[7] = __vmsum4fp(vIn[7], mRow2);
  922. __stvewx(R0[0], OutputVector, 0);
  923. __stvewx(R1[0], OutputVector, 4);
  924. __stvewx(R2[0], OutputVector, 8);
  925. OutputVector++;
  926. __stvewx(R0[1], OutputVector, 0);
  927. __stvewx(R1[1], OutputVector, 4);
  928. __stvewx(R2[1], OutputVector, 8);
  929. OutputVector++;
  930. __stvewx(R0[2], OutputVector, 0);
  931. __stvewx(R1[2], OutputVector, 4);
  932. __stvewx(R2[2], OutputVector, 8);
  933. OutputVector++;
  934. __stvewx(R0[3], OutputVector, 0);
  935. __stvewx(R1[3], OutputVector, 4);
  936. __stvewx(R2[3], OutputVector, 8);
  937. OutputVector++;
  938. __stvewx(R0[4], OutputVector, 0);
  939. __stvewx(R1[4], OutputVector, 4);
  940. __stvewx(R2[4], OutputVector, 8);
  941. OutputVector++;
  942. __stvewx(R0[5], OutputVector, 0);
  943. __stvewx(R1[5], OutputVector, 4);
  944. __stvewx(R2[5], OutputVector, 8);
  945. OutputVector++;
  946. __stvewx(R0[6], OutputVector, 0);
  947. __stvewx(R1[6], OutputVector, 4);
  948. __stvewx(R2[6], OutputVector, 8);
  949. OutputVector++;
  950. __stvewx(R0[7], OutputVector, 0);
  951. __stvewx(R1[7], OutputVector, 4);
  952. __stvewx(R2[7], OutputVector, 8);
  953. OutputVector++;
  954. }
  955. }
  956. while (vCurrent < vStreamEnd)
  957. {
  958. vIn[0] = __lvx(vCurrent->Base(), 0);
  959. R0[0] = __vmsum4fp(vIn[0], mRow0);
  960. R1[0] = __vmsum4fp(vIn[0], mRow1);
  961. R2[0] = __vmsum4fp(vIn[0], mRow2);
  962. __stvewx(R0[0], vCurrent->Base(), 0);
  963. __stvewx(R1[0], vCurrent->Base(), 4);
  964. __stvewx(R2[0], vCurrent->Base(), 8);
  965. vCurrent++;
  966. }
  967. }
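// A minimal usage sketch for TransformManyPointsBy (the wrapper name is
// hypothetical): the three matrix rows are loaded onto registers once,
// untransposed, exactly as the comment above the function requires. Each
// point's fourth float must already be 1 so that the row's translation term
// is picked up by the dot product.
static void SketchTransformPointsInPlace( VectorAligned *pPoints, unsigned int nPoints, const matrix3x4_t &transform )
{
	fltx4 row0 = LoadUnalignedSIMD( transform[0] );
	fltx4 row1 = LoadUnalignedSIMD( transform[1] );
	fltx4 row2 = LoadUnalignedSIMD( transform[2] );
	TransformManyPointsBy( pPoints, nPoints, row0, row1, row2 );
}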
  968. #endif