Team Fortress 2 Source Code as on 22/4/2020
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

321 lines
10 KiB

  1. //========= Copyright Valve Corporation, All rights reserved. ============//
  2. //
  3. // Purpose:
  4. //
  5. //===========================================================================//
  6. #include "tier0/platform.h"
  7. #include "tier0/progressbar.h"
  8. #include "bitmap/float_bm.h"
  9. #include "mathlib/mathlib.h"
  10. #include "tier2/tier2.h"
  11. #include "tier0/memdbgon.h"
  12. #include "mathlib/ssemath.h"
  13. #ifdef _X360
  14. #include "xbox/xbox_console.h"
  15. #endif
// Number of FourVectors groups (four points each) processed per time step by SIMDTest().
#define PROBLEM_SIZE 1000
// Number of time steps run by SIMDTest(); also the loop count used by SSEClassTest().
#define N_ITERS 100000
// Uncomment to make SIMDTest() print, via Warning(), a C array initializer
// ("float testOutput[N_ITERS][4][3]") holding one recorded group of points per time step.
//#define RECORD_OUTPUT
// Point positions operated on by SIMDTest(), one FourVectors (four points) per entry.
static FourVectors g_XYZ[PROBLEM_SIZE];
// Per-point start times, four lanes per entry, parallel to g_XYZ[].
static fltx4 g_CreationTime[PROBLEM_SIZE];
  21. bool SIMDTest()
  22. {
  23. const Vector StartPnt(0,0,0);
  24. const Vector MidP(0,0,100);
  25. const Vector EndPnt(100,0,50);
  26. // This app doesn't go through regular engine init, so init FPU/VPU math behaviour here:
  27. SetupFPUControlWord();
  28. TestVPUFlags();
  29. // Initialize g_XYZ[] and g_CreationTime[]
  30. SeedRandSIMD(1987301);
  31. for (int i = 0;i < PROBLEM_SIZE;i++)
  32. {
  33. float fourStartTimes[4];
  34. Vector fourPoints[4];
  35. Vector offset;
  36. for (int j = 0;j < 4;j++)
  37. {
  38. float t = (j + 4 * i) / (4.0f * (PROBLEM_SIZE - 1));
  39. fourStartTimes[j] = t;
  40. fourPoints[j] = StartPnt + t*( EndPnt - StartPnt );
  41. offset.Random( -10.0f, +10.0f );
  42. fourPoints[j] += offset;
  43. }
  44. g_XYZ[i].LoadAndSwizzle( fourPoints[0], fourPoints[1], fourPoints[2], fourPoints[3] );
  45. g_CreationTime[i] = LoadUnalignedSIMD( fourStartTimes );
  46. }
  47. #ifdef RECORD_OUTPUT
  48. char outputBuffer[1024];
  49. Q_snprintf( outputBuffer, sizeof( outputBuffer ), "float testOutput[%d][4][3] = {\n", N_ITERS );
  50. Warning(outputBuffer);
  51. #endif // RECORD_OUTPUT
  52. double STime=Plat_FloatTime();
  53. bool bChangedSomething = false;
  54. for(int i=0;i<N_ITERS;i++)
  55. {
  56. float t=i*(1.0/N_ITERS);
  57. FourVectors * __restrict pXYZ = g_XYZ;
  58. fltx4 * __restrict pCreationTime = g_CreationTime;
  59. fltx4 CurTime = ReplicateX4( t );
  60. fltx4 TimeScale = ReplicateX4( 1.0/(max(0.001, 1.0 ) ) );
  61. // calculate radius spline
  62. bool bConstantRadius = true;
  63. fltx4 Rad0=ReplicateX4(2.0);
  64. fltx4 Radm=Rad0;
  65. fltx4 Rad1=Rad0;
  66. fltx4 RadmMinusRad0=SubSIMD( Radm, Rad0);
  67. fltx4 Rad1MinusRadm=SubSIMD( Rad1, Radm);
  68. fltx4 SIMDMinDist=ReplicateX4( 2.0 );
  69. fltx4 SIMDMinDist2=ReplicateX4( 2.0*2.0 );
  70. fltx4 SIMDMaxDist=MaxSIMD( Rad0, MaxSIMD( Radm, Rad1 ) );
  71. fltx4 SIMDMaxDist2=MulSIMD( SIMDMaxDist, SIMDMaxDist);
  72. FourVectors StartP;
  73. StartP.DuplicateVector( StartPnt );
  74. FourVectors MiddleP;
  75. MiddleP.DuplicateVector( MidP );
  76. // form delta terms needed for quadratic bezier
  77. FourVectors Delta0;
  78. Delta0.DuplicateVector( MidP-StartPnt );
  79. FourVectors Delta1;
  80. Delta1.DuplicateVector( EndPnt-MidP );
  81. int nLoopCtr = PROBLEM_SIZE;
  82. do
  83. {
  84. fltx4 TScale=MinSIMD(
  85. Four_Ones,
  86. MulSIMD( TimeScale, SubSIMD( CurTime, *pCreationTime ) ) );
  87. // bezier(a,b,c,t)=lerp( lerp(a,b,t),lerp(b,c,t),t)
  88. FourVectors L0 = Delta0;
  89. L0 *= TScale;
  90. L0 += StartP;
  91. FourVectors L1= Delta1;
  92. L1 *= TScale;
  93. L1 += MiddleP;
  94. FourVectors Center = L1;
  95. Center -= L0;
  96. Center *= TScale;
  97. Center += L0;
  98. FourVectors pts_original = *(pXYZ);
  99. FourVectors pts = pts_original;
  100. pts -= Center;
  101. // calculate radius at the point. !!speed!! - use special case for constant radius
  102. fltx4 dist_squared= pts * pts;
  103. fltx4 TooFarMask = CmpGtSIMD( dist_squared, SIMDMaxDist2 );
  104. if ( ( !bConstantRadius) && ( ! IsAnyNegative( TooFarMask ) ) )
  105. {
  106. // need to calculate and adjust for true radius =- we've only trivially rejected note
  107. // voodoo here - we update simdmaxdist for true radius, but not max dist^2, since
  108. // that's used only for the trivial reject case, which we've already done
  109. fltx4 R0=AddSIMD( Rad0, MulSIMD( RadmMinusRad0, TScale ) );
  110. fltx4 R1=AddSIMD( Radm, MulSIMD( Rad1MinusRadm, TScale ) );
  111. SIMDMaxDist = AddSIMD( R0, MulSIMD( SubSIMD( R1, R0 ), TScale) );
  112. // now that we know the true radius, update our mask
  113. TooFarMask = CmpGtSIMD( dist_squared, MulSIMD( SIMDMaxDist, SIMDMaxDist ) );
  114. }
  115. fltx4 TooCloseMask = CmpLtSIMD( dist_squared, SIMDMinDist2 );
  116. fltx4 NeedAdjust = OrSIMD( TooFarMask, TooCloseMask );
  117. if ( IsAnyNegative( NeedAdjust ) ) // any out of bounds?
  118. {
  119. // change squared distance into approximate rsqr root
  120. fltx4 guess=ReciprocalSqrtEstSIMD(dist_squared);
  121. // newton iteration for 1/sqrt(x) : y(n+1)=1/2 (y(n)*(3-x*y(n)^2));
  122. guess=MulSIMD(guess,SubSIMD(Four_Threes,MulSIMD(dist_squared,MulSIMD(guess,guess))));
  123. guess=MulSIMD(Four_PointFives,guess);
  124. pts *= guess;
  125. FourVectors clamp_far=pts;
  126. clamp_far *= SIMDMaxDist;
  127. clamp_far += Center;
  128. FourVectors clamp_near=pts;
  129. clamp_near *= SIMDMinDist;
  130. clamp_near += Center;
  131. pts.x = MaskedAssign( TooCloseMask, clamp_near.x, MaskedAssign( TooFarMask, clamp_far.x, pts_original.x ));
  132. pts.y = MaskedAssign( TooCloseMask, clamp_near.y, MaskedAssign( TooFarMask, clamp_far.y, pts_original.y ));
  133. pts.z = MaskedAssign( TooCloseMask, clamp_near.z, MaskedAssign( TooFarMask, clamp_far.z, pts_original.z ));
  134. *(pXYZ) = pts;
  135. bChangedSomething = true;
  136. }
  137. #ifdef RECORD_OUTPUT
  138. if (nLoopCtr == 257)
  139. {
  140. Q_snprintf( outputBuffer, sizeof( outputBuffer ), "/*%04d:*/ { {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e}, {%+14e,%+14e,%+14e} },\n", i,
  141. pXYZ->X(0), pXYZ->Y(0), pXYZ->Z(0),
  142. pXYZ->X(1), pXYZ->Y(1), pXYZ->Z(1),
  143. pXYZ->X(2), pXYZ->Y(2), pXYZ->Z(2),
  144. pXYZ->X(3), pXYZ->Y(3), pXYZ->Z(3));
  145. Warning(outputBuffer);
  146. }
  147. #endif // RECORD_OUTPUT
  148. ++pXYZ;
  149. ++pCreationTime;
  150. } while ( --nLoopCtr );
  151. }
  152. double ETime=Plat_FloatTime()-STime;
  153. #ifdef RECORD_OUTPUT
  154. Q_snprintf( outputBuffer, sizeof( outputBuffer ), " };\n" );
  155. Warning(outputBuffer);
  156. #endif // RECORD_OUTPUT
  157. printf("elapsed time=%f p/s=%f\n",ETime, (4.0*PROBLEM_SIZE*N_ITERS)/ETime );
  158. return bChangedSomething;
  159. }
  160. #ifdef _X360
  161. __declspec(passinreg) struct float4
  162. {
  163. operator __vector4 () const { return vmx; }
  164. __vector4 vmx;
  165. };
  166. void OctoberXDKCompilerIssueTestCode( const fltx4 & val, fltx4 * out )
  167. {
  168. // UNDONE: This code demonstrates serious 360 compiler issues. XBox Developer Support has been contacted.
  169. // The assembly contains tons of useless instructions (vector stores and supporting integer math), even in the
  170. // below code - no use of pointers or static constants, no wrapper layers on top of the vector intrinsics.
  171. // If/when the compiler issue is resolved, other known issues are:
  172. // - pass vector params by const reference
  173. // - avoid putting __vector4 in a union or an array
  174. // - avoid default constructors, return constructed objects directly ("return VecClass(__vector4Val);")
  175. #define DECL_ASS( _var_, _val_ ) fltx4 _var_ = _val_
  176. //#define DECL_ASS( _var_, _val_ ) float4 _var_; _var_.vmx = _val_
  177. //#define DECL_ASS( _var_, _val_ ) float4 _var_( _val_ )
  178. DECL_ASS( resultx, Four_Zeros ); DECL_ASS( resulty, Four_Zeros ); DECL_ASS( resultz, Four_Zeros );
  179. DECL_ASS( CurTime, __vmulfp( val, Four_PointFives ) );
  180. DECL_ASS( TimeScale, val );
  181. //fltx4 *pCreationTime = g_CreationTime;
  182. DECL_ASS( Delta0x, val ); DECL_ASS( Delta0y, val ); DECL_ASS( Delta0z, val );
  183. DECL_ASS( Delta1x, __vaddfp(Delta0x, Delta0x) ); DECL_ASS( Delta1y, __vaddfp(Delta0y, Delta0y) ); DECL_ASS( Delta1z, __vaddfp(Delta0z, Delta0z) );
  184. DECL_ASS( StartPx, __vaddfp(Delta0x, Delta0x) ); DECL_ASS( StartPy, __vaddfp(Delta0y, Delta0y) ); DECL_ASS( StartPz, __vaddfp(Delta0z, Delta0z) );
  185. DECL_ASS( MiddlePx, __vaddfp(StartPx, StartPx) ); DECL_ASS( MiddlePy, __vaddfp(StartPy, StartPy) ); DECL_ASS( MiddlePz, __vaddfp(StartPz, StartPz) );
  186. for (int i = 0;i < 1000;i++)
  187. {
  188. DECL_ASS( TScale, __vsubfp( CurTime, resultx ) );//*pCreationTime );
  189. TScale = __vmulfp( TScale, TimeScale );
  190. TScale = __vminfp( TScale, resulty );//Four_Ones );
  191. //resultx = __vaddfp( resultx, TScale );
  192. //resulty = __vaddfp( resulty, TScale );
  193. //resultz = __vaddfp( resultz, TScale );
  194. DECL_ASS( L0x, Delta0x ); DECL_ASS( L0y, Delta0y ); DECL_ASS( L0z, Delta0z );
  195. L0x = __vmulfp(L0x,TScale); L0y = __vmulfp(L0y,TScale); L0z = __vmulfp(L0z,TScale);
  196. L0x = __vaddfp(StartPx,L0x); L0y = __vaddfp(StartPy,L0y); L0z = __vaddfp(StartPz,L0z);
  197. DECL_ASS( L1x, Delta1x ); DECL_ASS( L1y, Delta1y ); DECL_ASS( L1z, Delta1z );
  198. L1x = __vmulfp(L1x,TScale); L1y = __vmulfp(L1y,TScale); L1z = __vmulfp(L1z,TScale);
  199. L1x = __vaddfp(MiddlePx,L1x); L1y = __vaddfp(MiddlePy,L1y); L1z = __vaddfp(MiddlePz,L1z);
  200. L0x = __vaddfp(L0x,L1x); L0y = __vaddfp(L0y,L1y); L0z = __vaddfp(L0z,L1z);
  201. resultx = __vaddfp( resultx, L0x );
  202. resulty = __vaddfp( resulty, L0y );
  203. resultz = __vaddfp( resultz, L0z );
  204. //pCreationTime++;
  205. }
  206. out[0] = resultx;
  207. out[1] = resulty;
  208. out[2] = resultz;
  209. }
  210. #else // _X360
  211. void
  212. SSEClassTest( const fltx4 & val, fltx4 & out )
  213. {
  214. fltx4 result = Four_Zeros;
  215. for (int i = 0;i < N_ITERS;i++)
  216. {
  217. result = SubSIMD( val, result );
  218. result = MulSIMD( val, result );
  219. result = AddSIMD( val, result );
  220. result = MinSIMD( val, result );
  221. }
  222. FourVectors result4; result4.x = result; result4.y = result; result4.z = result;
  223. for (int i = 0;i < N_ITERS;i++)
  224. {
  225. result4 *= result4;
  226. result4 += result4;
  227. result4 *= result4;
  228. result4 += result4;
  229. }
  230. result = result4*result4;
  231. out = result;
  232. }
  233. #endif // !_X360
  234. int main(int argc,char **argv)
  235. {
  236. #ifndef _X360
  237. // UNDONE: InitCommandLineProgram needs fixing for 360 (if we want to make lots of new 360 executables)
  238. InitCommandLineProgram( argc, argv );
  239. // This function is useful for inspecting compiler output
  240. fltx4 result;
  241. SSEClassTest( Four_PointFives, result );
  242. printf("(%f,%f,%f,%f)\n", SubFloat( result, 0 ), SubFloat( result, 1 ), SubFloat( result, 2 ), SubFloat( result, 3 ) );
  243. #else // _X360
  244. // Wait for VXConsole, so that all debug output goes there
  245. XBX_InitConsoleMonitor(true);
  246. // This function is useful for inspecting compiler output
  247. FourVectors result;
  248. OctoberXDKCompilerIssueTestCode( Four_PointFives, (fltx4 *)&result );
  249. printf("(%f,%f,%f,%f)\n", result.X(0), result.X(1), result.X(2), result.X(3));
  250. printf("(%f,%f,%f,%f)\n", result.Y(0), result.Y(1), result.Y(2), result.Y(3));
  251. printf("(%f,%f,%f,%f)\n", result.Z(0), result.Z(1), result.Z(2), result.Z(3));
  252. #endif // _X360
  253. // Run the perf. test
  254. SIMDTest();
  255. return 0;
  256. }