Counter Strike : Global Offensive Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

396 lines
12 KiB

  1. //============ Copyright (c) Valve Corporation, All rights reserved. ============
  2. //
  3. // A non-trivial number of Valve customers hit performance problems because their CPUs overheat
  4. // and are thermally throttled. While thermal throttling is better than melting it is still a
  5. // hardware flaw and it leads to a bad user experience. In some cases the CPU frequency drops
  6. // (constantly or occasionally) by 50-75%, leading to equal or greater framerate drops.
  7. //
  8. // This is equivalent to a car that goes into limp-home mode to let it continue running after the
  9. // radiator fails -- it's better than destroying the engine, but clearly it needs to be fixed.
  10. //
  11. // When CPU monitoring is enabled a bunch of background threads are created that wake up at
  12. // the set frequency, spin in a loop to measure the actual usable CPU frequency, then sleep again.
  13. // A delay loop is used to measure the frequency because this is portable (it works for Intel
  14. // and AMD and handles both frequency throttling and duty-cycle reductions) and it doesn't
  15. // require administrator privileges. This technique has been used in VTrace for a while.
  16. //
  17. // This code doesn't use normal worker threads because of the special purpose nature of this
  18. // work. The threads are started on demand and are never terminated, in order to simplify
  19. // the code.
  20. //
  21. //===============================================================================
  22. #include "pch_tier0.h"
  23. #include "tier0/cpumonitoring.h"
  24. #ifdef PLATFORM_WINDOWS_PC32
  25. #include "tier0/threadtools.h"
  26. #define NOMINMAX
  27. #include <windows.h>
  28. #include "PowrProf.h"
  29. #include <algorithm>
  30. #pragma comment(lib, "PowrProf.lib")
  31. // This lock protects s_results and s_nDelayMilliseconds.
  32. static CThreadMutex s_lock;
  33. static CPUFrequencyResults s_results;
  34. static unsigned s_nDelayMilliseconds;
  35. // Has monitoring been enabled? If not measurements may still continue
  36. // if kDelayMillisecondsWhenDisabled is non-zero.
  37. static bool s_fEnabled = false;
  38. // This is the delay between measurements when measurements are 'disabled'. If it
  39. // is zero then the measurements are truly disabled.
  40. const unsigned kDelayMillisecondsWhenDisabled = 5000;
  41. // Delay before first measurement
  42. const unsigned kFirstInterval = 500;
  43. const unsigned kPostMeasureInterval = 5;
  44. const unsigned kMinimumDelay = 300;
  45. const int nMaxCPUs = 32;
  46. // This loop spins spinCount times and should take about 50 times spinCount
  47. // cycles to execute. This should be true on any reasonable modern processor
  48. // since the latency of integer add is almost always one cycle.
  49. // The Xbox 360 and PS3 CPUs are the one known exception but this code will
  50. // never run on them.
  51. static void SpinALot( int spinCount )
  52. {
  53. __asm
  54. {
  55. mov ecx, spinCount
  56. start:
  57. add eax, eax
  58. add eax, eax
  59. add eax, eax
  60. add eax, eax
  61. add eax, eax
  62. add eax, eax
  63. add eax, eax
  64. add eax, eax
  65. add eax, eax
  66. add eax, eax
  67. add eax, eax
  68. add eax, eax
  69. add eax, eax
  70. add eax, eax
  71. add eax, eax
  72. add eax, eax
  73. add eax, eax
  74. add eax, eax
  75. add eax, eax
  76. add eax, eax
  77. add eax, eax
  78. add eax, eax
  79. add eax, eax
  80. add eax, eax
  81. add eax, eax
  82. add eax, eax
  83. add eax, eax
  84. add eax, eax
  85. add eax, eax
  86. add eax, eax
  87. add eax, eax
  88. add eax, eax
  89. add eax, eax
  90. add eax, eax
  91. add eax, eax
  92. add eax, eax
  93. add eax, eax
  94. add eax, eax
  95. add eax, eax
  96. add eax, eax
  97. add eax, eax
  98. add eax, eax
  99. add eax, eax
  100. add eax, eax
  101. add eax, eax
  102. add eax, eax
  103. add eax, eax
  104. add eax, eax
  105. add eax, eax
  106. add eax, eax
  107. sub ecx,1
  108. jne start
  109. }
  110. }
  111. static LARGE_INTEGER frequency;
  112. static LARGE_INTEGER base;
  113. static void InitializeGetTime()
  114. {
  115. QueryPerformanceFrequency( &frequency );
  116. QueryPerformanceCounter( &base );
  117. }
  118. static double GetTime()
  119. {
  120. LARGE_INTEGER value;
  121. QueryPerformanceCounter( &value );
  122. // Subtracting off the base time gives us a zero point at application start up and
  123. // gives us more precision.
  124. return ( value.QuadPart - base.QuadPart ) / double( frequency.QuadPart );
  125. }
  126. static float GetFrequency()
  127. {
  128. double start = GetTime();
  129. // This should cause a delay of 500,000 cycles (50 * spinCount) which should be a
  130. // fraction of a millisecond on any reasonable processor, thus ensuring that the
  131. // sampling interrupt will not be hit too frequently.
  132. SpinALot( 10000 );
  133. double elapsed = GetTime() - start;
  134. double frequency = ( 500000 / elapsed ) / 1e9;
  135. return (float)frequency;
  136. }
  137. // This semaphore is used to release all of the measurement threads simultaneously.
  138. static HANDLE g_releaseSemaphore;
  139. // This semaphore is used to wait for all of the measurement threads to complete.
  140. static HANDLE g_workCompleteSemaphore;
  141. static DWORD g_numCPUs;
  142. // This function measures the CPU frequency by doing repeated integer adds.
  143. // It measures it multiple times and records the highest frequency -- the
  144. // assumption is that any given test might be slowed by interrupts or
  145. // context switches so the fastest run should indicate the true performance.
  146. static float GetSampledFrequency( int iterations )
  147. {
  148. float maxFrequency = 0.0;
  149. for ( int i = 0; i < iterations; ++i )
  150. {
  151. float frequency = GetFrequency();
  152. if ( frequency > maxFrequency )
  153. maxFrequency = frequency;
  154. }
  155. return maxFrequency;
  156. }
  157. // The measured frequency of all of the threads
  158. static float s_frequency[ nMaxCPUs ];
  159. // Measurement thread, designed to be one per core.
  160. static DWORD WINAPI MeasureThread( LPVOID vThreadNum )
  161. {
  162. ThreadSetDebugName( "CPUMonitoringMeasureThread" );
  163. int threadNum = (int)vThreadNum;
  164. for ( ; ; )
  165. {
  166. // Wait until the MCP says it's time to wake up and measure CPU speed
  167. WaitForSingleObject( g_releaseSemaphore, INFINITE );
  168. // Seven seems like a good number of times to measure the frequency -- it makes
  169. // it likely that a couple of the tests will not hit any interrupts.
  170. float frequency = GetSampledFrequency( 7 );
  171. s_frequency[ threadNum ] = frequency;
  172. // Tell the heartbeat thread that one thread has completed.
  173. ReleaseSemaphore( g_workCompleteSemaphore, 1, NULL );
  174. }
  175. // This will never be hit.
  176. return 0;
  177. }
  178. /*
  179. Note that this structure definition was accidentally omitted from WinNT.h. This error will be corrected in the future. In the meantime, to compile your application, include the structure definition contained in this topic in your source code.
  180. */
  181. typedef struct _PROCESSOR_POWER_INFORMATION {
  182. ULONG Number;
  183. ULONG MaxMhz;
  184. ULONG CurrentMhz;
  185. ULONG MhzLimit;
  186. ULONG MaxIdleState;
  187. ULONG CurrentIdleState;
  188. } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;
  189. // Master control thread to periodically wake the measurement threads.
  190. static DWORD WINAPI HeartbeatThread( LPVOID )
  191. {
  192. ThreadSetDebugName( "CPUMonitoringHeartbeatThread" );
  193. // Arbitrary/hacky time to wait for results to become available.
  194. Sleep( kFirstInterval );
  195. for ( ; ; )
  196. {
  197. unsigned delay;
  198. {
  199. // Read and write all the state that is shared with the main thread while holding the lock.
  200. AUTO_LOCK( s_lock );
  201. delay = s_nDelayMilliseconds;
  202. }
  203. // If monitoring is currently enabled then do the work.
  204. if ( delay )
  205. {
  206. // First ask Windows what the processor speed is -- this *might* reflect
  207. // some types of thermal throttling, but doesn't seem to.
  208. PROCESSOR_POWER_INFORMATION processorInfo[ nMaxCPUs ] = {};
  209. CallNtPowerInformation( ProcessorInformation, NULL, 0, &processorInfo, sizeof(processorInfo[0]) * g_numCPUs );
  210. ULONG MaxMHz = processorInfo[ 0 ].MaxMhz;
  211. ULONG LimitMHz = processorInfo[ 0 ].MhzLimit;
  212. ULONG MinCurrentMHz = processorInfo[ 0 ].CurrentMhz;
  213. ULONG MaxCurrentMHz = processorInfo[ 0 ].CurrentMhz;
  214. for ( DWORD i = 0; i < g_numCPUs; ++i )
  215. {
  216. MinCurrentMHz = std::min( MinCurrentMHz, processorInfo[ i ].CurrentMhz );
  217. MaxCurrentMHz = std::max( MaxCurrentMHz, processorInfo[ i ].CurrentMhz );
  218. MaxMHz = std::max( MaxMHz, processorInfo[ i ].MaxMhz );
  219. LimitMHz = std::max( LimitMHz, processorInfo[ i ].MhzLimit );
  220. }
  221. // This will wake up all of the worker threads. It is possible that some of the
  222. // threads will take a long time to wake up in which case the same thread might
  223. // wake up multiple times but this should be harmless.
  224. ReleaseSemaphore( g_releaseSemaphore, g_numCPUs, NULL );
  225. // Wait until all of the measurement threads should have run.
  226. // This is just to avoid having the heartbeat thread fighting for cycles
  227. // but isn't strictly necessary.
  228. Sleep( kPostMeasureInterval );
  229. // Wait for all of the worker threads to finish.
  230. for ( DWORD i = 0; i < g_numCPUs; ++i )
  231. {
  232. WaitForSingleObject( g_workCompleteSemaphore, INFINITE );
  233. }
  234. // Find the minimum and maximum measured frequencies.
  235. float minActualFreq = s_frequency[ 0 ];
  236. float maxActualFreq = s_frequency[ 0 ];
  237. for ( DWORD i = 1; i < g_numCPUs; ++i )
  238. {
  239. minActualFreq = std::min( minActualFreq, s_frequency[ i ] );
  240. maxActualFreq = std::max( maxActualFreq, s_frequency[ i ] );
  241. }
  242. {
  243. // Read and write all the state that is shared with the main thread while holding the lock.
  244. AUTO_LOCK( s_lock );
  245. float freqPercentage = maxActualFreq / (MaxCurrentMHz * 1e-5f);
  246. const float kFudgeFactor = 1.03f; // Make results match reality better
  247. s_results.m_timeStamp = Plat_FloatTime();
  248. s_results.m_GHz = maxActualFreq * kFudgeFactor;
  249. s_results.m_percentage = freqPercentage * kFudgeFactor;
  250. if ( s_results.m_lowestPercentage == 0 || s_results.m_percentage < s_results.m_lowestPercentage )
  251. s_results.m_lowestPercentage = s_results.m_percentage;
  252. // delay may get set to zero at this point
  253. delay = s_nDelayMilliseconds;
  254. }
  255. Sleep( delay );
  256. }
  257. else
  258. {
  259. // If there is nothing to do then just sleep for a bit.
  260. Sleep( kMinimumDelay );
  261. }
  262. }
  263. // This will never be hit.
  264. return 0;
  265. }
  266. PLATFORM_INTERFACE CPUFrequencyResults GetCPUFrequencyResults( bool fGetDisabledResults )
  267. {
  268. AUTO_LOCK( s_lock );
  269. if ( s_fEnabled || fGetDisabledResults )
  270. {
  271. // Return actual results.
  272. return s_results;
  273. }
  274. else
  275. {
  276. // Return zero initialized struct.
  277. return CPUFrequencyResults();
  278. }
  279. }
  280. PLATFORM_INTERFACE void SetCPUMonitoringInterval( unsigned nDelayMilliseconds )
  281. {
  282. static bool s_initialized = false;
  283. // Clamp the delay to a minimum value to save users from running the
  284. // measurements too frequently.
  285. if ( nDelayMilliseconds && nDelayMilliseconds <= kMinimumDelay )
  286. nDelayMilliseconds = kMinimumDelay;
  287. // If not yet initialized then do one-time thread initialization
  288. if ( !s_initialized )
  289. {
  290. s_initialized = true;
  291. InitializeGetTime();
  292. g_releaseSemaphore = CreateSemaphore( NULL, 0, 1000, NULL );
  293. if ( !g_releaseSemaphore )
  294. return;
  295. g_workCompleteSemaphore = CreateSemaphore( NULL, 0, 1000, NULL );
  296. if ( !g_workCompleteSemaphore )
  297. return;
  298. SYSTEM_INFO systemInfo;
  299. GetSystemInfo( &systemInfo );
  300. g_numCPUs = systemInfo.dwNumberOfProcessors;
  301. if ( g_numCPUs > nMaxCPUs )
  302. g_numCPUs = nMaxCPUs;
  303. // Create n threads, affinitize them, and set them to high priority. This will (mostly)
  304. // ensure that they will run promptly on a specific CPU.
  305. for ( DWORD i = 0; i < g_numCPUs; ++i )
  306. {
  307. HANDLE thread = CreateThread( NULL, 0x10000, MeasureThread, (void*)i, 0, NULL );
  308. SetThreadAffinityMask( thread, 1u << i );
  309. SetThreadPriority( thread, THREAD_PRIORITY_HIGHEST );
  310. }
  311. // Create the thread which tells the measurement threads to wake up periodically
  312. CreateThread( NULL, 0x10000, HeartbeatThread, NULL, 0, NULL );
  313. }
  314. AUTO_LOCK( s_lock );
  315. if ( nDelayMilliseconds && s_nDelayMilliseconds == 0 )
  316. {
  317. // If we are enabling/re-enabling then reset the stats.
  318. memset( &s_results, 0, sizeof(s_results) );
  319. }
  320. // Set the specified delay time or 5,000 if it is disabled.
  321. s_nDelayMilliseconds = nDelayMilliseconds ? nDelayMilliseconds : kDelayMillisecondsWhenDisabled;
  322. s_fEnabled = nDelayMilliseconds != 0;
  323. }
  324. class CPUMonitoringStarter
  325. {
  326. public:
  327. CPUMonitoringStarter()
  328. {
  329. // Start up the disabled CPU monitoring at low frequency.
  330. if ( kDelayMillisecondsWhenDisabled )
  331. SetCPUMonitoringInterval( 0 );
  332. }
  333. } s_CPUMonitoringStarter;
  334. #else
  335. PLATFORM_INTERFACE CPUFrequencyResults GetCPUFrequencyResults(bool)
  336. {
  337. // Return zero initialized results which means no data available.
  338. CPUFrequencyResults results = {};
  339. return results;
  340. }
  341. PLATFORM_INTERFACE void SetCPUMonitoringInterval( unsigned nDelayMilliseconds )
  342. {
  343. NOTE_UNUSED( nDelayMilliseconds );
  344. }
  345. #endif