/*++ Copyright (c) 1999-2001 Microsoft Corporation. All Rights Reserved. Module Name: cpu.h Abstract: This module contains code for processor identification, as well as code for calibrating the processor clock speed. Author: Joseph Ballantyne Environment: Kernel Mode Revision History: --*/ // Processor related defines and macros. // Supported architectures. #define X86 1 // Supported manufacturers. #define INTEL 1 #define AMD 2 // Feature bit locations in the CPU id instruction supported features dword. #define FXSR (1<<24) #define MMX (1<<23) #define APIC (1<<9) #define MSR (1<<5) #define TSC (1<<4) #define FPU (1<<0) #define OSFXSR 0x200 #define MAXTIMERREADSLOTS 33 #define LIMITTOTALTIMERREADS 64 #define DEFAULTMAXTIMERCOUNT 0x12 // Instructions not supported by the inline assembler. #define cpuid __asm _emit 0x0f __asm _emit 0xa2 #define rdtsc __asm _emit 0x0f __asm _emit 0x31 // Macro to force serialization of instruction execution. // This is the fastest possible way to force instruction serialization for // Pentium processors. It may not be the fastest on PII, PIII or P4 // processors. #define SerializeExecution() \ __asm { \ __asm sub esp,8 \ __asm sidt [esp] \ __asm lidt [esp] \ __asm add esp,8 \ } // Structures for supporting processor identification and clock speed calibration. // The CPUINFO structure is used to store/access the results of the CPU ID instruction. typedef struct { ULONG eax; ULONG ebx; ULONG edx; ULONG ecx; } CPUINFO; // The TimerReadInfo structure is used by the cpu clock speed calibration code. // Note that we want this structure to be 32 byte aligned. // This allows us to force the whole thing into cache by just touching // the first element. We depend on this in our code for getting // stuff into the cache, so don't change it. typedef struct { ULONGLONG firstreadtimestamp; ULONGLONG lastreadtimestamp; ULONG count; ULONG value; ULONGLONG align; } TimerReadInfo; // These globals store information about the processor we are running on. 
ULONG CPUArchitecture=0; ULONG CPUManufacturer=0; LONG CPUFamily=0xffffffff; LONG CPUModel=0xffffffff; LONG CPUStepping=0xffffffff; ULONGLONG CPUFeatures=0; // TimerReadInfo stores timing meassurements as they are taken. TimerReadInfo TimerRead[MAXTIMERREADSLOTS]; #pragma warning ( disable : 4035 ) ULONG CpuIdOk ( VOID ) /*++ Routine Description: Determines if the processor supports running the CPUID instruction. It does so by checking if the state of bit 21 of the eflags register can be changed. Arguments: None. Return Value: FALSE (0) if processor does not support the CPUID instruction. TRUE (bit 21 set to 1) if processor does support CPUID. --*/ { __asm { // First save eflags. pushfd // Load eflags into eax. mov eax, [esp] // Flip state of bit 21 and put it back into eflags. xor eax,1<<21 push eax popfd // Read new eflags into eax. pushfd pop eax // See if bit 21 state change stuck. If so, we have CPUID instruction support // and we will return with bit 21 set, otherwise there is no CPUID support and // we will return zero. xor eax, [esp] // Restore orignal eflags. popfd } } #pragma warning ( default : 4035 ) BOOLEAN GetCpuId ( ULONG index, CPUINFO *cpuinfo ) /*++ Routine Description: Checks whether the processor supports the CPUID instruction. If not, returns FALSE. Otherwise, runs the CPUID instruction with the supplied index and writes the output into the location pointed to by cpuinfo if it is not NULL. Arguments: index - Supplies a value that selects which data should be returned by CPUID. cpuinfo - Supplies a pointer to a CPUINFO structure into which the CPUID data will be written. Can be NULL in which case no data is returned. Return Value: TRUE if the CPUID instruction is supported by the processor and was run for this call of the function. FALSE if the CPUID instruction is not supported. Nothing is written into cpuinfo in that case. --*/ { static ULONG initialized=0; // Check if the processor supports the CPUID instruction. // This is a one time check. 
if (!initialized) { initialized=1+CpuIdOk(); } // If the CPUID instruction is supported, then run it. if (initialized&(1<<21)) { // The CPU supports CPU ID. Do it. __asm { mov eax,index // Now make clear to the compiler that the cpuid instruction // will blow away the contents of these registers. The inline // assembler does not support the cpuid instruction, so the compiler // thinks edx, ebx, and ecx do not change in this function. Not so. xor edx,edx xor ebx,ebx xor ecx,ecx cpuid // Now, if we were passed a valid pointer for the data, then store it. mov esi,cpuinfo or esi,esi jz nullptr mov [esi],eax mov [esi+4],ebx mov [esi+8],edx mov [esi+12],ecx nullptr: } return TRUE; } // CPU does not support CPU ID. return FALSE; } BOOL GetProcessorInfo ( VOID ) /*++ Routine Description: Loads globals containing information about the processor architecture, manufacturer family, model, stepping and features. Currently supports AMD and Intel processors. Arguments: None. Return Value: TRUE if the processor supports CPUID instruction and the manufacturer is known and supported. Currently Intel and AMD are supported. FALSE otherwise. --*/ { CPUINFO thecpu; if (!GetCpuId(0, &thecpu)) { return FALSE; } if (thecpu.ebx==0x756e6547 && thecpu.edx==0x49656e69 && thecpu.ecx==0x6c65746e ) { CPUManufacturer=INTEL; if (thecpu.eax) { GetCpuId(1, &thecpu); CPUFamily=(thecpu.eax>>8)&0xf; CPUModel=(thecpu.eax>>4)&0xf; CPUStepping=(thecpu.eax)&0xf; CPUArchitecture=X86; CPUFeatures=thecpu.edx; return TRUE; } return FALSE; } if (thecpu.ebx==0x68747541 && thecpu.edx==0x69746e65 && thecpu.ecx==0x444d4163 ) { CPUManufacturer=AMD; if (thecpu.eax) { GetCpuId(1, &thecpu); CPUFamily=(thecpu.eax>>8)&0xf; CPUModel=(thecpu.eax>>4)&0xf; CPUStepping=(thecpu.eax)&0xf; CPUArchitecture=X86; CPUFeatures=thecpu.edx; return TRUE; } return FALSE; } CPUManufacturer=(ULONG)-1; return FALSE; } // Note that all of the timing functions should be in locked memory. 
// Note also that I should make sure that they are in the cache - preferably in
// the primary CPU cache, but at least in the secondary cache.

// We use timer 0 on the PC motherboard for our calibration of the CPU
// cycle counter.


// The three read functions below have no return statement.  Their result is
// whatever their inline assembly leaves in the return register(s), so the
// "no return value" warning is turned off around them.

#pragma warning ( disable : 4035 )

// Executes the rdtsc instruction and returns the value it produces.

LONGLONG
__inline
ReadCycleCounter (
    VOID
    )
{

    __asm {
        rdtsc
        }

}


// Writes 0 to the timer command port 0x43 (presumably the 8254 command that
// latches the count of timer 0 - confirm against the 8254 datasheet), then
// reads two bytes from port 0x40.  The first byte read becomes bits 0-7 of
// the result and the second byte becomes bits 8-15.

ULONG
ReadPcTimer (
    VOID
    )
{

    __asm {

        xor eax,eax
        out 0x43,al
        in al,0x40
        mov ecx,eax
        in al,0x40
        shl eax,8
        or eax,ecx

        }

}


// Reads a single byte from port 0x41 without writing any command first and
// returns it zero extended.

ULONG
__inline
FastPcTimerRead (
    VOID
    )
{

    __asm {

        xor eax,eax
        in al,0x41

        }

}

#pragma warning ( default : 4035 )


// Checks the status of motherboard timer 0 and then waits until the timer
// count read by ReadPcTimer is at least 20.  The masked status byte must be
// 0x34; if it is not, a breakpoint (int 3) is executed.

VOID
SetupPcTimer (
    VOID
    )
{

    __asm {

#if 0

        // The following code is for using timer 2.  I have decided to use timer 0.

        // First turn off PC speaker and turn ON timer 2 gate.
        mov eax,1
        out 0x61,al

        // Now read from timer port to clear any previously latched data.
        // In case someone else latched data but never read it.
        in al,0x42
        in al,0x42

        // Now latch timer 2 status and make sure mode and latch type OK.
        mov eax,0xe8
        out 0x43,al
        in al,0x42
        and eax,0x37
        cmp eax,0x36
        jz timersetupok
        int 3

timersetupok:

#endif

        // Now read from timer port to clear any previously latched data.
        // In case someone else latched data but never read it.
        // Note that we do three reads in case both status and a 2 byte
        // count were latched.
        in al,0x40
        in al,0x40
        in al,0x40

        // Now latch timer 0 status and make sure mode and latch type OK.
        // The status byte is masked with 0x37 and must then equal 0x34.
        // NOTE(review): per the 8254 status byte layout that should mean
        // mode 2, two byte (low then high) access, binary counting - confirm.
        mov eax,0xe2
        out 0x43,al
        in al,0x40
        and eax,0x37
        cmp eax,0x34
        jz timersetupok
        int 3

timersetupok:

        }

    // Now since we are NOT reprogramming the mode, or the count of the timer, we
    // do not know what count the timer counter will autoreload.  Therefore, we
    // cannot make an accurate timing measurement if the timer wraps.  Therefore
    // we wait here until the timer has a positive count that is large enough to
    // ensure that we can calibrate our delay loop without having the timer wrap.

    // Note that this does mean we WILL have an extra delay here waiting for the timer
    // to wrap.  Note that we should also make sure that we disable the timer interrupts
    // so that they don't cause us to wait even longer.

    // Spin for as long as the count read from the timer is below 20.

    while (ReadPcTimer()<20) ;

}


// IMPORTANT!  Note that it is possible though VERY unlikely that someone
// could have sent a command to latch BOTH the count and the status
// and then never done the reads.  To clear out any latched data we
// would thus need to do 3 reads.  One for status, and 2 for possibly
// 16 bits of latched data.

// In this function, we check setup of the motherboard PC timer 1.
// It should be setup for mode 2 counting, for doing single byte reads,
// and for reloading a max count of 0x12.  We verify this and return
// true if so, false otherwise.

// NOTE(review): the code below only compares the masked status byte with
// 0x14.  The reload count of 0x12 is not checked anywhere in this function.

BOOL
SetupFastPcTimer (
    VOID
    )
{

    __asm {

        // Read from timer port to clear any previously latched data.
        // We do 3 reads in case the timer was programmed for 16 bit reads
        // and both status and count were latched.
        in al,0x41
        in al,0x41
        in al,0x41

        // Now latch timer 1 status and make sure mode and latch type OK.
        // The status is read back from port 0x41, masked with 0x37 and must
        // then equal 0x14.
        mov eax,0xe4
        out 0x43,al
        in al,0x41
        and eax,0x37
        cmp eax,0x14
        jz timersetupok

        }

    // The status check failed.

    Trap();

    return FALSE;

    // Reached only by the jz inside the assembly block above.

timersetupok:

    return TRUE;

}


// Note that this function assumes that there is a cycle counter
// in the machine and that calling ReadCycleCounter will read
// it correctly.

// If this function cannot successfully measure cyclespertick, it
// returns 0.

ULONG
MeasureCPUCyclesPerTick (
    VOID
    )
{

    ULONG i;
    ULONG ticks;
    LONGLONG cyclecount;
    ULONG totaltimerreads;
    ULONG fasttimermaxvalue,counterwrapped;
    ULONG cyclespertick;
    static ULONG largesttimermaxvalue=0;

#if DEBUG
    static ULONG maxcyclespertick=0,mincyclespertick=0xffffffff;
#endif

    // Make sure that the PC motherboard timer we will use is setup
    // correctly.  If not, then punt.

    if (!SetupFastPcTimer()) {
        return 0;
        }

    // Get the variables we will use in the cache.

    cyclecount=0;

    // NOTE(review): the for statement below is incomplete in this copy of the
    // file.  Text between "i" and "fasttimermaxvalue" appears to have been
    // lost and must be restored from the original source.

    for (i=0;ifasttimermaxvalue) {
        fasttimermaxvalue=ticks;
        }

    // Track if the counter wraps.

    // IMPORTANT!
We depend here on TimerRead[0].value ALWAYS being larger // than any value we can read from the hardware. if (ticks>TimerRead[i-1].value) { counterwrapped=TRUE; } // Make sure the cyclecounter didn't get blown away between this and // the last measurement. If so, then punt. // IMPORTANT! We depend here on TimerRead[0].lastreadtimestamp // being correctly initialized outside this loop. if ((LONGLONG)(cyclecount-TimerRead[i-1].lastreadtimestamp)<0) { Trap(); return 0; } // If we are on the same tick as the last read, then update // the lastreadtimestamp and count for the previous set of reads. // IMPORTANT! We depend here on TimerRead[0].value ALWAYS being // different from any value we can read from the hardware. if (ticks==TimerRead[i-1].value) { TimerRead[i-1].lastreadtimestamp=cyclecount; TimerRead[i-1].count++; continue; } // If we get here, we have a new timer value. So log the new // value. TimerRead[i].firstreadtimestamp=cyclecount; TimerRead[i].lastreadtimestamp=cyclecount; TimerRead[i].value=ticks; TimerRead[i].count=1; // Point to next timer read slot. i++; } // Make sure that we don't try to calibrate with a broken timer. if (totaltimerreads>=LIMITTOTALTIMERREADS) { Trap(); return 0; } // Track the largest value read from timer 1 since boot. if (fasttimermaxvalue>largesttimermaxvalue) { largesttimermaxvalue=fasttimermaxvalue; // Make sure the max count is OK. if (largesttimermaxvalue>DEFAULTMAXTIMERCOUNT) { dprintf(("Max count of 0x%x instead of 0x12 on channel 1 of 8254 timer.", largesttimermaxvalue)); } } // If we wrapped, and our current max count either equals or is within // 2 ticks of the largest since boot, then use the largest max count // since boot in our loops to calculate the upper and lower bounds // on cyclespertick. 
// Otherwise we have wrapped and logged a maximum timer 1 count that // is not close enough to the maximum since boot to be able to trust // that the max since boot is the correct value to use when calculating // the bounds, so, force the calculations to only pair up measurements // that do not include counter wraps in the calculations by setting // fasttimermaxvalue to zero. // If we did not wrap, then the value of fasttimermaxvalue is irrelevant // since it will not be needed to correctly calculate the upper and lower // bounds on cyclespertick. if (counterwrapped) { if (fasttimermaxvalue+2>=largesttimermaxvalue) { fasttimermaxvalue=largesttimermaxvalue; // Now make sure that largesttimermaxvalue matches what // the default maximum should be. If not, then prevent the // upper and lower bound calculations from using measurement // pairs that have wrapped timer ticks. if (fasttimermaxvalue!=DEFAULTMAXTIMERCOUNT) { fasttimermaxvalue=0; } } else { fasttimermaxvalue=0; } } // Now calculate the cyclespertick. // Scan through all of the pairs of tick values and calculate an // an upper and lower bound on the cycles per tick for each pair. { ULONG j; ULONG lowerbound,upperbound; ULONG maxlowerbound,minupperbound; ULONG lowerboundtotal,upperboundtotal; ULONG lowerboundcount,upperboundcount; lowerboundtotal=0; lowerboundcount=0; upperboundtotal=0; upperboundcount=0; maxlowerbound=0; minupperbound=0xffffffff; for (i=1;i1) { //Trap(); lowerbound=(ULONG)(TimerRead[i].lastreadtimestamp-TimerRead[i].firstreadtimestamp); if (lowerbound>maxlowerbound) { maxlowerbound=lowerbound; } lowerboundtotal+=lowerbound; lowerboundcount++; } totaltickdiff=0; for (j=i+1;j= 1. If not, then something // is very wrong. So punt completely. 
if (tickdiff<1) { Trap(); return 0; } totaltickdiff+=tickdiff; lowerbound=(ULONG)(TimerRead[j].lastreadtimestamp-TimerRead[i].firstreadtimestamp)/(totaltickdiff+1); if (lowerbound>maxlowerbound) { maxlowerbound=lowerbound; } lowerboundtotal+=lowerbound; lowerboundcount++; if (totaltickdiff>1) { upperbound=(ULONG)(TimerRead[j].firstreadtimestamp-TimerRead[i].lastreadtimestamp)/(totaltickdiff-1); if (upperbound(maxlowerbound+minupperbound)/2) { static badcount=0; badcount++; //Trap(); return 0; } } if (maxlowerbound>minupperbound) { static invertedcount=0; invertedcount++; //Trap(); } { static ULONG totalminupperbound=0; static ULONG countminupperbound=0; static ULONG minminupperbound=0xffffffff; static ULONG maxminupperbound=0; if (minupperboundmaxminupperbound) { maxminupperbound=minupperbound; } totalminupperbound+=minupperbound; countminupperbound++; } cyclespertick=(upperboundtotal/upperboundcount + lowerboundtotal/lowerboundcount)/2; cyclespertick=(maxlowerbound+minupperbound)/2; } // IDEA! We can probably get a better estimate if we randomize the // time of our reads. That will remove any relationship between the // time it takes to do an i/o read and the period of the timer tick // itself. #if DEBUG // Log statistics. if (cyclespertick>maxcyclespertick) { maxcyclespertick=cyclespertick; } if (cyclespertick