/*++

Copyright (c) 1992-2000  Microsoft Corporation

Module Name:

    analysis.c

Abstract:

    This module contains the main file of the analysis
    module.

Author:

    Ori Gershony (t-orig) creation-date 6-July-1995

Revision History:

      24-Aug-1999 [askhalid] copied from 32-bit wx86 directory and make work for 64bit.


--*/

#include <nt.h>
#include <ntrtl.h>
#include <nturtl.h>
#include <windows.h>
#include <wx86.h>
#include <wx86nt.h>
#include <wx86cpu.h>
#include <cpuassrt.h>
#include <threadst.h>
#include <instr.h>
#include <analysis.h>
#include <decoder.h>
#include <frag.h>
#include <config.h>
#include <compiler.h>

ASSERTNAME;

 

//
// Macro to determine when to stop looking ahead during compilation.
// Evaluates to non-zero when the fragment descriptor for the instruction's
// Operation has OPFL_STOP_COMPILE set.  The argument is parenthesized so
// that any expression yielding an instruction (eg. *pInstr) expands
// correctly, not just a plain identifier.
//
#define STOP_DECODING(inst)     (Fragments[(inst).Operation].Flags & OPFL_STOP_COMPILE)

//
// Map a REG_ constant (offset into cpu struct) into register bit map
// used by instruction data.
//
// Layout of the 0x1e (30) entries, in index order:
//   0x00-0x07  the eight 32-bit registers (EAX..EDI)
//   0x08-0x0d  six entries that map to no register bits (0) -- presumably
//              REG_ constants that are not general-purpose registers;
//              TODO confirm against the REG_ definitions
//   0x0e-0x15  the eight 16-bit registers (AX..DI)
//   0x16-0x1d  the eight 8-bit registers (AL, CL, DL, BL, AH, CH, DH, BH)
//
// UpdateRegs() indexes this table directly with Operand->Reg and
// Operand->IndexReg, so those values must be below 0x1e.
//
const DWORD MapRegNumToRegBits[0x1e] =
    {REGEAX, REGECX, REGEDX, REGEBX, REGESP, REGEBP, REGESI, REGEDI,
     0, 0, 0, 0, 0, 0,
     REGAX, REGCX, REGDX, REGBX, REGSP, REGBP, REGSI, REGDI,
     REGAL, REGCL, REGDL, REGBL, REGAH, REGCH, REGDH, REGBH };


ULONG
LocateEntryPoints(
    PINSTRUCTION InstructionStream,
    ULONG NumberOfInstructions
    )
/*++

Routine Description:

    This function scans the InstructionStream and marks instructions
    which begin an entrypoint.  On return, an instruction begins an
    entrypoint if its EntryPoint field has a different value than the
    previous instruction's value.  No instruction will have a NULL pointer.

    Note that in this pass, the EntryPoint field does *not* point to an
    ENTRYPOINT structure... it is only a marker.

Arguments:

    InstructionStream -- The decoded instruction stream to scan.  In the
        non-slowmode path only the "starts an entrypoint" markers are
        written before the final numbering pass, so the EntryPoint fields
        are presumably NULL on entry -- TODO confirm against the caller.

    NumberOfInstructions -- The number of entries in InstructionStream

Return Value:

    Count of EntryPoints located (0 for an empty stream).

--*/
{
    ULONG i, j, intelDest;
    ULONG EntryPointCounter;
    ULONG IntelStart;
    ULONG IntelEnd;
    BOOL DestFound;

    if (NumberOfInstructions == 0) {
        //
        // Nothing to mark.  Bail out before indexing [0] or [N-1].
        //
        return 0;
    }

    if (CompilerFlags & COMPFL_SLOW) {
        //
        // The compiler is supposed to generate slowmode code.  Each
        // x86 instruction gets its own ENTRYPOINT
        //
        EntryPointCounter=1;
        for (i=0; i<NumberOfInstructions; i++) {
            //
            // Mark all instructions which don't correspond to 0-byte NOPs
            // following optimized instructions as starting EntryPoints.
            // A 0-byte NOP inherits the marker of the instruction before it.
            //
            if (InstructionStream[i].Size) {
                EntryPointCounter++;
            }
            InstructionStream[i].EntryPoint = (PENTRYPOINT)EntryPointCounter;
        }

    } else {

        //
        // Find all instructions which need Entrypoints.
        //     Performance is O(n^2) in the worst case, although
        //     it will be typically much closer to O(n)
        //
        //  Instructions which mark the starts of Entrypoints have
        //  their .EntryPoint pointer set to non-NULL.  Instructions which
        //  don't require entrypoints have it set to NULL;
        //

        //
        // [IntelStart, IntelEnd) is the range of intel addresses covered
        // by the stream.  IntelEnd is the address just past the last
        // instruction; it must NOT have IntelStart added to it, as that
        // both widens the range and can wrap for high addresses.
        //
        IntelStart = InstructionStream[0].IntelAddress;
        IntelEnd = InstructionStream[NumberOfInstructions-1].IntelAddress +
                   InstructionStream[NumberOfInstructions-1].Size;

        //
        // The first instruction always gets an entrypoint
        //
        InstructionStream[0].EntryPoint = (PENTRYPOINT)1;

        //
        // Visit each instruction in turn
        //
        for (i=0; i<NumberOfInstructions; i++) {

            if (((i+1) < NumberOfInstructions) &&
                (Fragments[InstructionStream[i].Operation].Flags & OPFL_END_NEXT_EP)) {
                //
                // This instruction marks the end of an Entrypoint.  The next
                // instruction gets a new Entrypoint.
                //
                CPUASSERT(i < CpuInstructionLookahead-1 && i < NumberOfInstructions-1);
                InstructionStream[i+1].EntryPoint = (PENTRYPOINT)1;
            }

            // Now see if it is a direct control transfer instruction with a
            // destination that lies within this instruction stream.  If it is,
            // we want to create an Entry Point at the destination so that the
            // control transfer will be compiled directly to the patched form,
            // and won't have to be patched later.
            //
            if (Fragments[InstructionStream[i].Operation].Flags & OPFL_CTRLTRNS) {
                //
                // The instruction is a direct control-transfer.  If the
                // destination is within the InstructionStream, create an
                // Entrypoint at the destination.
                //

                if (InstructionStream[i].Operand1.Type == OPND_IMM ||
                    InstructionStream[i].Operand1.Type == OPND_NOCODEGEN) {
                    // Get the intel destination from the instruction structure.
                    intelDest = InstructionStream[i].Operand1.Immed;
                } else {
                    CPUASSERT(InstructionStream[i].Operand1.Type == OPND_ADDRREF );
                    // A FAR instruction - Operand1 is a ptr to a SEL:OFFSET pair
                    intelDest = *(UNALIGNED PULONG)(InstructionStream[i].Operand1.Immed);
                }

                //
                // The upper bound is inclusive only for simplicity: a
                // destination equal to IntelEnd can match nothing other
                // than a trailing 0-byte NOP in the search below.
                //
                if ((intelDest >= IntelStart) && (intelDest <= IntelEnd)) {
                    //
                    // Destination of the control-transfer is within the
                    // instructionstream.  Find the destination instruction.
                    //
                    DestFound = FALSE;

                    if (intelDest > InstructionStream[i].IntelAddress) {
                        //
                        // The dest. address is at a higher address.
                        //
                        for (j=i+1; j<NumberOfInstructions; ++j) {
                            if (InstructionStream[j].IntelAddress == intelDest) {
                                DestFound = TRUE;
                                break;
                            }
                        }
                    } else {
                        //
                        // The dest. address is at a lower address (or is
                        // this instruction itself).  Walk from i down to
                        // and including index 0; j is unsigned, so the
                        // test is done before the decrement.
                        //
                        j = i + 1;
                        while (j-- > 0) {
                            if (InstructionStream[j].IntelAddress == intelDest) {
                                DestFound = TRUE;
                                break;
                            }
                        }
                    }

                    //
                    // An exact match may not be found in the event that the
                    // app is punning (either a real pun or the app is jumping
                    // into the middle of an optimized instruction).  In
                    // either of the cases, defer entrypoint creation until
                    // the branch is actually taken.
                    //
                    if (DestFound) {
                        //
                        // Exact match was found.  Create an Entrypoint.
                        //
                        InstructionStream[j].EntryPoint = (PENTRYPOINT)1;
                    }
                }
            }  // if OPFL_CTRLTRNS
        } // for ()

        //
        // Convert the EntryPoint field from NULL/non-NULL to a unique
        // value for each range of instructions.
        //
        EntryPointCounter=1;
        i=0;
        while (i<NumberOfInstructions) {
            //
            // This instruction marks the beginning of a basic block
            //
            InstructionStream[i].EntryPoint = (PENTRYPOINT)EntryPointCounter;

            //
            // Extend the block until the stream ends or an instruction
            // which marks the start of the next basic block is found.
            // Note that 0-byte NOP instructions are not allowed to start
            // basic blocks as that violates the rules of OPT_ instructions.
            //
            j=i+1;
            while (j < NumberOfInstructions &&
                   !(InstructionStream[j].Size && InstructionStream[j].EntryPoint)) {
                InstructionStream[j].EntryPoint = (PENTRYPOINT)EntryPointCounter;
                j++;
            }
            EntryPointCounter++;
            i = j;
        }
    } // if not COMPFL_SLOW

    //
    // At this point, EntryPointCounter holds the number of EntryPoints
    // plus one, because we started the counter at 1, not 0.  Correct
    // that now.
    //
    EntryPointCounter--;

    return EntryPointCounter;
}


VOID
UpdateRegs(
    PINSTRUCTION pInstr,
    POPERAND Operand
    )
/*++

Routine Description:

    Folds the registers named by one operand into the instruction's
    RegsSet / RegsNeeded bitmaps.  The operand type decides which bitmap
    is updated:

        register reference (or NOCODEGEN)  -> Reg is added to RegsSet
        register value                     -> Reg is added to RegsNeeded
        memory address (ref or value)      -> Reg and IndexReg, when
                                              present, are added to
                                              RegsNeeded

    Any other operand type leaves the instruction untouched.

Arguments:

    pInstr -- the instruction whose RegsSet/RegsNeeded are updated

    Operand -- the operand of the instruction to examine

Return Value:

    return-value - none

--*/
{
    if (Operand->Type == OPND_NOCODEGEN ||
        Operand->Type == OPND_REGREF) {
        //
        // The operand is written through: the register is modified.
        //
        if (Operand->Reg != NO_REG) {
            pInstr->RegsSet |= MapRegNumToRegBits[Operand->Reg];
        }
        return;
    }

    if (Operand->Type == OPND_REGVALUE) {
        //
        // The operand is the contents of a register: the register is read.
        //
        if (Operand->Reg != NO_REG) {
            pInstr->RegsNeeded |= MapRegNumToRegBits[Operand->Reg];
        }
        return;
    }

    if (Operand->Type == OPND_ADDRREF ||
        Operand->Type == OPND_ADDRVALUE8 ||
        Operand->Type == OPND_ADDRVALUE16 ||
        Operand->Type == OPND_ADDRVALUE32) {
        //
        // The operand is a memory address: both registers used to form
        // the address are read.
        //
        if (Operand->Reg != NO_REG) {
            pInstr->RegsNeeded |= MapRegNumToRegBits[Operand->Reg];
        }
        if (Operand->IndexReg != NO_REG) {
            pInstr->RegsNeeded |= MapRegNumToRegBits[Operand->IndexReg];
        }
    }
}


VOID
CacheIntelRegs(
    PINSTRUCTION InstructionStream,
    ULONG numInstr)
/*++

Routine Description:

    This function determines what x86 registers, if any, can be cached in
    RISC preserved registers.

    The stream is walked from the last instruction to the first.  For each
    instruction the RegsSet/RegsNeeded bitmaps are computed, and a
    per-register count of "reads seen since the register was last written
    (or since the basic block boundary)" is maintained.  When a register
    has more than one such read, it is flagged in the RegsToCache field of
    the instruction that follows the write (or of the first instruction of
    the following basic block).

    RegsNeeded and RegsToCache are only OR-ed into by this pass, so they are
    presumably zero on entry -- TODO confirm against the decoder.

Arguments:

    InstructionStream -- The instruction stream returned by the decoder

    numInstr -- The length of InstructionStream

Return Value:

    return-value - none

--*/
{
    PINSTRUCTION pInstr;
    BYTE RegUsage[REGCOUNT];
    DWORD RegsToCache;
    ULONG Index;
    int i;
    PENTRYPOINT PrevEntryPoint;

    if (numInstr == 0) {
        //
        // Empty stream: nothing to analyze, and [numInstr-1] would be
        // out of bounds.
        //
        return;
    }

    //
    // Calculate the RegsSet and RegsNeeded for the bottommost instruction
    //
    pInstr = &InstructionStream[numInstr-1];
    pInstr->RegsSet = Fragments[pInstr->Operation].RegsSet;
    PrevEntryPoint = pInstr->EntryPoint;
    UpdateRegs(pInstr, &pInstr->Operand1);
    UpdateRegs(pInstr, &pInstr->Operand2);
    UpdateRegs(pInstr, &pInstr->Operand3);

    //
    // For each 32-bit register used as a parameter to this instruction,
    // set the usage count to 1.
    //
    for (i=0; i<REGCOUNT; ++i) {
        if (pInstr->RegsNeeded & (REGMASK<<(REGSHIFT*i))) {
            RegUsage[i] = 1;
        } else {
            RegUsage[i] = 0;
        }
    }

    //
    // Loop over instruction stream from bottom to top, starting at the
    // second-to-last instruction.  An index is used rather than a
    // decrementing pointer so that no pointer below the start of the
    // array is ever formed.
    //
    Index = numInstr - 1;
    while (Index > 0) {

        Index--;
        pInstr = &InstructionStream[Index];

        //
        // Calculate the RegsSet and RegsNeeded values for this instruction
        //
        pInstr->RegsSet = Fragments[pInstr->Operation].RegsSet;
        UpdateRegs(pInstr, &pInstr->Operand1);
        UpdateRegs(pInstr, &pInstr->Operand2);
        UpdateRegs(pInstr, &pInstr->Operand3);

        RegsToCache = 0;

        if (PrevEntryPoint != pInstr->EntryPoint) {

            //
            // The current instruction marks the end of an Entrypoint.
            //
            PrevEntryPoint = pInstr->EntryPoint;

            //
            // For all x86 registers which have been read more than once
            // but not modified in the basic block, load them into the
            // cache before executing the first instruction in the basic
            // block.
            //
            for (i=0; i<REGCOUNT; ++i) {
                if (RegUsage[i] > 1) {
                    RegsToCache |= (REGMASK<<(REGSHIFT*i));
                }
            }

            //
            // Reset the RegUsage[] array to indicate no registers are
            // cached.
            //
            RtlZeroMemory(RegUsage, sizeof(RegUsage));

        } else {

            //
            // For each 32-bit x86 register modified by this instruction,
            // update the caching info.
            //
            for (i=0; i<REGCOUNT; ++i) {
                DWORD RegBits = pInstr->RegsSet & (REGMASK<<(REGSHIFT*i));
                if (RegBits) {
                    //
                    // The ith 32-bit x86 register has been modified by this
                    // instruction
                    //
                    if (RegUsage[i] > 1) {
                        //
                        // There is more than one consumer of the modified
                        // value so it is worth caching.
                        //
                        RegsToCache |= RegBits;
                    }

                    //
                    // Since this x86 register was dirtied by this instruction,
                    // its usage count must be reset to 0.
                    //
                    RegUsage[i] = 0;
                }
            }
        }

        //
        // Update the list of x86 registers which can be loaded into
        // cache registers before the next instruction executes.
        //
        pInstr[1].RegsToCache |= RegsToCache;

        //
        // For each 32-bit register used as a parameter to this instruction,
        // bump the usage count.  The count is a BYTE and only ever compared
        // against 1, so saturate instead of letting it wrap back to 0.
        //
        for (i=0; i<REGCOUNT; ++i) {
            if (pInstr->RegsNeeded & (REGMASK<<(REGSHIFT*i))) {
                if (RegUsage[i] < 0xFF) {
                    RegUsage[i]++;
                }
            }
        }
    }

    //
    // NOTE(review): RegsToCache is only ever written for instructions
    // 1..numInstr-1 (via pInstr[1]); the usage counts accumulated for the
    // first basic block are never applied to InstructionStream[0].  Confirm
    // whether that is intended.
    //
}


VOID
OptimizeInstructionStream(
    PINSTRUCTION IS,
    ULONG numInstr
    )
/*++

Routine Description:

    This function performs various optimizations on the instruction stream
    returned by the decoder.  Single x86 instructions are replaced by
    special-case operations, and recognized sequences of two or three x86
    instructions are folded into one OPT_ operation.  When a sequence is
    folded, the first instruction takes on the combined Size and the
    remaining instructions become OP_Nop with Size 0.

    Every look-ahead at IS[i+1] / IS[i+2] is guarded so that it never
    reads past the end of the stream.

Arguments:

    IS -- The instruction stream returned by the decoder

    numInstr -- The length of IS

Return Value:

    return-value - none

--*/
{
    ULONG i;

    CPUASSERTMSG(numInstr, "Cannot optimize 0-length instruction stream");

    //
    // Pass 1: Optimize x86 instruction stream, replacing single x86
    //         instructions with special-case instructions, and replacing
    //         multiple x86 instructions with single special-case OPT_
    //         instructions
    //
    for (i=0; i<numInstr; ++i) {

        switch  (IS[i].Operation) {
        case OP_Push32:
            //
            // "i + 2 < numInstr" is used rather than "i < numInstr-2":
            // numInstr is unsigned, so the latter wraps for short streams.
            //
            if (i + 2 < numInstr
                && IS[i].Operand1.Type == OPND_REGVALUE){

                if (IS[i].Operand1.Reg == GP_EBP) {
                    // OP_OPT_SetupStack --
                    //      push ebp
                    //      mov ebp, esp
                    //      sub esp, x
                    if ((IS[i+1].Operation == OP_Mov32) &&
                        (IS[i+1].Operand1.Type == OPND_REGREF) &&
                        (IS[i+1].Operand1.Reg == GP_EBP) &&
                        (IS[i+1].Operand2.Type == OPND_REGVALUE) &&
                        (IS[i+1].Operand2.Reg == GP_ESP) &&
                        (IS[i+2].Operation == OP_Sub32) &&
                        (IS[i+2].Operand1.Type == OPND_REGREF) &&
                        (IS[i+2].Operand1.Reg == GP_ESP) &&
                        (IS[i+2].Operand2.Type == OPND_IMM)){

                        IS[i].Operation = OP_OPT_SetupStack;
                        IS[i].Operand1.Type = OPND_IMM;
                        IS[i].Operand1.Immed = IS[i+2].Operand2.Immed;
                        IS[i].Size += IS[i+1].Size + IS[i+2].Size;
                        IS[i].Operand2.Type = OPND_NONE;
                        IS[i+1].Operation = OP_Nop;
                        IS[i+1].Operand1.Type = OPND_NONE;
                        IS[i+1].Operand2.Type = OPND_NONE;
                        IS[i+1].Size = 0;
                        IS[i+2].Operation = OP_Nop;
                        IS[i+2].Operand1.Type = OPND_NONE;
                        IS[i+2].Operand2.Type = OPND_NONE;
                        IS[i+2].Size = 0;
                        i+=2;
                        break;
                    }
                } else if (IS[i].Operand1.Reg == GP_EBX) {
                    // OP_OPT_PushEbxEsiEdi --
                    //      push ebx
                    //      push esi
                    //      push edi
                    if ((IS[i+1].Operation == OP_Push32) &&
                        (IS[i+1].Operand1.Type == OPND_REGVALUE) &&
                        (IS[i+1].Operand1.Reg == GP_ESI) &&
                        (IS[i+2].Operation == OP_Push32) &&
                        (IS[i+2].Operand1.Type == OPND_REGVALUE) &&
                        (IS[i+2].Operand1.Reg == GP_EDI)){

                        IS[i].Operation = OP_OPT_PushEbxEsiEdi;
                        IS[i].Size += IS[i+1].Size + IS[i+2].Size;
                        IS[i].Operand1.Type = OPND_NONE;
                        IS[i].Operand2.Type = OPND_NONE;
                        IS[i+1].Operation = OP_Nop;
                        IS[i+1].Operand1.Type = OPND_NONE;
                        IS[i+1].Operand2.Type = OPND_NONE;
                        IS[i+1].Size = 0;
                        IS[i+2].Operation = OP_Nop;
                        IS[i+2].Operand1.Type = OPND_NONE;
                        IS[i+2].Operand2.Type = OPND_NONE;
                        IS[i+2].Size = 0;
                        i+=2;
                        break;
                    }
                }
            }

            //
            // It is not one of the other special PUSH sequences, so see
            // if there are two consecutive PUSHes to merge together.  Note:
            // If the second PUSH references ESP, the two cannot be merged
            // because the value is computed before 4 is subtracted from ESP.
            //  ie. the following is disallowed:
            //        PUSH EAX
            //        PUSH ESP  ; second operand to Push2 would have been
            //                  ; built before the PUSH EAX was executed.
            //
            if (i + 1 < numInstr &&
                !IS[i].FsOverride &&
                !IS[i+1].FsOverride &&
                IS[i+1].Operation == OP_Push32 &&
                IS[i+1].Operand1.Reg != GP_ESP &&
                IS[i+1].Operand1.IndexReg != GP_ESP) {

                IS[i].Operation = OP_OPT_Push232;
                IS[i].Operand2 = IS[i+1].Operand1;
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Size = 0;
                i++;
            }

            break;

        case OP_Pop32:
            // OP_OPT_PopEdiEsiEbx
            //      pop edi
            //      pop esi
            //      pop ebx
            if (i + 2 < numInstr &&
                (IS[i].Operand1.Type == OPND_REGREF) &&
                (IS[i].Operand1.Reg == GP_EDI) &&
                (IS[i+1].Operation == OP_Pop32) &&
                (IS[i+1].Operand1.Type == OPND_REGREF) &&
                (IS[i+1].Operand1.Reg == GP_ESI) &&
                (IS[i+2].Operation == OP_Pop32) &&
                (IS[i+2].Operand1.Type == OPND_REGREF) &&
                (IS[i+2].Operand1.Reg == GP_EBX)){

                IS[i].Operation = OP_OPT_PopEdiEsiEbx;
                IS[i].Size += IS[i+1].Size + IS[i+2].Size;
                IS[i].Operand1.Type = OPND_NONE;
                IS[i].Operand2.Type = OPND_NONE;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Operand2.Type = OPND_NONE;
                IS[i+1].Size = 0;
                IS[i+2].Operation = OP_Nop;
                IS[i+2].Operand1.Type = OPND_NONE;
                IS[i+2].Operand2.Type = OPND_NONE;
                IS[i+2].Size = 0;
                i+=2;
            } else if (i + 1 < numInstr &&
                !IS[i].FsOverride &&
                !IS[i+1].FsOverride &&
                IS[i].Operand1.Type == OPND_REGREF &&
                IS[i+1].Operation == OP_Pop32 &&
                IS[i+1].Operand1.Type == OPND_REGREF) {

                // Fold the two POPs together.  Both operands are REGREF,
                // so there is no problem with interdependencies between
                // memory touched by the first POP modifying the address
                // of the second POP.  ie. the following is not merged:
                //              POP EAX
                //              POP [EAX]   ; depends on results of first POP
                //
                // As with the PUSH merge, neither instruction may carry
                // an FS override.
                IS[i].Operation = OP_OPT_Pop232;
                IS[i].Operand2 = IS[i+1].Operand1;
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Size = 0;
                i++;
            }
            break;

        case OP_Xor32:
        case OP_Sub32:
            if (IS[i].Operand1.Type == OPND_REGREF &&
                IS[i].Operand2.Type == OPND_REGVALUE &&
                IS[i].Operand1.Reg == IS[i].Operand2.Reg) {
                // Instruction is XOR samereg, samereg  (ie. XOR EAX, EAX),
                //  or SUB samereg, samereg             (ie. SUB ECX, ECX).
                // Emit OP_OPT_ZERO32 samereg
                IS[i].Operand2.Type = OPND_NONE;
                IS[i].Operation = OP_OPT_ZERO32;
            }
            break;

        case OP_Test8:
            if (IS[i].Operand1.Type == OPND_REGVALUE &&
                IS[i].Operand2.Type == OPND_REGVALUE &&
                IS[i].Operand1.Reg == IS[i].Operand2.Reg) {
                // Instruction is TEST samereg, samereg (ie. TEST AL, AL)
                // Emit OP_OPT_FastTest8
                IS[i].Operand1.Type = OPND_REGVALUE;
                IS[i].Operand2.Type = OPND_NONE;
                IS[i].Operation = OP_OPT_FastTest8;
            }
            break;

        case OP_Test16:
            if (IS[i].Operand1.Type == OPND_REGVALUE &&
                IS[i].Operand2.Type == OPND_REGVALUE &&
                IS[i].Operand1.Reg == IS[i].Operand2.Reg) {
                // Instruction is TEST samereg, samereg (ie. TEST AX, AX)
                // Emit OP_OPT_FastTest16
                IS[i].Operand1.Type = OPND_REGVALUE;
                IS[i].Operand2.Type = OPND_NONE;
                IS[i].Operation = OP_OPT_FastTest16;
            }
            break;

        case OP_Test32:
            if (IS[i].Operand1.Type == OPND_REGVALUE &&
                IS[i].Operand2.Type == OPND_REGVALUE &&
                IS[i].Operand1.Reg == IS[i].Operand2.Reg) {
                // Instruction is TEST samereg, samereg (ie. TEST EAX, EAX)
                // Emit OP_OPT_FastTest32
                IS[i].Operand1.Type = OPND_REGVALUE;
                IS[i].Operand2.Type = OPND_NONE;
                IS[i].Operation = OP_OPT_FastTest32;
            }
            break;

        case OP_Cmp32:
            if (i + 1 < numInstr && IS[i+1].Operation == OP_Sbb32 &&
                IS[i+1].Operand1.Type == OPND_REGREF &&
                IS[i+1].Operand2.Type == OPND_REGVALUE &&
                IS[i+1].Operand1.Reg == IS[i+1].Operand2.Reg) {
                // The two instructions are:
                //     CMP anything1, anything2
                //     SBB samereg, samereg
                // The optimized instruction is:
                //     Operation = either CmpSbb32 or CmpSbbNeg32
                //     Operand1  = &samereg  (passed as REGREF)
                //     Operand2  = anything1 (passed as ADDRVAL32 or REGVAL)
                //     Operand3  = anything2 (passed as ADDRVAL32 or REGVAL)
                IS[i].Operand3 = IS[i].Operand2;
                IS[i].Operand2 = IS[i].Operand1;
                IS[i].Operand1 = IS[i+1].Operand1;
                if (i + 2 < numInstr && IS[i+2].Operation == OP_Neg32 &&
                    IS[i+2].Operand1.Type == OPND_REGREF &&
                    IS[i+2].Operand1.Reg == IS[i+1].Operand1.Reg) {
                    // The third instruction is NEG samereg
                    IS[i].Operation = OP_OPT_CmpSbbNeg32;
                    IS[i].Size += IS[i+2].Size;
                    IS[i+2].Operation = OP_Nop;
                    IS[i+2].Operand1.Type = OPND_NONE;
                    IS[i+2].Operand2.Type = OPND_NONE;
                    IS[i+2].Size = 0;
                } else {
                    IS[i].Operation = OP_OPT_CmpSbb32;
                }
                //
                // As with every other merge in this function, the
                // surviving instruction accounts for the bytes of the
                // instructions folded into it.
                //
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Operand2.Type = OPND_NONE;
                IS[i+1].Size = 0;
                //
                // Only step over the SBB here; if a NEG was folded too it
                // is now an OP_Nop and falls through the switch untouched.
                //
                i++;
            }
            break;

        case OP_Cwd16:
            if (i + 1 < numInstr && IS[i+1].Operation == OP_Idiv16) {
                // Replace CWD / IDIV by one instruction
                IS[i].Operation = OP_OPT_CwdIdiv16;
                IS[i].Operand1 = IS[i+1].Operand1;
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Size = 0;
                i++;
            }
            break;

        case OP_Cwd32:
            if (i + 1 < numInstr && IS[i+1].Operation == OP_Idiv32) {
                // Replace CDQ / IDIV by one instruction
                IS[i].Operation = OP_OPT_CwdIdiv32;
                IS[i].Operand1 = IS[i+1].Operand1;
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Operand1.Type = OPND_NONE;
                IS[i+1].Size = 0;
                i++;
            }
            break;

        case OP_FP_FNSTSW:
            if (i + 1 < numInstr && IS[i+1].Operation == OP_Sahf &&
                IS[i].Operand1.Type == OPND_REGREF &&
                IS[i].Operand1.Reg == GP_AX) {

                // Replace FNSTSW AX / SAHF by one instruction
                IS[i].Operation = OP_OPT_FNSTSWAxSahf;
                IS[i].Operand1.Type = OPND_NONE;
                IS[i].Size += IS[i+1].Size;
                IS[i+1].Operation = OP_Nop;
                IS[i+1].Size = 0;
                i++;
            }
            break;

        case OP_FP_FSTP_STi:
            if (IS[i].Operand1.Immed == 0) {
                // FSTP ST(0) gets its own operand-less operation
                IS[i].Operand1.Type = OPND_NONE;
                IS[i].Operation = OP_OPT_FSTP_ST0;
            }
            break;

        default:
            // No special-case form for this operation
            break;

        }
    }
}


VOID
OptimizeIntelFlags(
    PINSTRUCTION IS,
    ULONG numInstr
    )
/*++

Routine Description:

    This function analyzes x86 flag register usage and switches instructions
    to use NoFlags versions if possible.

    The stream is walked from the last instruction to the first, tracking
    which flags are still needed by later instructions.  An instruction
    whose fragment has OPFL_HASNOFLAGS and which is not required to
    generate any flag has its Operation advanced to the NoFlags fragment
    (by 2 when OPFL_ALIGN is set, otherwise by 1).

Arguments:

    IS -- The instruction stream returned by the decoder

    numInstr -- The length of IS; must not exceed MAX_INSTR_COUNT

Return Value:

    return-value - none

--*/
{
    USHORT FlagsNeeded;     // flags required to execute current x86 instr
    USHORT FlagsToGenerate; // flags which current x86 instr must generate
    PFRAGDESCR pFragDesc;   // ptr to Fragments[] array for current instr
    ULONG i;                // instruction index
    BOOL fPassNeeded = TRUE;// TRUE if the outer loop needs to loop once more
    ULONG PassNumber = 0;   // number of times outer loop has looped
    USHORT KnownFlagsNeeded[MAX_INSTR_COUNT]; // flags needed for each instr

    if (numInstr == 0) {
        //
        // Nothing to do; the do/while below would otherwise decrement
        // the unsigned index past zero.
        //
        return;
    }
    CPUASSERT(numInstr <= MAX_INSTR_COUNT);

    while (fPassNeeded) {

        //
        // This loop is executed at most two times.  The second pass is only
        // required if there is a control-transfer instruction whose
        // destination is within the Instruction Stream and at a lower
        // Intel address  (ie. a backwards JMP).
        //
        fPassNeeded = FALSE;
        PassNumber++;
        CPUASSERT(PassNumber <= 2);

        //
        // Iterate over all x86 instructions decoded, from bottom to top,
        // propagating flags info up.  Start off by assuming all x86 flags
        // must be up-to-date at the end of the last basic block.
        //
        FlagsNeeded = ALLFLAGS;
        i = numInstr;
        do {
            i--;
            pFragDesc = &Fragments[IS[i].Operation];

            //
            // Calculate what flags will need to be computed by this
            // instruction and ones before this.
            //
            KnownFlagsNeeded[i] = FlagsNeeded | pFragDesc->FlagsNeeded;
            FlagsToGenerate = FlagsNeeded & pFragDesc->FlagsSet;

            //
            // Calculate what flags this instruction will need to have
            // computed before it can be executed.
            //
            FlagsNeeded = (FlagsNeeded & ~FlagsToGenerate) |
                           pFragDesc->FlagsNeeded;

            if (pFragDesc->Flags & OPFL_CTRLTRNS) {
                //
                // NOTE(review): LocateEntryPoints dereferences
                // Operand1.Immed when the operand is OPND_ADDRREF (a FAR
                // transfer); here Immed is used as-is.  Confirm that such
                // transfers are meant to land in the "unknown destination"
                // case below.
                //
                ULONG IntelDest = IS[i].Operand1.Immed;

                //
                // For control-transfer instructions, FlagsNeeded also includes
                // the flags required for the destination of the transfer.
                //
                if (IS[0].IntelAddress <= IntelDest &&
                    i > 0 && IS[i-1].IntelAddress >= IntelDest) {
                    //
                    // The destination of the control-transfer is at a lower
                    // address in the Instruction Stream.
                    //

                    if (PassNumber == 1) {
                        //
                        // Need to make a second pass over the flags
                        // optimizations in order to determine what flags are
                        // needed for the destination address.
                        //
                        fPassNeeded = TRUE;
                        FlagsNeeded = ALLFLAGS; // assume all flags are needed
                    } else {
                        ULONG j;
                        USHORT NewFlagsNeeded;

                        //
                        // Search for the IntelDest within the Instruction
                        // Stream.  IntelDest may not be found if there is
                        // a pun.  Entries below i still hold the values
                        // recorded during the first pass.
                        //
                        NewFlagsNeeded = ALLFLAGS;  // assume there is a pun
                        for (j=0; j < i; ++j) {
                            if (IS[j].IntelAddress == IntelDest) {
                                NewFlagsNeeded = KnownFlagsNeeded[j];
                                break;
                            }
                        }

                        FlagsNeeded |= NewFlagsNeeded;
                    }
                } else if (i + 1 < numInstr &&
                           IS[i+1].IntelAddress <= IntelDest &&
                           IntelDest <= IS[numInstr-1].IntelAddress) {
                    //
                    // The destination of the control-transfer is at a higher
                    // address in the Instruction Stream.  Pick up the
                    // already-computed FlagsNeeded for the destination.
                    // (The i + 1 bound keeps IS[i+1] inside the stream when
                    // this is the last instruction.)
                    //
                    ULONG j;
                    USHORT NewFlagsNeeded = ALLFLAGS;   // assume a pun

                    for (j=i+1; j < numInstr; ++j) {
                        if (IS[j].IntelAddress == IntelDest) {
                            NewFlagsNeeded = KnownFlagsNeeded[j];
                            break;
                        }
                    }

                    FlagsNeeded |= NewFlagsNeeded;

                } else {
                    //
                    // Destination of the control-transfer is unknown.  Assume
                    // the worst:  all flags are required.
                    //
                    FlagsNeeded = ALLFLAGS;
                }

                //
                // Record the destination's requirements too, so that another
                // control transfer which targets THIS instruction sees every
                // flag that may be consumed after it.  This can only add
                // flags, so it never enables an extra NoFlags conversion.
                //
                KnownFlagsNeeded[i] |= FlagsNeeded;
            }

            //
            // FlagsToGenerate is already masked by FlagsSet, so zero means
            // that none of the flags this instruction sets is needed.
            //
            if (FlagsToGenerate == 0 &&
                (pFragDesc->Flags & OPFL_HASNOFLAGS)) {
                //
                // This instruction is not required to generate any flags, and
                // it has a NOFLAGS version.  Modify the Operation number to
                // point at the NoFlags fragment.
                //
                if (pFragDesc->Flags & OPFL_ALIGN) {
                    IS[i].Operation += 2;
                } else {
                    IS[i].Operation ++;
                }

                if (IS[i].Operation == OP_OPT_ZERONoFlags32) {
                    //
                    // Special-case this to be a "mov [value], zero" so it is
                    // inlined.
                    //
                    IS[i].Operation = OP_Mov32;
                    IS[i].Operand2.Type = OPND_IMM;
                    IS[i].Operand2.Immed = 0;
                }
            }
        } while (i);
    }
}

VOID
DetermineEbpAlignment(
    PINSTRUCTION InstructionStream,
    ULONG numInstr
    )
/*++

Routine Description:

    Walks InstructionStream[] from first to last and records, in each
    instruction's EbpAligned field, whether EBP is assumed to be
    DWORD-aligned at that instruction.

    The assumption starts out FALSE.  It is re-evaluated only at
    instructions whose RegsSet includes EBP:  it becomes TRUE if the
    instruction is "MOV EBP, ESP" or one of the SetupStack fragments, and
    FALSE for any other instruction that modifies EBP.  Instructions which
    do not modify EBP inherit the assumption currently in effect.

Arguments:

    InstructionStream -- The instruction stream returned by the decoder

    numInstr -- The number of instructions in InstructionStream

Return Value:

    None.  The EbpAligned field of every instruction is updated in place.

--*/
{
    PINSTRUCTION Instr;
    PINSTRUCTION StreamEnd = InstructionStream + numInstr;
    BOOL AssumeAligned = FALSE;

    for (Instr = InstructionStream; Instr < StreamEnd; ++Instr) {

        if (Instr->RegsSet & REGEBP) {
            //
            // EBP is written here.  Only a copy of ESP into EBP (done
            // either by a plain 32-bit MOV or inside a SetupStack
            // fragment) lets us assume alignment from this point on; any
            // other write to EBP cancels the assumption.
            //
            AssumeAligned =
                (Instr->Operation == OP_OPT_SetupStack ||
                 Instr->Operation == OP_OPT_SetupStackNoFlags ||
                 (Instr->Operation == OP_Mov32 &&
                  Instr->Operand2.Type == OPND_REGVALUE &&
                  Instr->Operand2.Reg == GP_ESP)) ? TRUE : FALSE;
        }

        //
        // Stamp the current assumption on every instruction, whether or
        // not it touched EBP.
        //
        Instr->EbpAligned = AssumeAligned;
    }
}

ULONG
GetInstructionStream(
    PINSTRUCTION InstructionStream,
    PULONG NumberOfInstructions,
    PVOID pIntelInstruction,
    PVOID pLastIntelInstruction
)
/*++

Routine Description:

    Decodes a run of intel instructions into InstructionStream and then
    runs the analysis passes over the result.

    Decoding stops at the first of:
        - the buffer becoming full,
        - an instruction whose fragment is flagged OPFL_STOP_COMPILE,
        - an instruction located at or beyond pLastIntelInstruction.

    After decoding, the stream is optimized (unless COMPFL_SLOW is set),
    entrypoints are located, and -- for streams of more than two
    instructions when COMPFL_SLOW is not set -- the flags, register-cache
    and EBP-alignment passes are run unless individually disabled.

Arguments:

    InstructionStream -- A pointer to the buffer where the decoded
        instructions are stored.  The whole buffer is zero-filled before
        decoding starts.

    NumberOfInstructions -- Upon entry, the maximal number of instructions
        the buffer can hold.  Upon return, the actual number of
        instructions decoded.

    pIntelInstruction -- A pointer to the first real intel instruction
        to be decoded.

    pLastIntelInstruction -- A pointer to the last intel instruction to be
        compiled, 0xffffffff if not used.

Return Value:

    The value returned by LocateEntryPoints():  the number of entrypoints
    required to describe the decoded instruction stream.

--*/
{
    ULONG Decoded = 0;
    ULONG Capacity = *NumberOfInstructions;
    ULONG cEntryPoints;

    //
    // The decoder depends on starting from an all-zero buffer.
    //
    RtlZeroMemory(InstructionStream, Capacity*sizeof(INSTRUCTION));

#if DBG
    //
    // Debug-only sanity check on the address about to be decoded:  if it
    // does not lie within an x86 image (either it belongs to a non-x86
    // image or to no image at all), log a warning.  The whole probe is
    // wrapped in an exception handler so that a fault while examining the
    // address is silently ignored.
    //
    try {
        //
        // Opcode word at the target address.  The read itself is not
        // performed here, so the value is always 0 and the BOP test
        // below always passes.
        //
        USHORT Instr = 0;

        //
        // BOP instructions (0xc4c4) are exempt from the check.
        //
        if (Instr != 0xc4c4) {

            NTSTATUS Status;
            MEMORY_BASIC_INFORMATION MemInfo;

            Status = NtQueryVirtualMemory(NtCurrentProcess(),
                                          pIntelInstruction,
                                          MemoryBasicInformation,
                                          &MemInfo,
                                          sizeof(MemInfo),
                                          NULL);

            //
            // If the query fails, Eip isn't pointing anywhere we can
            // describe; nothing is logged in that case.
            //
            if (NT_SUCCESS(Status)) {
                PIMAGE_NT_HEADERS NtHeaders;

                NtHeaders = RtlImageNtHeader(MemInfo.AllocationBase);
                if (!NtHeaders || NtHeaders->FileHeader.Machine != IMAGE_FILE_MACHINE_I386) {
                    LOGPRINT((TRACELOG, "CPU Analysis warning:  jumping from Intel to non-intel code at 0x%X\r\n", pIntelInstruction));
                }
            }
        }
    } except(EXCEPTION_EXECUTE_HANDLER) {
        ;
    }
#endif  //DBG

    //
    // Decode one instruction per iteration into the next free slot.
    //
    while (Decoded < Capacity) {
        PINSTRUCTION Slot = InstructionStream + Decoded;
        BOOL LastOne;

        DecodeInstruction ((DWORD) (ULONGLONG)pIntelInstruction, Slot);

        //
        // This is the final instruction of the stream if it stops
        // compilation or if the caller-supplied limit has been reached.
        //
        LastOne = (STOP_DECODING(InstructionStream[Decoded]) ||
                   (pIntelInstruction >= pLastIntelInstruction));

        Decoded++;

        if (LastOne) {
            //
            // The terminating instruction is included in the count.
            //
            (*NumberOfInstructions) = Decoded;
            break;
        }

        //
        // Step over the instruction just decoded.
        //
        pIntelInstruction = (PVOID) ((ULONGLONG)pIntelInstruction + Slot->Size);
    }

    //
    // Merge x86 instructions into meta-instructions and clean up special
    // x86 idioms, unless the compiler is running in slow mode.
    //
    if (!(CompilerFlags & COMPFL_SLOW)) {
        OptimizeInstructionStream (InstructionStream, Decoded);
    }

    //
    // Find the basic blocks by filling in the EntryPoint field of each
    // instruction.  This has to follow OptimizeInstructionStream() so
    // that EntryPoints don't fall into the middle of meta-instructions.
    //
    cEntryPoints = LocateEntryPoints(InstructionStream, Decoded);

    //
    // The remaining passes need EntryPoint information.  They are skipped
    // for streams of two or fewer instructions and in slow mode.
    //
    if (Decoded <= 2 || (CompilerFlags & COMPFL_SLOW)) {
        return cEntryPoints;
    }

    if (!CpuDisableNoFlags) {
        OptimizeIntelFlags(InstructionStream, Decoded);
    }

    if (!CpuDisableRegCache) {
        CacheIntelRegs(InstructionStream, Decoded);
    }

    if (!CpuDisableEbpAlign) {
        DetermineEbpAlignment(InstructionStream, Decoded);
    }

    return cEntryPoints;
}