//++
//
// Copyright (c) 1989-2000  Microsoft Corporation
//
// Component Name:
//
//    NT / KE 
//
// Module Name:
//
//    ctxswap.s
//
// Abstract:
//
//    This module implements the IA64 Process and Thread Context Swaps.
//
// Author:
//
//    David N. Cutler (davec) 5-Mar-1989
//
// Environment:
//
//    Kernel mode only
//
// Revision History:
// 
//    Bernard Lint  Jul-12-1995
//
//         Initial IA64 version
//
//--

#include "ksia64.h"

        .file     "ctxswap.s"
        .text

//
// Globals imported:
//

        .global     KiReadySummary
        .global     KiIdleSummary
        .global     KiDispatcherReadyListHead
        .global     KeTickCount
        .global     KiMasterSequence
        .global     KiMasterRid
        .global     PPerfGlobalGroupMask

        PublicFunction(KiDeliverApc)
        PublicFunction(KiSaveExceptionFrame)
        PublicFunction(KiRestoreExceptionFrame)
        PublicFunction(KiActivateWaiterQueue)
        PublicFunction(KiReadyThread)
        PublicFunction(KeFlushEntireTb)
        PublicFunction(KiQuantumEnd)
        PublicFunction(KiSyncNewRegionId)
        PublicFunction(KiCheckForSoftwareInterrupt)
        PublicFunction(KiSaveHigherFPVolatileAtDispatchLevel)
        PublicFunction(KeAcquireQueuedSpinLockAtDpcLevel)
        PublicFunction(KeReleaseQueuedSpinLockFromDpcLevel)
        PublicFunction(KeTryToAcquireQueuedSpinLockRaiseToSynch)
        PublicFunction(WmiTraceContextSwap)

#if DBG
        PublicFunction(KeBugCheckEx)
#endif // DBG


        SBTTL("Unlock Dispatcher Database")
//++
//--------------------------------------------------------------------
//
// VOID
// KiUnlockDispatcherDatabase (
//    IN KIRQL OldIrql
//    )
//
// Routine Description:
//
//    This routine is entered at synchronization level with the dispatcher
//    database locked. Its function is to either unlock the dispatcher
//    database and return or initiate a context switch if another thread
//    has been selected for execution.
//
//    N.B. A context switch CANNOT be initiated if the previous IRQL
//         is greater than or equal to DISPATCH_LEVEL.
//
//    N.B. This routine is carefully written to be a leaf function. If,
//        however, a context swap should be performed, the routine is
//        switched to a nested function.
//
// Arguments:
//
//    OldIrql (a0) - Supplies the IRQL when the dispatcher database
//        lock was acquired (in low order byte, not zero extended).
//
// Return Value:
//
//    None.
//
//--------------------------------------------------------------------
//--

        NESTED_ENTRY(KiUnlockDispatcherDatabase)
        NESTED_SETUP(1,3,1,0)

//
// Overview:
//
//    Loads PRCB->NextThread and zero extends the old IRQL in a0.
//
//    - Next thread != NULL and old IRQL < DISPATCH_LEVEL: branch (not call)
//      to KxUnlockDispatcherDatabase with rPrcb (t8) and v0 still live.
//
//    - Otherwise: release the dispatcher queued spin lock (only when NT_UP
//      is not defined), LOWER_IRQL(a0), and request a dispatch interrupt
//      when a next thread exists and the PRCB DPC-routine-active flag
//      reads as zero.
//

//
// Register aliases
//
// N.B. These aliases are also the ones used by the code in
//      KxUnlockDispatcherDatabase, which does not redefine them.
//

        rDPC      = loc2                // DPC active flag

        rpT1      = t1                  // temp pointer
        rpT2      = t2                  // temp pointer
        rpT3      = t3                  // temp pointer
        rT1       = t5                  // temp regs
        rT2       = t6
        rPrcb     = t8                  // PRCB pointer

        pNotNl    = pt2                 // true if next thread not NULL
        pIRQGE    = pt3                 // true if DISPATCH_LEVEL <= old irql
        pIRQLT    = pt4                 // true if DISPATCH_LEVEL > old irql
        pDPC      = pt5                 // true if DPC active flag is zero
                                        // (i.e. request dispatch interrupt)
        pNoAPC    = pt2                 // do not dispatch APC (shares pt2
                                        // with pNotNl)
        pAPC      = pt9

        PROLOGUE_END

//
// Check if a thread has been scheduled to execute on the current processor
//

        movl      rPrcb = KiPcr + PcPrcb
        ;;

        LDPTR     (rPrcb, rPrcb)                // rPrcb -> PRCB
        ;;
        add       rpT1 = PbNextThread, rPrcb    // -> next thread
        add       rpT2 = PbDpcRoutineActive,rPrcb // -> DPC active flag
        ;;

        LDPTR     (v0, rpT1)                    // v0 = next thread
        ;;
        cmp.ne    pNotNl = zero, v0             // pNotNl = next thread is not NULL
        zxt1      a0 = a0                       // isolate old IRQL
        ;;

//
// The compare below is .unc, so when pNotNl is false both pIRQGE and
// pIRQLT are cleared and execution falls through with rDPC == 1.
//

(pNotNl) cmp.leu.unc pIRQGE, pIRQLT = DISPATCH_LEVEL, a0
        mov       rDPC = 1                      // speculate that DPC is active
(pIRQLT) br.spnt   KxUnlockDispatcherDatabase
        ;;

//
// Case 1:
// Next thread is NULL:
// Release dispatcher database lock, restore IRQL to its previous level
// and return
//

//
// Case 2:
// A new thread has been selected to run on the current processor, but
// the new IRQL is not below dispatch level. Release the dispatcher
// lock and restore IRQL. If the current processor is
// not executing a DPC, then request a dispatch interrupt on the current
// processor.
//
// At this point pNotNl = 1 if thread not NULL, 0 if NULL
//

(pIRQGE) ld4       rDPC = [rpT2]                // rDPC.4 = DPC active flag
#if !defined(NT_UP)
        add         out0 = (LockQueueDispatcherLock * 16) + PbLockQueue, rPrcb
        br.call.sptk brp = KeReleaseQueuedSpinLockFromDpcLevel
#endif // !defined(NT_UP)
        ;;

//
// rDPC is still 1 in case 1 (no load was issued), so pDPC is only set in
// case 2 when the loaded DPC active flag is zero.
//

        LOWER_IRQL(a0)
        cmp4.eq    pDPC = rDPC, zero            // pDPC = request DPC intr
        REQUEST_DISPATCH_INT(pDPC)              // request DPC interrupt

        NESTED_RETURN
        NESTED_EXIT(KiUnlockDispatcherDatabase)

//
// N.B. This routine is carefully written as a nested function.
//    Control only reaches this routine from above.
//
//    rPrcb contains the address of PRCB
//    v0 contains the next thread
//

        NESTED_ENTRY(KxUnlockDispatcherDatabase)
        PROLOGUE_BEGIN

//
// Context switch half of KiUnlockDispatcherDatabase.
//
// On entry (reached by branch from KiUnlockDispatcherDatabase):
//
//    a0    - old IRQL, already zero extended by the zxt1 in the caller
//    rPrcb - PRCB address (t8)
//    v0    - next thread
//
// The register and predicate aliases used here (rpT1, rpT2, rpT3, rT1,
// rT2, rPrcb, pAPC, pNoAPC) are the ones assigned in
// KiUnlockDispatcherDatabase.
//
// Allocate a 1 input / 2 local / 1 output register frame, keep the return
// link in loc0 and ar.unat in loc1, and allocate SwitchFrameLength bytes
// on the memory stack.
//

        .regstk   1, 2, 1, 0
        alloc     t16 = ar.pfs, 1, 2, 1, 0
        .save     rp, loc0
        mov       loc0 = brp
        .fframe   SwitchFrameLength
        add       sp = -SwitchFrameLength, sp
        ;;

//
// t0 and t1 walk the exception frame (located at SwExFrame +
// STACK_SCRATCH_AREA above sp) as two interleaved chains: t0 stores the odd
// numbered preserved registers, t1 the even numbered ones. Every store
// post-increments its pointer to the slot of the next register in its
// chain. The branch registers bs0-bs4 and ar.lc are copied to t10-t15 in
// the free slots so they can be stored once the floating point spills are
// done.
//

        .save     ar.unat, loc1
        mov       loc1 = ar.unat
        add       t0 = ExFltS19+SwExFrame+STACK_SCRATCH_AREA, sp
        add       t1 = ExFltS18+SwExFrame+STACK_SCRATCH_AREA, sp
        ;;

        .save.gf  0x0, 0xC0000
        stf.spill [t0] = fs19, ExFltS17-ExFltS19
        stf.spill [t1] = fs18, ExFltS16-ExFltS18
        ;;

        .save.gf  0x0, 0x30000
        stf.spill [t0] = fs17, ExFltS15-ExFltS17
        stf.spill [t1] = fs16, ExFltS14-ExFltS16
        mov       t10 = bs4
        ;;

        .save.gf  0x0, 0xC000
        stf.spill [t0] = fs15, ExFltS13-ExFltS15
        stf.spill [t1] = fs14, ExFltS12-ExFltS14
        mov       t11 = bs3
        ;;

        .save.gf  0x0, 0x3000
        stf.spill [t0] = fs13, ExFltS11-ExFltS13
        stf.spill [t1] = fs12, ExFltS10-ExFltS12
        mov       t12 = bs2
        ;;

        .save.gf  0x0, 0xC00
        stf.spill [t0] = fs11, ExFltS9-ExFltS11
        stf.spill [t1] = fs10, ExFltS8-ExFltS10
        mov       t13 = bs1
        ;;

        .save.gf  0x0, 0x300
        stf.spill [t0] = fs9, ExFltS7-ExFltS9
        stf.spill [t1] = fs8, ExFltS6-ExFltS8
        mov       t14 = bs0
        ;;

        .save.gf  0x0, 0xC0
        stf.spill [t0] = fs7, ExFltS5-ExFltS7
        stf.spill [t1] = fs6, ExFltS4-ExFltS6
        mov       t15 = ar.lc
        ;;

        .save.gf  0x0, 0x30
        stf.spill [t0] = fs5, ExFltS3-ExFltS5
        stf.spill [t1] = fs4, ExFltS2-ExFltS4
        ;;

        .save.f   0xC
        stf.spill [t0] = fs3, ExFltS1-ExFltS3         // save fs3
        stf.spill [t1] = fs2, ExFltS0-ExFltS2         // save fs2
        ;;

        .save.f   0x3
        stf.spill [t0] = fs1, ExBrS4-ExFltS1          // save fs1
        stf.spill [t1] = fs0, ExBrS3-ExFltS0          // save fs0
        ;;

        .save.b   0x18
        st8       [t0] = t10, ExBrS2-ExBrS4           // save bs4
        st8       [t1] = t11, ExBrS1-ExBrS3           // save bs3
        ;;

        .save.b   0x6
        st8       [t0] = t12, ExBrS0-ExBrS2           // save bs2
        st8       [t1] = t13, ExIntS2-ExBrS1          // save bs1
        ;;

//
// t12 (bs2) has been stored above, so it is reused here to address the
// current thread pointer in the PCR.
//

        .save.b   0x1
        st8       [t0] = t14, ExIntS3-ExBrS0          // save bs0
        movl      t12 = KiPcr + PcCurrentThread
        ;;

        .save.gf  0xC, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s3, ExIntS1-ExIntS3          // save s3
        .mem.offset 8,0
        st8.spill [t1] = s2, ExIntS0-ExIntS2          // save s2
        ;;

        .save.gf  0x3, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s1, ExApLC-ExIntS1           // save s1
        .mem.offset 8,0
        st8.spill [t1] = s0, ExApEC-ExIntS0           // save s0
        ;;

//
// s0-s3 are now saved, so they can be loaded with the values passed to
// SwapContext: s0 = PRCB, s1 = current (old) thread, s2 = next thread.
// The ar.pfs value captured by the alloc above (t16) goes in the ExApEC
// slot; it is reloaded from there in the epilogue.
//

        .savepsp  ar.pfs, ExceptionFrameLength-ExApEC-STACK_SCRATCH_AREA
        st8       [t1] = t16, ExIntNats-ExApEC
        mov       t4 = ar.unat                        // captured Nats of s0-s3
        mov       s0 = rPrcb

        LDPTR     (s1, t12)                           // current thread
        ;;

        .savepsp  ar.lc, ExceptionFrameLength-ExApLC-STACK_SCRATCH_AREA
        st8       [t0] = t15
        .savepsp  @priunat, ExceptionFrameLength-ExIntNats-STACK_SCRATCH_AREA
        st8       [t1] = t4                           // save Nats of s0-s3
        mov       s2 = v0

        PROLOGUE_END

        add       rpT2 = PbNextThread, s0       // -> next thread
        add       out0 = ThWaitIrql, s1         // -> previous IRQL
        ;;

//
// rpT2 is post-incremented from PbNextThread to PbCurrentThread and out0
// from ThWaitIrql to ThIdleSwapBlock of the old thread by the two stores
// below.
//

        STPTRINC  (rpT2, zero,PbCurrentThread-PbNextThread)  // clear NextThread
        st1       [out0] = a0, ThIdleSwapBlock-ThWaitIrql    // save old IRQL
        mov       rpT3 = 1
        ;;

//
// Reready current thread for execution and swap context to the selected
// thread.
//
// Note:  Set IdleSwapBlock in the current thread so no idle processor
// can switch to this processor before it is removed from the current
// processor.

        STPTR     (rpT2, s2)                    // set current thread object
        st1       [out0] = rpT3, -ThIdleSwapBlock// out0 -> previous thread
        br.call.sptk brp = KiReadyThread
        ;;

        br.call.sptk brp = SwapContext
        ;;

//
// Lower IRQL, deallocate exception/switch frame.
//
// N.B. SwapContext releases the dispatcher database lock.
//
// N.B. v0 contains the kernel APC pending state on return.
//
// N.B. s2 contains the address of the new thread on return.
//

        add       rpT2 = ThWaitIrql, s2        // -> ThWaitIrql
        cmp.ne    pAPC, pNoAPC = zero, v0
        ;;

        ld1       a0 = [rpT2]                  // a0 = original wait IRQL
        ;;

//
// pNoAPC is already set when v0 == 0. When an APC is pending it is set
// only if the wait IRQL is nonzero, so the APC is delivered below only
// for v0 != 0 and wait IRQL == 0.
//

(pAPC)  cmp.ne    pNoAPC = zero, a0            // pNoAPC = APC pending but IRQL != 0
(pNoAPC) br.spnt  Kudd_Exit
        ;;

//
// Grow the output area to three registers for the KiDeliverApc arguments,
// raise IRQL to APC_LEVEL and deliver the kernel APC with the second and
// third arguments zero.
//

        .regstk   1, 2, 3, 0
        alloc     t16 = ar.pfs, 1, 2, 3, 0
        mov       rT2 = APC_LEVEL
        ;;

        SET_IRQL(rT2)

        mov       out0 = KernelMode
        mov       out1 = zero
        mov       out2 = zero
        br.call.sptk brp = KiDeliverApc
        ;;

//
// Lower IRQL to wait level, set return status, restore registers, and return.
//

Kudd_Exit:

        LOWER_IRQL(a0)                          // a0 = new irql

        add       out0 = STACK_SCRATCH_AREA+SwExFrame, sp
        br.call.sptk brp = KiRestoreExceptionFrame
        ;;

//
// Reload the ar.pfs value stored in the ExApEC slot by the prologue, then
// restore the return link (loc0) and ar.unat (loc1), pop the switch frame
// and return.
//

        add       rpT1 = ExApEC+SwExFrame+STACK_SCRATCH_AREA, sp
        ;;
        ld8       rT1 = [rpT1]
        mov       brp = loc0
        ;;

        mov       ar.unat = loc1
        nop.f     0
        mov       ar.pfs = rT1

        .restore
        add       sp = SwitchFrameLength, sp
        nop.i     0
        br.ret.sptk brp
        ;;

        NESTED_EXIT(KxUnlockDispatcherDatabase)

        SBTTL("Swap Thread")
//++
//--------------------------------------------------------------------
//
// BOOLEAN
// KiSwapContext (
//    IN PKTHREAD Thread
//    )
//
// Routine Description:
//
//       This routine saves the non-volatile registers, marshals the
//       arguments for SwapContext and calls SwapContext to perform
//       the actual thread switch.
//
// Arguments:
//
//       Thread - Supplies the address of the new thread.
//
// Return Value:
//
//       If a kernel APC is pending, then a value of TRUE is returned.
//       Otherwise, FALSE is returned.
//
// Notes:
//
//       GP valid on entry -- GP is not switched, just use kernel GP
//--------------------------------------------------------------------
//--

        NESTED_ENTRY(KiSwapContext)

//
// Overview:
//
//    Builds a switch frame holding the preserved floating point, branch
//    and integer registers, ar.pfs, ar.lc and the NaT bits of s0-s3, loads
//    s0/s1/s2 with PRCB / old thread / new thread, makes the new thread the
//    PRCB current thread and calls SwapContext. On return the exception
//    frame is restored and v0 is returned, forced to zero when the
//    WaitIrql of the thread in s2 is nonzero.
//

//
// Register aliases
//
// N.B. rpT1/rpT2 are the same registers (t0/t1) that the prologue uses
//      directly as its two exception frame store pointers.
//

        pNoAPC    = pt2                         // do not dispatch APC

        rpT1      = t0                          // temp pointer
        rpT2      = t1                          // temp pointer
        rT1       = t10                         // temp regs

        PROLOGUE_BEGIN

//
// Allocate a 1 input / 2 local / 1 output register frame, keep the return
// link in loc0 and ar.unat in loc1, and allocate SwitchFrameLength bytes
// on the memory stack.
//

        .regstk   1, 2, 1, 0
        alloc     t16 = ar.pfs, 1, 2, 1, 0
        .save     rp, loc0
        mov       loc0 = brp
        .fframe   SwitchFrameLength
        add       sp = -SwitchFrameLength, sp
        ;;

//
// t0 stores the odd numbered preserved registers and t1 the even numbered
// ones; every store post-increments its pointer to the slot of the next
// register in its chain. bs0-bs4 and ar.lc are copied to t10-t15 in the
// free slots and stored after the floating point spills.
//

        .save     ar.unat, loc1
        mov       loc1 = ar.unat
        add       t0 = ExFltS19+SwExFrame+STACK_SCRATCH_AREA, sp
        add       t1 = ExFltS18+SwExFrame+STACK_SCRATCH_AREA, sp
        ;;

        .save.gf  0x0, 0xC0000
        stf.spill [t0] = fs19, ExFltS17-ExFltS19
        stf.spill [t1] = fs18, ExFltS16-ExFltS18
        ;;

        .save.gf  0x0, 0x30000
        stf.spill [t0] = fs17, ExFltS15-ExFltS17
        stf.spill [t1] = fs16, ExFltS14-ExFltS16
        mov       t10 = bs4
        ;;

        .save.gf  0x0, 0xC000
        stf.spill [t0] = fs15, ExFltS13-ExFltS15
        stf.spill [t1] = fs14, ExFltS12-ExFltS14
        mov       t11 = bs3
        ;;

        .save.gf  0x0, 0x3000
        stf.spill [t0] = fs13, ExFltS11-ExFltS13
        stf.spill [t1] = fs12, ExFltS10-ExFltS12
        mov       t12 = bs2
        ;;

        .save.gf  0x0, 0xC00
        stf.spill [t0] = fs11, ExFltS9-ExFltS11
        stf.spill [t1] = fs10, ExFltS8-ExFltS10
        mov       t13 = bs1
        ;;

        .save.gf  0x0, 0x300
        stf.spill [t0] = fs9, ExFltS7-ExFltS9
        stf.spill [t1] = fs8, ExFltS6-ExFltS8
        mov       t14 = bs0
        ;;

        .save.gf  0x0, 0xC0
        stf.spill [t0] = fs7, ExFltS5-ExFltS7
        stf.spill [t1] = fs6, ExFltS4-ExFltS6
        mov       t15 = ar.lc
        ;;

        .save.gf  0x0, 0x30
        stf.spill [t0] = fs5, ExFltS3-ExFltS5
        stf.spill [t1] = fs4, ExFltS2-ExFltS4
        ;;

        .save.f   0xC
        stf.spill [t0] = fs3, ExFltS1-ExFltS3         // save fs3
        stf.spill [t1] = fs2, ExFltS0-ExFltS2         // save fs2
        ;;

        .save.f   0x3
        stf.spill [t0] = fs1, ExBrS4-ExFltS1          // save fs1
        stf.spill [t1] = fs0, ExBrS3-ExFltS0          // save fs0
        ;;

        .save.b   0x18
        st8       [t0] = t10, ExBrS2-ExBrS4           // save bs4
        st8       [t1] = t11, ExBrS1-ExBrS3           // save bs3
        ;;

        .save.b   0x6
        st8       [t0] = t12, ExBrS0-ExBrS2           // save bs2
        st8       [t1] = t13, ExIntS2-ExBrS1          // save bs1
        ;;

        .save.b   0x1
        st8       [t0] = t14, ExIntS3-ExBrS0          // save bs0
        ;;

        .save.gf  0xC, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s3, ExIntS1-ExIntS3          // save s3
        .mem.offset 8,0
        st8.spill [t1] = s2, ExIntS0-ExIntS2          // save s2
        ;;

        .save.gf  0x3, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s1, ExApLC-ExIntS1           // save s1
        .mem.offset 8,0
        st8.spill [t1] = s0, ExApEC-ExIntS0           // save s0
        ;;

//
// The ar.pfs value captured by the alloc above (t16) goes in the ExApEC
// slot; it is reloaded from there (into t22) before returning.
//

        .savepsp  ar.pfs, ExceptionFrameLength-ExApEC-STACK_SCRATCH_AREA
        st8       [t1] = t16, ExIntNats-ExApEC
        mov       t4 = ar.unat                        // captured Nats of s0-s3
        ;;

        .savepsp  ar.lc, ExceptionFrameLength-ExApLC-STACK_SCRATCH_AREA
        st8       [t0] = t15
        .savepsp  @priunat, ExceptionFrameLength-ExIntNats-STACK_SCRATCH_AREA
        st8       [t1] = t4                           // save Nats of s0-s3

        PROLOGUE_END

        //
        //  For the call to SwapContext-
        //
        //          s0                          // Prcb address
        //          s1                          // old thread address
        //          s2                          // new thread address
        //          pt0 = 1
        //

        mov         s2 = a0                     // s2 <- New Thread
        movl        rpT1 = KiPcr + PcPrcb
        ;;

//
// The load of the PRCB pointer post-increments rpT1 so that it addresses
// the PCR current thread field for the following load.
//

        LDPTRINC  (s0, rpT1, PcCurrentThread-PcPrcb)// s0 <- Prcb
        ;;
        LDPTR     (s1, rpT1)                    // s1 <- Old Thread
        add       rpT2 = PbCurrentThread, s0
        ;;

//
// Swap context to the next thread.
//

        STPTR     (rpT2, a0)                    // Set new thread current
        cmp.eq    pt0 = zero, zero              // indicate lock context swap
        br.call.sptk brp = SwapContext          // call SwapContext(prcb, OldTh, NewTh)
        ;;

//
// Deallocate exception/switch frame.
//
// N.B. SwapContext releases the dispatcher database lock.
//
// N.B. v0 contains the kernel APC pending state on return, ie, 0 if
//      no APC pending, 1 if APC pending.   v0 will be forced to 0 if
//      the new IRQL doesn't allow APCs.
//
// N.B. KiRestoreExceptionFrame doesn't touch v0, t21 or t22.
//

        add       rpT2 = ThWaitIrql, s2        // -> ThWaitIrql
        add       rpT1 = ExApEC+SwExFrame+STACK_SCRATCH_AREA, sp
        add       out0 = STACK_SCRATCH_AREA+SwExFrame, sp
        ;;

        ld1       t21 = [rpT2]                 // t21 = original wait IRQL
        ld8       t22 = [rpT1]                 // t22 = PFS
        br.call.sptk brp = KiRestoreExceptionFrame
        ;;

        mov       brp = loc0
        cmp.ne    pNoAPC = zero, t21           // no APC if IRQL != 0
        ;;

//
// Restore ar.unat (loc1) and ar.pfs (t22), pop the switch frame, clear the
// return value when pNoAPC is set, and return.
//

        mov       ar.unat = loc1
        nop.f     0
        mov       ar.pfs = t22

        .restore
        add       sp = SwitchFrameLength, sp
(pNoAPC) mov      v0 = zero
        br.ret.sptk brp
        ;;

        NESTED_EXIT(KiSwapContext)

        SBTTL("Swap Context to Next Thread")
//++
//--------------------------------------------------------------------
// Routine:
//
//       SwapContext
//
// Routine Description:
//
//       This routine is called to swap context from one thread to the next.
//
// Arguments:
//
//       s0 - Address of Processor Control Block (PRCB).
//       s1 - Address of previous thread object.
//       s2 - Address of next thread object.
//
// Return value:
//
//       v0 - Kernel APC pending flag
//       s0 - Address of Processor Control Block (PRCB).
//       s1 - Address of previous thread object.
//       s2 - Address of current thread object.
//
// Note:
//       Kernel GP is not saved and restored across context switch
//
//       !!WARNING!! - Thierry. 03/01/2000.
//       Be aware that this implementation is a result of performance analysis.
//       Please consider this when you are making changes...
//
//--------------------------------------------------------------------
//--

        NESTED_ENTRY(SwapContext)

//
// Register aliases
//

        rT1       = t1                          // temp
        rT2       = t2                          // temp
        rT3       = t3                          // temp
        rNewproc  = t4                          // next process object
        rOldproc  = t5                          // previous process object
        rpThBSL   = t6                          // pointer to new thread backing store limit
        rpT1      = t7                          // temp pointer
        rpT2      = t8                          // temp pointer
        rpT3      = t9                          // temp pointer
        rAr1      = t10
        rAr2      = t11
        rAr3      = t12
        rAr4      = t13

        rNewIKS   = t14                         // new initial kernel stack
        rNewKSL   = t15                         // new kernel stack limit
        rNewBSP   = t16                         // new thread BSP/BSPSTORE
        rOldBSP   = t16                         // old thread BSP
        rOldRNAT  = t17                         // old thread RNAT
        rNewRNAT  = t17                         // new thread RNAT
        rOldSbase = t18                         // old thread kstack base

        pUsTh     = pt4                         // is user thread?
        pKrTh     = pt5                         // is user thread?
        pSave     = pt7                         // is high fp set dirty?
        pDiff     = ps4                         // if new and old process different
        pSame     = ps5                         // if new and old process same

//
// Set new thread's state to running. Note this must be done
// under the dispatcher lock so that KiSetPriorityThread sees
// the correct state.
//

        PROLOGUE_BEGIN


#if !defined(NT_UP)

        alloc     rT2 = ar.pfs, 0, 0, 4, 0
        mov       rT1 = brp                     // move from brp takes 2 cycles
        add       rpT3 = ThState, s2
        ;;

        lfetch.excl  [rpT3]
        mov       rAr1 = Running
        add       rpT2 = SwPFS+STACK_SCRATCH_AREA, sp
        ;;
 
        add         out0 = (LockQueueContextSwapLock * 16) + PbLockQueue, s0
        .savesp   ar.pfs, SwPFS+STACK_SCRATCH_AREA
        st8.nta   [rpT2] = rT2, SwRp-SwPFS     // save pfs
        ;;

        .savesp   brp, SwRp+STACK_SCRATCH_AREA
        st8.nta   [rpT2] = rT1                 // save return link
        st1.nta   [rpT3] = rAr1                // set thread state to Running
        br.call.sptk brp = KeAcquireQueuedSpinLockAtDpcLevel
        ;;

//
// Release DispatcherLock.
//

        add         out0 = (LockQueueDispatcherLock * 16) + PbLockQueue, s0
        br.call.sptk brp = KeReleaseQueuedSpinLockFromDpcLevel
        ;;

        mov       out0 = ar.fpsr                // move from ar.fpsr takes 12 cycles
        movl      rpT1 = KiPcr+PcHighFpOwner    // setup for prefetching         
        ;;
{ .mmi
        lfetch    [rpT1]
        cmp.ne    pUsTh = zero, teb             // test for ia32 save required
                                                // must not have a nop.f for next 10 cycles--
                                                // Using temporarily the explicit templating
                                                // for the next cycles.
        add       out1 = ThStackBase, s1        // move early to start access for rOldSbase
{ .mmi
        add       rpT1 = SwFPSR+STACK_SCRATCH_AREA, sp
        add       rpT2 = SwPreds+STACK_SCRATCH_AREA, sp
        nop.i     0x0
}
        ;;
{ .mmi
        ld8.nta   rOldSbase = [out1]            // speculative start early for ia32 saves
        lfetch.excl [rpT1]
        add       out2 = ThNumber, s2           // setup for prefetching           
}
{ .mmi
        mov.m     ar.rsc = r0                   // put RSE in lazy mode
        mov       rOldBSP = ar.bsp              // move from ar.bsp takes 12 cycles
        nop.i     0x0
}
        ;;
{ .mmi
        lfetch    [out2]     
        nop.m     0x0
        mov       rT1 = pr                      // move from pr takes 2 cycles
}
        ;;

{ .mmi
        flushrs
        mov       rT3 = psr.um                  // move from psr.um takes 12 cycles
        nop.i     0x0
}
        ;;
{ .mmi
        lfetch.excl  [rpT2]
        mov.m     rOldRNAT = ar.rnat            // move from ar.rnat takes 5 cycles
        add       out2 = @gprel(PPerfGlobalGroupMask), gp
}
        ;;
{ .mli
        lfetch    [out2]                              
        movl      out3 = KiPcr + PcInterruptionCount  // INTERRUPTION_LOGGING on or off, we are prefetching this line.
                                                      // If any real performance problem is detected, we will undef these lines.
}
        ;;
{ .mmi
        lfetch    [out3]      
        add       rpT3 = SwRnat+STACK_SCRATCH_AREA, sp
}
        ;;

#else  // NT_UP
        alloc     rT2 = ar.pfs, 0, 0, 4, 0
        cmp.ne    pUsTh = zero, teb             // test for ia32 save required
        ;;
        mov.m     ar.rsc = r0                   // put RSE in lazy mode
        add       out1 = ThStackBase, s1        // move early to start access for rOldSbase
        mov       out0 = ar.fpsr                // move from ar.fpsr takes 12 cycles
                                                // must not have a nop.f for next 10 cycles--
                                                // Using temporarily the explicit templating
                                                // for the next cycles.
        ;;
{ .mmi
        ld8.nta   rOldSbase = [out1]            // speculative start early for ia32 saves
        mov       rOldBSP = ar.bsp              // move from ar.bsp takes 12 cycles
        add       rpT1 = SwRp+STACK_SCRATCH_AREA, sp
}
        ;;
        flushrs
        mov       rT3 = psr.um                  // move from psr.um takes 12 cycles
        add       rpT2 = SwPFS+STACK_SCRATCH_AREA, sp
        ;;

        mov.m     rOldRNAT = ar.rnat            // move from ar.rnat takes 5 cycles
        mov       rT1 = brp                     // move from brp takes 2 cycles
        add       rpT3 = ThState, s2
        ;;

{ .mmi
        mov       rAr1 = Running
        .savesp   brp, SwRp+STACK_SCRATCH_AREA
        st8.nta   [rpT1] = rT1, SwFPSR-SwRp    // save return link
        nop.i     0x0  
}
        ;;


{ .mii
        st1.nta   [rpT3] = rAr1                 // set thread state to Running
        mov       rT1 = pr                      // move from pr takes 2 cycles
        nop.i     0x0  
}
        ;;

{ .mii
        .savesp   ar.pfs, SwPFS+STACK_SCRATCH_AREA
        st8.nta   [rpT2] = rT2, SwPreds-SwPFS   // save pfs
        add       rpT3 = SwRnat+STACK_SCRATCH_AREA, sp           
        nop.i     0x0  
}
        ;;
#endif // NT_UP
{ .mmi
        st8.nta   [rpT3] = rOldRNAT
        nop.m     0x0
        nop.i     0x0  
}
        st8       [rpT1] = out0, SwBsp-SwFPSR   // save kernel FPSR
        st8       [rpT2] = rT1                  // save preserved predicates
        ;;
        st8.nta   [rpT1] = rOldBSP
        add       rpT3 = ThKernelBStore, s1
        tbit.nz   pSave = rT3, PSR_MFH          // check mfh bit
(pUsTh) br.call.spnt brp = SwapContextIA32Save
        ;;
        st8.nta   [rpT3] = rOldBSP
(pSave) add       out0 = -ThreadStateSaveAreaLength+TsHigherFPVolatile, rOldSbase
(pSave) br.call.spnt brp = KiSaveHigherFPVolatileAtDispatchLevel
        ;;

//
// Acquire the context swap lock so the address space of the old process
// cannot be deleted and then release the dispatcher database lock.
//
// N.B. This lock is used to protect the address space until the context
//    switch has sufficiently progressed to the point where the address
//    space is no longer needed. This lock is also acquired by the reaper
//    thread before it finishes thread termination.
//

       PROLOGUE_END

//
// ***** TBD ****** Save performance counters? (user vs. kernel)
//

//
// Accumulate the total time spent in a thread.
//

#if defined(PERF_DATA)
         **** TBD  **** MIPS code

        addu    a0,sp,ExFltF20          // compute address of result
        move    a1,zero                 // set address of optional frequency
        jal     KeQueryPerformanceCounter // query performance counter
        lw      t0,ExFltF20(sp)         // get current cycle count
        lw      t1,ExFltF20 + 4(sp)     //
        lw      t2,PbStartCount(s0)     // get starting cycle count
        lw      t3,PbStartCount + 4(s0) //
        sw      t0,PbStartCount(s0)     // set starting cycle count
        sw      t1,PbStartCount + 4(s0) //
        lw      t4,EtPerformanceCountLow(s1) // get accumulated cycle count
        lw      t5,EtPerformanceCountHigh(s1) //
        subu    t6,t0,t2                // subtract low parts
        subu    t7,t1,t3                // subtract high parts
        sltu    v0,t0,t2                // generate borrow from high part
        subu    t7,t7,v0                // subtract borrow
        addu    t6,t6,t4                // add low parts
        addu    t7,t7,t5                // add high parts
        sltu    v0,t6,t4                // generate carry into high part
        addu    t7,t7,v0                // add carry
        sw      t6,EtPerformanceCountLow(s1)  // set accumulated cycle count
        sw      t7,EtPerformanceCountHigh(s1) //

#endif // defined(PERF_DATA)

//
// The following entry point is used to switch from the idle thread to
// another thread.
//

        ;;
        ALTERNATE_ENTRY(SwapFromIdle)

        alloc     rT1 = ar.pfs, 2, 0, 2, 0

//
// Check if we are tracing context swaps
//

        mov       out0 = s1     // assign out0 to old ethread pointer
        add       rpT3 = @gprel(PPerfGlobalGroupMask), gp 
        ;;

        ld8.nta   rpT3 = [rpT3] // get value of PperfGlobalGroupMask
        mov       out1 = s2     // assign out1 to new ethread pointer
        ;;

        add       rpT2 = PERF_CONTEXTSWAP_OFFSET, rpT3
        cmp.ne    pt3 = zero, rpT3  // if it's non-zero, then trace on
        ;;

(pt3)   ld4.nta   rpT2 = [rpT2]
        ;;

(pt3)   and       rpT2 = PERF_CONTEXTSWAP_FLAG, rpT2
        ;;

(pt3)   cmp.ne.unc pt4  = zero, rpT2
(pt4)   br.call.spnt brp = WmiTraceContextSwap // optimize for no tracing case
        ;;

//
// Get address of old and new process objects.
//

        add       rpT2 = ThApcState+AsProcess,s2 // -> new thread AsProcess
        add       rpT1 = ThApcState+AsProcess,s1 // -> old thread AsProcess
        ;;

        LDPTR     (rOldproc, rpT1)               // old process
        LDPTR     (rNewproc, rpT2)               // new process

#if !defined(NT_UP)

//
// On an MP system, if a thread address has been recycled and the thread
// has migrated to a processor that still holds stale values for it in the
// high fp register set, set KiPcr->HighFpOwner to zero (i.e. when pt4 is
// set to TRUE).
//

        add       rpT1 = ThNumber, s2
        movl      rpT2 = KiPcr+PcHighFpOwner
        ;;

        ld1       rT1 = [rpT1]
        ld8       rT2 = [rpT2], PcNumber-PcHighFpOwner
        add       out0 = ThIdleSwapBlock, s1
        ;;

        ld1       rT3 = [rpT2], PcHighFpOwner-PcNumber
        st1       [out0] = zero                 // clear OldThread->IdleSwapBlock
        cmp.eq    pt3 = rT2, s2
        ;;

 (pt3)  cmp.ne.unc pt4 = rT1, rT3
        ;;
 (pt4)  st8       [rpT2] = zero

#endif // !defined(NT_UP)
        ;;

        flushrs
        FAST_DISABLE_INTERRUPTS
        ;;

//
// Thierry - 03/29/2000
// It should be noticed that the performance analysis for SwapContext
// was done with INTERRUPTION_LOGGING defined as 1.
//

#define INTERRUPTION_LOGGING 1
#if defined(INTERRUPTION_LOGGING)

// For Conditional Interrupt Logging
#define ContextSwitchBit 63

         .global     KiVectorLogMask

         mov       rT3 = gp
         ;;
         movl      gp = _gp
         ;;
         add       rpT1 = @gprel(KiVectorLogMask), gp
         ;;
         ld8       rT1 = [rpT1]
         mov       gp = rT3
         ;;
         tbit.z    pt4 = rT1, ContextSwitchBit
 (pt4)   br.cond.sptk   EndOfLogging0



        movl      rpT1 = KiPcr+PcInterruptionCount
        mov       rT3 = MAX_NUMBER_OF_IHISTORY_RECORDS - 1
        cmp.ne    pDiff,pSame=rOldproc,rNewproc
        ;;
(pDiff) mov       rT1 = 0x91                    // process switch
        ld4.nt1   rT2 = [rpT1]                  // get current count
        ;;

(pSame) mov       rT1 = 0x90                    // thread switch
        add       rpT3 = 1, rT2                 // incr count
        and       rT2 = rT3, rT2                // index of current entry
        add       rpT2 = 0x1000-PcInterruptionCount, rpT1 // base of history
        ;;

        st4.nta   [rpT1] = rpT3                 // save count
        shl       rT2 = rT2, 5                  // offset of current entry
        ;;
        add       rpT2 = rpT2, rT2              // address of current entry
        ;;
        st8       [rpT2] = rT1, 8               // save switch type
        ;;
        st8       [rpT2] = s2, 8                // save new thread pointer
        ;;
        st8       [rpT2] = s1, 8                // save old thread
        ;;
        st8       [rpT2] = sp                   // save old sp
        ;;

// For Conditional Interrupt Logging
EndOfLogging0:

#endif // INTERRUPTION_LOGGING

        mov       ar.rsc = r0                   // put RSE in lazy mode
        add       rpT1 = ThInitialStack, s2
        add       rpT2 = ThKernelStack, s1
        ;;

//
// Store the kernel stack pointer in the previous thread object,
// load the new kernel stack pointer from the new thread object,
// switch backing store pointers, select new process id and swap 
// to the new process.
//

        ld8.nta   rNewIKS = [rpT1], ThKernelStack-ThInitialStack
        st8.nta   [rpT2] = sp                             // save current sp
        ;;

        ld8.nta   sp = [rpT1], ThStackLimit-ThKernelStack
        movl      rpT2 = KiPcr + PcInitialStack
        ;;

        alloc     rT1 = 0,0,0,0              // make current frame 0 size
        ld8.nta   rNewKSL = [rpT1], ThInitialBStore-ThStackLimit
        ;;

        loadrs                               // invalidate RSE and ALAT
        ld8.nta   rT1 = [rpT1], ThBStoreLimit-ThInitialBStore
        ;;

        ld8.nta   rT2 = [rpT1], ThDebugActive-ThBStoreLimit
        st8       [rpT2] = rNewIKS, PcStackLimit-PcInitialStack
        ;;
                                             // get debugger active state
        ld1.nta   rT3 = [rpT1], ThTeb-ThDebugActive
        st8       [rpT2] = rNewKSL, PcInitialBStore-PcStackLimit
        add       rpT3 = SwBsp+STACK_SCRATCH_AREA, sp
        ;;

        ld8       rNewBSP = [rpT3], SwRnat-SwBsp
        st8       [rpT2] = rT1, PcBStoreLimit-PcInitialBStore
        ;;

        ld8       rNewRNAT = [rpT3]
        st8       [rpT2] = rT2, PcDebugActive-PcBStoreLimit
        ;;
                                             // load new teb
        ld8       teb = [rpT1], ThApcState+AsKernelApcPending-ThTeb
                                             // set new debugger active state
        st1       [rpT2] = rT3, PcCurrentThread-PcDebugActive
        invala

//
// Setup PCR initial kernel BSP and BSTORE limit
//

        mov       ar.bspstore = rNewBSP      // load new bspstore
        cmp.ne    pDiff,pSame=rOldproc,rNewproc // if ne, switch process
        ;;
        mov       ar.rnat = rNewRNAT         // load new RNATs
        ;;
        mov       ar.rsc = RSC_KERNEL        // enable RSE
        ;;

//
// If the new process is not the same as the old process, then swap the
// address space to the new process.
//
// N.B. The context swap lock cannot be dropped until all references to the
//      old process address space are complete. This includes any possible
//      TB Misses that could occur referencing the new address space while
//      still executing in the old address space.
//
// N.B. The process address space swap is executed with interrupts disabled.
//

        alloc     rT1 = 0,4,2,0
        STPTR     (rpT2, s2)
        ;;

        mov       kteb = teb                    // update kernel TEB
        FAST_ENABLE_INTERRUPTS
        ld1       loc0 = [rpT1]                 // load the ApcPending flag

#if !defined(NT_UP)

//
// Release the context swap lock
// N.B. ContextSwapLock is always released in KxSwapProcess, if called
//

        add         out0 = (LockQueueContextSwapLock * 16) + PbLockQueue, s0
        add         loc1 = PcApcInterrupt-PcCurrentThread, rpT2
(pSame) br.call.sptk brp = KeReleaseQueuedSpinLockFromDpcLevel
        ;;

#else // !defined(NT_UP)

        add         loc1 = PcApcInterrupt-PcCurrentThread, rpT2
        ;;

#endif // !defined(NT_UP)

        mov       out0 = rNewproc               // set address of new process
        mov       out1 = rOldproc               // set address of old process
(pDiff) br.call.sptk brp = KxSwapProcess        // call swap address space(NewProc, OldProc)
        ;;
//
// In new address space, if changed.
//

        st1       [loc1] = loc0                 // request (or clear) APC pend.
        add       rpT1 = PbContextSwitches, s0
        add       rpT2 = ThContextSwitches, s2
        ;;

//
// If the new thread has a kernel mode APC pending, then request an APC
// interrupt.
//

        ld4       loc1 = [rpT1]
        ld4       loc2 = [rpT2]
        ;;

//
// Increment context switch counters
//

        cmp.ne    pUsTh, pKrTh = zero, teb
        add       loc1 = loc1, zero, 1
        add       loc2 = loc2, zero, 1
        ;;

        st4       [rpT1] = loc1             // increment # of context switches
        st4       [rpT2] = loc2             // increment # of context switches

        add       rpT1 = SwFPSR+STACK_SCRATCH_AREA, sp
        add       rpT2 = SwPFS+STACK_SCRATCH_AREA, sp
        ;;

        ld8       loc1 = [rpT1], SwRp-SwFPSR // restore brp and pfs
        ld8       loc2 = [rpT2], SwPreds-SwPFS
        ;;

        ld8       rT3 = [rpT1]
        ld8       rT2 = [rpT2]

        mov       v0 = loc0                     // set v0 = apc pending
(pUsTh) br.call.spnt brp = SwapContextIA32Restore
        ;;

//
// Note: at this point s0 = Prcb, s1 = previous thread, s2 = current thread
//

        mov       ar.fpsr = loc1
        mov       ar.pfs = loc2
        mov       brp = rT3

        mov       pr = rT2                      // Restore preserved preds

#if 0

//
// Thierry 03/22/2000: 
//
//      The following memory synchronization of the local processor
//      I-cache and D-cache because of I-stream modifications is not
//      required if the modifying code is written following the NT 
//      Core Team specifications:
//         - [Allocate VA]
//         - Modify the code
//         - Call FlushInstructionCache()
//                     -> calls KiSweepIcache[Range]()
//         - Execute the code.
//
//      The removal of this instruction eliminates a "> 100 cycle" stall.
//

        sync.i

#endif // 0
        ;; 
        srlz.i

        br.ret.sptk brp

        NESTED_EXIT(SwapContext)

//++
//--------------------------------------------------------------------
// Routine:
//
//       SwapContextIA32Save
//
// Routine Description:
//
//      This function saves the IA32 context on the kernel stack.
//      Called from SwapContext.
//
//      The IA-32 application registers ar21, ar24, ar25, ar26, ar28,
//      ar29 and ar30 are read and stored into the TsAr21, TsAr24,
//      TsAr25, TsAr26, TsAr28, TsAr29 and TsAr30 slots of the
//      TsAppRegisters area, which is addressed at offset
//      -ThreadStateSaveAreaLength from rOldSbase. ar27 is deliberately
//      not saved (see the comment in the body).
//
// Arguments:
//
//      rOldSbase : old thread kstack base.
//
// Return value:
//
//      None.
//
// Note:
//
//      Runs in the register context of SwapContext: rOldSbase,
//      rAr1-rAr4, rT1-rT3, rpT1 and rpT2 are register aliases that are
//      not defined inside this routine. rAr1-rAr4, rT1-rT3, rpT1 and
//      rpT2 are overwritten.
//
//--------------------------------------------------------------------
//--
        LEAF_ENTRY(SwapContextIA32Save)

//
// Step 1: read the IA-32 application registers into scratch registers.
// Each read is in its own instruction group.
//

        mov       rAr1 = ar21             // IA32 FP control register FCR
        ;;
        mov       rAr2 = ar24             // IA32 EFLAG register
        ;;
        mov       rAr3 = ar25             // IA32 code segment descriptor (CSD)
        ;;
        mov       rAr4 = ar26             // IA32 stack segment descriptor (SSD)
        ;;
        //
        // we may skip saving ar27 because it cannot be modified by user code
        //
        mov       rT1  = ar30             // IA32 FP data register (FDR)
        ;;
        mov       rT2  = ar28             // IA32 FP status register (FSR)
        ;;
        mov       rT3  = ar29             // IA32 FP instruction register (FIR)
        ;;

//
// Step 2: store the values. Two pointers walk the save area:
//      rpT1 : TsAr21 -> TsAr25 -> TsAr29
//      rpT2 : TsAr24 -> TsAr26 -> TsAr28 -> TsAr30
//

        // these are separated out due to cache miss potential
        add       rpT1 = -ThreadStateSaveAreaLength+TsAppRegisters+TsAr21, rOldSbase  
        add       rpT2 = -ThreadStateSaveAreaLength+TsAppRegisters+TsAr24, rOldSbase
        ;;
        st8       [rpT1] = rAr1, TsAr25-TsAr21          // save ar21, rpT1 -> TsAr25
        st8       [rpT2] = rAr2, TsAr26-TsAr24          // save ar24, rpT2 -> TsAr26
        ;;
        st8       [rpT1] = rAr3, TsAr29-TsAr25          // save ar25, rpT1 -> TsAr29
        st8       [rpT2] = rAr4, TsAr28-TsAr26          // save ar26, rpT2 -> TsAr28
        ;;
        st8       [rpT2] = rT2, TsAr30-TsAr28           // save ar28, rpT2 -> TsAr30
        ;;
        st8       [rpT2] = rT1                          // save ar30
        st8       [rpT1] = rT3                          // save ar29

        br.ret.sptk.few.clr brp
        LEAF_EXIT(SwapContextIA32Save)


//++
//--------------------------------------------------------------------
// Routine:
//
//      SwapContextIA32Restore
//
// Routine Description:
//
//      This function restores the IA32 registers context.
//      Called from SwapContext.
//
//      The stack base is read from the ThStackBase field of the thread
//      object in s2. The IA-32 application registers ar21 and ar24-ar30
//      are then reloaded from the TsAr21 and TsAr24-TsAr30 slots of the
//      TsAppRegisters area, which is addressed at offset
//      -ThreadStateSaveAreaLength from that stack base.
//
//      N.B. ar27 is reloaded from the TsAr27 slot here, although
//           SwapContextIA32Save does not write that slot.
//
// Arguments:
//
//      s2 - Address of next thread object.
//
// Return value:
//
//      None.
//
// Note:
//
//      Runs in the register context of SwapContext: rAr1-rAr4 and
//      rpT1-rpT3 are register aliases that are not defined inside this
//      routine; all of them are overwritten.
//
//--------------------------------------------------------------------
//--
        LEAF_ENTRY(SwapContextIA32Restore)

        add       rpT1 = ThStackBase, s2        // -> NewThread->StackBase
        ;;
        ld8.nta   rpT1 = [rpT1]                 // rpT1 = stack base of new thread
        ;;

//
// Two pointers walk the save area:
//      rpT2 : TsAr21 -> TsAr25 -> TsAr27 -> TsAr29
//      rpT3 : TsAr24 -> TsAr26 -> TsAr28 -> TsAr30
//

        add       rpT2 = -ThreadStateSaveAreaLength+TsAppRegisters+TsAr21, rpT1
        add       rpT3 = -ThreadStateSaveAreaLength+TsAppRegisters+TsAr24, rpT1
        ;;

        ld8.nta   rAr1 = [rpT2], TsAr25-TsAr21  // saved ar21, rpT2 -> TsAr25
        ld8.nta   rAr2 = [rpT3], TsAr26-TsAr24  // saved ar24, rpT3 -> TsAr26
        ;;

        ld8.nta   rAr3 = [rpT2], TsAr27-TsAr25  // saved ar25, rpT2 -> TsAr27
        ld8.nta   rAr4 = [rpT3], TsAr28-TsAr26  // saved ar26, rpT3 -> TsAr28
        ;;

        mov       ar21 = rAr1                   // restore ar21
        mov       ar24 = rAr2                   // restore ar24

        mov       ar25 = rAr3                   // restore ar25
        mov       ar26 = rAr4                   // restore ar26

//
// rAr1-rAr4 are reused for the second batch. There is no stop between
// the movs above (which read rAr1/rAr2) and the loads below (which
// overwrite them).
//

        ld8.nta   rAr1 = [rpT2], TsAr29-TsAr27  // TsAr27 slot, rpT2 -> TsAr29
        ld8.nta   rAr2 = [rpT3], TsAr30-TsAr28  // saved ar28, rpT3 -> TsAr30
        ;;

        ld8.nta   rAr3 = [rpT2]                 // saved ar29
        ld8.nta   rAr4 = [rpT3]                 // saved ar30
        ;;
        mov       ar27 = rAr1                   // restore ar27
        mov       ar28 = rAr2                   // restore ar28

        mov       ar29 = rAr3                   // restore ar29
        mov       ar30 = rAr4                   // restore ar30

        br.ret.sptk.few.clr brp
        LEAF_EXIT(SwapContextIA32Restore)


        SBTTL("Swap Process")
//++
//--------------------------------------------------------------------
//
// VOID
// KiSwapProcess (
//    IN PKPROCESS NewProcess,
//    IN PKPROCESS OldProcess
//    )
//
// Routine Description:
//
//    This function swaps the address space from one process to another by
//    assigning a new region id, if necessary, and loading the fixed entry
//    in the TB that maps the process page directory page. This routine follows
//    the PowerPC design for handling RID wrap.
//
//    There are two entry points:
//
//      KiSwapProcess - on multiprocessor builds (NT_UP not defined) acquires
//          the context swap queued spin lock itself, then joins the common
//          code at Ksp_Continue.
//
//      KxSwapProcess - alternate entry used by SwapContext, which already
//          holds the context swap lock.
//
//    On multiprocessor builds both paths release the context swap lock
//    before returning.
//
//    The common code performs, in order:
//      1. (MP only) update of the active processor sets of both processes,
//      2. a call to KiSyncNewRegionId for the new process,
//      3. replacement of the two data translation registers
//         (DTR_UTBASE_INDEX and DTR_STBASE_INDEX) with the new process's
//         PrDirectoryTableBase and PrSessionParentBase values,
//      4. (DBG only) recording of the inserted values in the PRCB,
//      5. optional reprogramming of msr[675] (branch history) when
//         KiVectorLogMask is non-zero.
//
// On entry/exit:
//
//    Interrupt enabled.
//
// Arguments:
//
//    NewProcess (a0) - Supplies a pointer to a control object of type process
//      which represents the new process that is switched to (32-bit address).
//
//    OldProcess (a1) - Supplies a pointer to a control object of type process
//      which represents the old process that is switched from (32-bit address).
//
// Return Value:
//
//    None.
//
//--------------------------------------------------------------------
//--
        NESTED_ENTRY(KiSwapProcess)
        NESTED_SETUP(2,3,3,0)

        PROLOGUE_END

//
// Register aliases
//
// N.B. rpCSLock, rMasterSeq, rNewSeq, rOldPsrL and rSessionInfo are defined
//      here but are not referenced anywhere in this routine.
//

         rNewProc  = a0
         rOldProc  = a1

         rpCSLock  = loc2

         rpT1      = t0
         rpT2      = t1
         rProcSet  = t2
         rNewActive= t3
         rOldActive= t4
         rMasterSeq= t5
         rNewSeq   = t6
         rOldPsrL  = t7
         rVa       = t8
         rPDE0     = t9                          // PDE for page directory page 0
         rVa2      = t10
         rSessionBase = t11
         rSessionInfo = t12
         rT1       = t13
         rT2       = t14

//
// KiSwapProcess must get the context swap lock
// KxSwapProcess is called from SwapContext with the lock held
//
// The lock queue entry address is computed from the PRCB pointer that is
// loaded from KiPcr+PcPrcb. After the lock is acquired the code branches
// over the KxSwapProcess prologue to Ksp_Continue.
//
// NOTE(review): when NT_UP is defined the branch below is compiled out, so
// KiSwapProcess falls through ALTERNATE_ENTRY(KxSwapProcess) and executes
// NESTED_SETUP a second time. No call intervenes between the two, which
// suggests this is benign, but NESTED_SETUP is defined elsewhere -- confirm.
//

#if !defined(NT_UP)
        movl        rpT1 = KiPcr+PcPrcb
        ;;
        ld8         rpT1 = [rpT1]               // rpT1 -> PRCB
        ;;
        add         out0 = (LockQueueContextSwapLock * 16) + PbLockQueue, rpT1
        br.call.sptk brp = KeAcquireQueuedSpinLockAtDpcLevel
        ;;
        br.sptk     Ksp_Continue                // skip the alternate prologue
#endif // !defined(NT_UP)
        ;;

        ALTERNATE_ENTRY(KxSwapProcess)
        NESTED_SETUP(2,3,3,0)

        PROLOGUE_END
//
// Clear the processor set member number in the old process and set the
// processor member number in the new process.
//
// The two sets are updated with plain load/modify/store sequences.
//
// NOTE(review): the old set is updated with xor, which toggles the member
// bit. This clears the bit only if it is currently set in the old process's
// active processor set -- presumably always true here; confirm.
//

Ksp_Continue:

#if !defined(NT_UP)

        add       rpT2 = PrActiveProcessors, rOldProc     // -> old active processor set
        movl      rpT1 = KiPcr + PcSetMember              // -> processor set member
        ;;

        ld4       rProcSet= [rpT1]                        // rProcSet.4 =  processor set member
        add       rpT1 = PrActiveProcessors, rNewProc     // -> new active processor set
        ;;

        ld4       rNewActive = [rpT1]                     // rNewActive.4 = new active processor set
        ld4       rOldActive = [rpT2]                     // rOldActive.4 = old active processor set
        ;;

        or        rNewActive = rNewActive,rProcSet        // set processor member in new set
        xor       rOldActive = rOldActive,rProcSet        // clear processor member in old set
        ;;

        st4       [rpT1] = rNewActive           // set new active processor set
        st4       [rpT2] = rOldActive           // set old active processor set

#endif // !defined(NT_UP)

//
// If the process sequence number matches the system sequence number, then
// use the process RID. Otherwise, allocate a new process RID.
//
// N.B. KiMasterRid, KiMasterSequence are changed only when holding the
//      KiContextSwapLock.
//
// The comparison itself is not done in this routine; the work is handed to
// KiSyncNewRegionId with:
//      out0 = address of the PrProcessRegion field of the new process
//      out1 = contents of the PrSessionMapInfo field of the new process
//

        add       rT2 = PrSessionMapInfo, rNewProc
        add       out0 = PrProcessRegion, rNewProc
        ;;
        ld8       out1 = [rT2]
        br.call.sptk brp = KiSyncNewRegionId
        ;;

//
// Switch address space to new process
// v0 = rRid = new process rid
//
// N.B. v0 is not read by the code that follows.
//

        fwb                                     // hint to flush write buffers

        FAST_DISABLE_INTERRUPTS     

//
// Fetch everything that is needed while PSR.ic is clear:
//      rPDE0        = NewProcess->DirectoryTableBase
//      rSessionBase = NewProcess->SessionParentBase
//      rVa          = contents of KiPcr+PcPdeUtbase (used as cr.ifa below)
//      rVa2         = contents of KiPcr+PcPdeStbase (used as cr.ifa below)
//

        add       rpT1 = PrDirectoryTableBase, rNewProc
        movl      rVa = KiPcr+PcPdeUtbase
        add       rpT2 = PrSessionParentBase, rNewProc
        movl      rVa2 = KiPcr+PcPdeStbase
        ;;

        ld8.nta   rPDE0 = [rpT1]                // rPDE0 = Page directory page 0
        ld8.nta   rSessionBase = [rpT2]
        ld8.nta   rVa = [rVa]
        ld8.nta   rVa2 = [rVa2]
        ;;

//
// To access the IFA and ITIR control registers, PSR.ic bit must be 0.
// Otherwise, it causes an illegal operation fault. While PSR.ic=0, any
// interruption can not be afforded. Make sure there will be no
// TLB miss and no interrupt coming in during this period.
//

        rsm       1 << PSR_IC                   // PSR.ic=0
        ;;

        srlz.d                                  // must serialize
        mov       rT1 = PAGE_SHIFT << IDTR_PS   // load page size field for IDTR
        ;;

        mov       cr.itir = rT1                 // set up IDTR for dirbase
        ptr.d     rVa, rT1                      // remove DTR for user space
        ;;
        mov       cr.ifa = rVa                  // set up IFA for dirbase vaddr
        mov       rT2   = DTR_UTBASE_INDEX
        ;;

        itr.d     dtr[rT2] = rPDE0              // insert PDE0 to DTR
        ;;

        ptr.d     rVa2, rT1                      // remove DTR for session
        ;;                                      // to avoid an overlapping error
        mov       cr.ifa = rVa2
        mov       rT2 = DTR_STBASE_INDEX
        ;;

        itr.d     dtr[rT2] = rSessionBase       // insert the root for session space
        ;;

        ssm       1 << PSR_IC                   // PSR.ic=1
        ;;
        srlz.i                                  // must I serialize

#if DBG

//
// Checked builds only: record the two values just inserted in the PRCB, at
// PbProcessorState+KpsSpecialRegisters+KsTrD0 + 8 * (translation register
// index). t0, t1 and t3 are used directly; the rpT1/rpT2/rNewActive aliases
// for those registers are dead at this point.
//

        mov     t0 = PbProcessorState+KpsSpecialRegisters+KsTrD0+(8*DTR_UTBASE_INDEX)
        movl    t3 = KiPcr + PcPrcb
        ;;

        ld8     t3 = [t3]                       // t3 -> PRCB
        mov     t1 = PbProcessorState+KpsSpecialRegisters+KsTrD0+(8*DTR_STBASE_INDEX)
        ;;

        add     t0 = t3, t0
        add     t1 = t3, t1
        ;;

        st8     [t0] = rPDE0
        st8     [t1] = rSessionBase
        ;;

#endif

        FAST_ENABLE_INTERRUPTS     

        //
        // Now make sure branch history is enabled for non wow processes
        // and disabled for wow processes
        //
        // This is skipped entirely when KiVectorLogMask is zero, and also
        // when bits 24..31 of cpuid[3] are not equal to 7.
        //

        add       t1 = @gprel(KiVectorLogMask), gp
        ;;
        ld8       t1 = [t1]                     // t1 = KiVectorLogMask
        ;;
        cmp.eq    pt0 = t1, r0
(pt0)   br.cond.sptk   SkipBranchHistory        // logging mask is zero

        mov     t1 = 3
        ;;
        mov     t2 = cpuid[t1]                  // t2 = cpuid[3]
        add     t3 = PrWow64Process, rNewProc
        ;;
        extr.u  t2 = t2, 24, 8                  // t2 = bits 24..31 of cpuid[3]
        ld4     t4 = [t3];                      // t4 = NewProcess->Wow64Process
        ;;
        cmp.ne  pt1 = 7, t2
        ;;
        mov     t1 = 675                        // index of the msr updated below
(pt1)   br.dpnt     SkipBranchHistory           // extracted field is not 7
        ;;
        mov     t2 = msr[t1]                    // read current msr[675]
        cmp.eq  pt1,pt2 = zero, t4      // Wow64 is non-zero
        ;;
(pt1)   mov t3 = 2                      // Enable the HB for ia64 procs
(pt2)   mov t3 = 256                    // Disable the HB for wow64 procs
        ;;
        dep     t2 = t3, t2, 0, 9      // merge selected value into bits 0..8
        ;;
        mov     msr[t1] = t2;                   // write msr[675] back
        ;;

SkipBranchHistory:

#if !defined(NT_UP)
//
// Can now release the context swap lock
//
// This is done on both the KiSwapProcess and the KxSwapProcess path.
//

        movl        rpT1 = KiPcr+PcPrcb
        ;;
        ld8         rpT1 = [rpT1]               // rpT1 -> PRCB
        ;;
        add         out0 = (LockQueueContextSwapLock * 16) + PbLockQueue, rpT1
        br.call.sptk brp = KeReleaseQueuedSpinLockFromDpcLevel
        ;;

#endif // !defined(NT_UP)

        NESTED_RETURN
        NESTED_EXIT(KiSwapProcess)

        SBTTL("Retire Deferred Procedure Call List")
//++
// Routine:
//
//    VOID
//    KiRetireDpcList (
//      PKPRCB Prcb
//      )
//
// Routine Description:
//
//    This routine is called to retire the specified deferred procedure
//    call list. DPC routines are called using the idle thread (current)
//    stack.
//
//    Each pass removes the first entry of the PRCB DPC list under the DPC
//    lock (multiprocessor builds), releases the lock, enables interrupts,
//    calls the DPC routine and disables interrupts again. When the queue
//    depth is read as zero, PbDpcRoutineActive and PbDpcInterruptRequested
//    are cleared and the depth is checked once more before returning.
//
//    N.B. Interrupts must be disabled on entry to this routine. Control is returned
//         to the caller with the same conditions true.
//
//    N.B. gp is loaded with the gp of each DPC routine and is not reloaded
//         in the body of this routine.
//
// Arguments:
//
//    a0 - Address of the current PRCB.
//
// Return value:
//
//    None.
//
//--

        NESTED_ENTRY(KiRetireDpcList)
        NESTED_SETUP(1,2,4,0)

        PROLOGUE_END


//
// Register usage in the loop:
//      t0 -> Prcb->DpcQueueDepth       t1 -> Prcb->DpcRoutineActive
//      t2 -> Prcb->DpcLock             t3 -> Prcb->DpcListHead.Flink
//      t4 =  queue depth last read
//

Krdl_Restart:

        add       t0 = PbDpcQueueDepth, a0
        add       t1 = PbDpcRoutineActive, a0
        add       t2 = PbDpcLock, a0
        ;;

        ld4       t4 = [t0]                     // t4 = current queue depth
        add       t3 = PbDpcListHead+LsFlink, a0
        ;;

Krdl_Restart2:

//
// The queue depth is stored as the DpcRoutineActive value, so the field is
// non-zero exactly when there is something to process.
//

        cmp4.eq   pt1 = zero, t4
        st4       [t1] = t4
 (pt1)  br.spnt   Krdl_Exit                     // nothing queued
        ;;

#if !defined(NT_UP)
        ACQUIRE_SPINLOCK(t2, a0, Krdl_20)
#endif  // !defined(NT_UP)

//
// Read the depth again and fetch the first list entry. If the depth is now
// zero, go release the lock; otherwise compute the DPC object address from
// the list entry address (out0 = entry - DpDpcListEntry).
//

        ld4       t4 = [t0]
        LDPTR     (t5, t3)             // -> first DPC entry
        ;;
        cmp4.eq   pt1, pt2 = zero, t4
        ;;

 (pt2)  add       t10 = LsFlink, t5             // -> Flink of first entry
 (pt2)  add       out0 = -DpDpcListEntry, t5    // out0 -> DPC object
 (pt1)  br.spnt   Krdl_Unlock
        ;;

        LDPTR     (t6, t10)                     // t6 -> entry after the first
        add       t11 = DpDeferredRoutine, out0
        add       t12 = DpSystemArgument1, out0
        ;;

//
// Setup call to DPC routine
//
// arguments are:
//      dpc object address (out0)
//      deferred context   (out1)
//      system argument 1  (out2)
//      system argument 2  (out3)
//
// N.B. the arguments must be loaded from the DPC object BEFORE
//      the inserted flag is cleared to prevent the object being
//      overwritten before its time.
//

        ld8.nt1   t13 = [t11], DpDeferredContext-DpDeferredRoutine
        ld8.nt1   out2 = [t12], DpSystemArgument2-DpSystemArgument1
        ;;

        ld8.nt1   out1 = [t11], DpLock-DpDeferredContext  // t11 -> Dpc->Lock
        ld8.nt1   out3 = [t12]
        add       t4 = -1, t4                   // new queue depth

//
// Unlink the first entry: the list head Flink receives t6, and the Blink
// of the entry at t6 receives t3 (which STPTRINC presumably leaves pointing
// at the list head itself -- the macro is defined elsewhere).
//
// t13 holds the DeferredRoutine value, which is dereferenced as a two-word
// function descriptor: the first word becomes the branch target and the
// second becomes gp.
//

        STPTRINC  (t3, t6, -LsFlink)
        ld8.nt1   t14 = [t13], 8                // t14 = routine entry point
        add       t15 = LsBlink, t6
        ;;

        ld8.nt1   gp = [t13]                    // gp of the DPC routine
        STPTR     (t15, t3)

        STPTR     (t11, zero)                   // clear the DpLock field of the DPC
        st4       [t0] = t4                     // store decremented queue depth

#if !defined(NT_UP)
        RELEASE_SPINLOCK(t2)             // set spin lock not owned
#endif //!defined(NT_UP)

        FAST_ENABLE_INTERRUPTS
        mov       bt0 = t14
        br.call.sptk.few.clr brp = bt0          // call DPC routine
        ;;

//
// Check to determine if any more DPCs are available to process.
//
// All temporaries are recomputed at Krdl_Restart.
//

        FAST_DISABLE_INTERRUPTS
        br        Krdl_Restart
        ;;

//
// The DPC list became empty while we were acquiring the DPC queue lock.
// Clear DPC routine active.  The race condition mentioned above doesn't
// exist here because we hold the DPC queue lock.
//
// NOTE(review): no race condition is described earlier in this routine; the
// sentence above presumably refers to text that is no longer present.
//

Krdl_Unlock:

#if !defined(NT_UP)
        add       t2 = PbDpcLock, a0
        ;;
        RELEASE_SPINLOCK(t2)
#endif // !defined(NT_UP)

Krdl_Exit:

//
// Clear DpcRoutineActive and DpcInterruptRequested, then read the queue
// depth one more time. If it is non-zero, resume at Krdl_Restart2, which
// expects t0-t4 to be set up as they are here.
//

        add       t0 = PbDpcQueueDepth, a0
        add       t1 = PbDpcRoutineActive, a0
        add       out0 = PbDpcInterruptRequested, a0
        ;;

        st4.nta   [t1] = zero
        st4.rel.nta [out0] = zero
        add       t2 = PbDpcLock, a0

        ld4       t4 = [t0]
        add       t3 = PbDpcListHead+LsFlink, a0
        ;;

        cmp4.eq   pt1, pt2 = zero, t4
 (pt2)  br.spnt   Krdl_Restart2                 // more work was queued
        ;;

        NESTED_RETURN
        NESTED_EXIT(KiRetireDpcList)

        SBTTL("Dispatch Interrupt")
//++
//--------------------------------------------------------------------
// Routine:
//
//     KiDispatchInterrupt
//
// Routine Description:
//
//    This routine is entered as the result of a software interrupt generated
//    at DISPATCH_LEVEL. Its function is to process the Deferred Procedure Call
//    (DPC) list, and then perform a context switch if a new thread has been
//    selected for execution on the processor.
//
//    This routine is entered at IRQL DISPATCH_LEVEL with the dispatcher
//    database unlocked. When a return to the caller finally occurs, the
//    IRQL remains at DISPATCH_LEVEL, and the dispatcher database is still
//    unlocked.
//
//    N.B. On entry to this routine the volatile states (excluding high
//         floating point register set) have been saved.
//
//    The prologue allocates a switch frame (SwitchFrameLength bytes) and
//    saves fs0-fs19, bs0-bs4, s0-s3, ar.lc, ar.pfs and the NaT bits of
//    s0-s3 into the exception frame part of it (SwExFrame). On the context
//    switch path these are reloaded by KiRestoreExceptionFrame; on the
//    paths that branch straight to Kdi_Exit that call is not made.
//
// On entry:
//
//    sp - points to stack scratch area.
//
// Arguments:
//
//    None
//
// Return Value:
//
//    None.
//--------------------------------------------------------------------
//--
        NESTED_ENTRY(KiDispatchInterrupt)
        PROLOGUE_BEGIN

        .regstk   0, 4, 2, 0
        alloc     t16 = ar.pfs, 0, 4, 2, 0      // t16 = caller's ar.pfs
        .save     rp, loc0
        mov       loc0 = brp                    // loc0 = return address
        .fframe   SwitchFrameLength
        add       sp = -SwitchFrameLength, sp
        ;;

//
// t0 and t1 walk the exception frame in an interleaved fashion. Each store
// post-increments its pointer to the slot that the same pointer writes next.
//

        .save     ar.unat, loc1
        mov       loc1 = ar.unat                // loc1 = caller's ar.unat
        add       t0 = ExFltS19+SwExFrame+STACK_SCRATCH_AREA, sp
        add       t1 = ExFltS18+SwExFrame+STACK_SCRATCH_AREA, sp
        ;;

        .save.gf  0x0, 0xC0000
        stf.spill [t0] = fs19, ExFltS17-ExFltS19
        stf.spill [t1] = fs18, ExFltS16-ExFltS18
        ;;

        .save.gf  0x0, 0x30000
        stf.spill [t0] = fs17, ExFltS15-ExFltS17
        stf.spill [t1] = fs16, ExFltS14-ExFltS16
        mov       t10 = bs4                     // branch registers are read early
        ;;

        .save.gf  0x0, 0xC000
        stf.spill [t0] = fs15, ExFltS13-ExFltS15
        stf.spill [t1] = fs14, ExFltS12-ExFltS14
        mov       t11 = bs3
        ;;

        .save.gf  0x0, 0x3000
        stf.spill [t0] = fs13, ExFltS11-ExFltS13
        stf.spill [t1] = fs12, ExFltS10-ExFltS12
        mov       t12 = bs2
        ;;

        .save.gf  0x0, 0xC00
        stf.spill [t0] = fs11, ExFltS9-ExFltS11
        stf.spill [t1] = fs10, ExFltS8-ExFltS10
        mov       t13 = bs1
        ;;

        .save.gf  0x0, 0x300
        stf.spill [t0] = fs9, ExFltS7-ExFltS9
        stf.spill [t1] = fs8, ExFltS6-ExFltS8
        mov       t14 = bs0
        ;;

        .save.gf  0x0, 0xC0
        stf.spill [t0] = fs7, ExFltS5-ExFltS7
        stf.spill [t1] = fs6, ExFltS4-ExFltS6
        mov       t15 = ar.lc                   // t15 = loop count register
        ;;

        .save.gf  0x0, 0x30
        stf.spill [t0] = fs5, ExFltS3-ExFltS5
        stf.spill [t1] = fs4, ExFltS2-ExFltS4
        ;;

        .save.f   0xC
        stf.spill [t0] = fs3, ExFltS1-ExFltS3         // save fs3
        stf.spill [t1] = fs2, ExFltS0-ExFltS2         // save fs2
        ;;

        .save.f   0x3
        stf.spill [t0] = fs1, ExBrS4-ExFltS1          // save fs1
        stf.spill [t1] = fs0, ExBrS3-ExFltS0          // save fs0
        ;;

        .save.b   0x18
        st8       [t0] = t10, ExBrS2-ExBrS4           // save bs4
        st8       [t1] = t11, ExBrS1-ExBrS3           // save bs3
        ;;

        .save.b   0x6
        st8       [t0] = t12, ExBrS0-ExBrS2           // save bs2
        st8       [t1] = t13, ExIntS2-ExBrS1          // save bs1
        ;;

        .save.b   0x1
        st8       [t0] = t14, ExIntS3-ExBrS0          // save bs0
        ;;

        .save.gf  0xC, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s3, ExIntS1-ExIntS3          // save s3
        .mem.offset 8,0
        st8.spill [t1] = s2, ExIntS0-ExIntS2          // save s2
        ;;

        .save.gf  0x3, 0x0
        .mem.offset 0,0
        st8.spill [t0] = s1, ExApLC-ExIntS1           // save s1
        .mem.offset 8,0
        st8.spill [t1] = s0, ExApEC-ExIntS0           // save s0
        ;;

//
// N.B. The caller's ar.pfs (t16) is stored in the ExApEC slot; Kdi_Exit
//      reloads ar.pfs from the same slot.
//

        .savepsp  ar.pfs, ExceptionFrameLength-ExApEC-STACK_SCRATCH_AREA
        st8       [t1] = t16, ExIntNats-ExApEC
        mov       t4 = ar.unat                        // captured Nats of s0-s3
        ;;

        .savepsp  ar.lc, ExceptionFrameLength-ExApLC-STACK_SCRATCH_AREA
        st8       [t0] = t15                          // save ar.lc
        .savepsp  @priunat, ExceptionFrameLength-ExIntNats-STACK_SCRATCH_AREA
        st8       [t1] = t4                           // save Nats of s0-s3

        PROLOGUE_END

//
// Register aliases
//
// N.B. rT2 and rpDPLock are defined here but are not referenced in this
//      routine; t4 was used as a plain temporary in the prologue above.
//

        rPrcb     = loc2
        rKerGP    = loc3

        rpT1      = t0
        rpT2      = t1
        rT1       = t2
        rT2       = t3
        rpDPLock  = t4                          // pointer to dispatcher lock

        pNoTh     = pt1                         // No next thread to run
        pNext     = pt2                         // next thread not null
        pNull     = pt3                         // no thread available
        pOwned    = pt4                         // dispatcher lock already owned
        pNotOwned = pt5
        pQEnd     = pt6                         // quantum end request pending
        pNoQEnd   = pt7                         // no quantum end request pending

//
// Increment the dispatch interrupt count
//
// gp is saved in rKerGP first; it is restored after KiRetireDpcList
// returns (that routine loads gp with the gp of each DPC routine).
//

        mov       rKerGP = gp                   // save gp
        movl      rPrcb = KiPcr + PcPrcb
        ;;

        LDPTR     (rPrcb, rPrcb)                 // rPrcb -> Prcb
        ;;
        add       rpT1 = PbDispatchInterruptCount, rPrcb
        ;;
        ld4       rT1 = [rpT1]
        ;;
        add       rT1 = rT1, zero, 1            // rT1 = count + 1
        ;;
        st4       [rpT1] = rT1

// **** TBD **** use alpha optimization to first check Dpc Q depth


//
// Process the DPC list
//

Kdi_PollDpcList:

//
// Process the deferred procedure call list.
//
// Interrupts are enabled and then disabled again before the call;
// KiRetireDpcList requires interrupts to be disabled on entry. The enabled
// window presumably lets pending interrupts be taken before each poll.
//

        FAST_ENABLE_INTERRUPTS
        ;;
        srlz.d

//
// **** TBD ***** No stack switch as in alpha, mips...
// Save current initial stack address and set new initial stack address.
//

        FAST_DISABLE_INTERRUPTS
        mov      out0 = rPrcb
        br.call.sptk brp = KiRetireDpcList
        ;;


//
// Check to determine if quantum end has occurred.
//
// N.B. If a new thread is selected as a result of processing a quantum
//      end request, then the new thread is returned with the dispatcher
//      database locked. Otherwise, NULL is returned with the dispatcher
//      database unlocked.
//

        FAST_ENABLE_INTERRUPTS
        add       rpT1 = PbQuantumEnd, rPrcb
        ;;

        ld4       rT1 = [rpT1]                  // get quantum end indicator
        ;;
        cmp4.ne   pQEnd, pNoQEnd = rT1, zero    // if zero, no quantum end reqs
        mov       gp = rKerGP                   // restore gp
        ;;

(pQEnd) st4       [rpT1] = zero                 // clear quantum end indicator
(pNoQEnd) br.cond.sptk Kdi_NoQuantumEnd
(pQEnd) br.call.spnt brp = KiQuantumEnd         // call KiQuantumEnd (C code)
        ;;

//
// v0 = thread returned by KiQuantumEnd, or zero.
//
// NOTE(review): cmp4.eq tests only the low 32 bits of v0, whereas the
// equivalent test at Kdi_NoQuantumEnd uses a 64-bit cmp.eq -- confirm that
// the 32-bit compare is intended for a returned thread pointer.
//

        cmp4.eq   pNoTh, pNext = v0, zero       // pNoTh = no next thread
(pNoTh) br.dpnt   Kdi_Exit                      // br to exit if no next thread
(pNext) br.dpnt   Kdi_Swap                      // br to swap to next thread

//
// If no quantum end requests:
// Check to determine if a new thread has been selected for execution on
// this processor.
//

Kdi_NoQuantumEnd:
        add       rpT2 = PbNextThread, rPrcb
        ;;
        LDPTR     (rT1, rpT2)                   // rT1 = address of next thread object
        ;;

        cmp.eq    pNull = rT1, zero             // pNull => no thread selected
(pNull) br.dpnt   Kdi_Exit                      // exit if no thread selected

#if !defined(NT_UP)

//
// try to acquire the dispatcher database lock.
//
// If the call does not return TRUE, go back and poll the DPC list again
// rather than waiting for the lock here.
//
// out1 is the address KiPcr+PcSystemReserved+8; presumably the callee stores
// the previous IRQL there -- confirm against the callee.
//

        mov       out0 = LockQueueDispatcherLock
        movl      out1 = KiPcr+PcSystemReserved+8
        br.call.sptk brp = KeTryToAcquireQueuedSpinLockRaiseToSynch
        ;;

        cmp.ne    pOwned, pNotOwned = TRUE, v0  // pOwned = 1 if not free
(pOwned) br.dpnt   Kdi_PollDpcList              // br out if owned
        ;;

#else

        mov       rT1 = SYNCH_LEVEL
        ;;
        SET_IRQL  (rT1)

#endif // !defined(NT_UP)

//
// Reread address of next thread object since it is possible for it to
// change in a multiprocessor system.
//
// From here on s0-s2 are overwritten: s1 = current (old) thread,
// s2 = next (new) thread, s0 = Prcb. Their original values were saved in
// the exception frame by the prologue.
//

Kdi_Swap:

        add       rpT2 = PbNextThread, rPrcb    // -> next thread
        movl      rpT1 = KiPcr + PcCurrentThread
        ;;

        LDPTR     (s1, rpT1)                    // current thread object
        LDPTR     (s2, rpT2)                    // next thread object
        add       rpT1 = PbCurrentThread, rPrcb
        ;;


//
// Reready current thread for execution and swap context to the selected
// thread.
//
// Note:  Set IdleSwapBlock in the current thread so no idle processor
// can switch to this processor before it is removed from the current
// processor.
//

        STPTR     (rpT2, zero)                  // clear addr of next thread
        add       out0 = ThIdleSwapBlock, s1    // block swap from idle
        mov       rT1 = 1
        ;;

        STPTR     (rpT1, s2)                    // set addr of current thread
        st1       [out0] = rT1, -ThIdleSwapBlock// IdleSwapBlock = 1; out0 -> old thread
        br.call.sptk brp = KiReadyThread        // call KiReadyThread(OldTh)
        ;;

        mov       s0 = rPrcb                    // setup call
        cmp.ne    pt0 = zero, zero              // no need to lock context swap
        br.call.sptk brp = SwapContext          // call SwapContext(Prcb, OldTh, NewTh)
        ;;

//
// Restore saved registers, and return.
//
// out0 is the address of the exception frame inside the switch frame.
//

        add       out0 = STACK_SCRATCH_AREA+SwExFrame, sp
        br.call.sptk brp = KiRestoreExceptionFrame
        ;;

Kdi_Exit:

//
// Common exit: reload ar.pfs from the ExApEC slot written by the prologue,
// restore brp and ar.unat from loc0/loc1, and pop the switch frame.
//

        add       rpT1 = ExApEC+SwExFrame+STACK_SCRATCH_AREA, sp
        ;;
        ld8       rT1 = [rpT1]                  // rT1 = saved ar.pfs
        mov       brp = loc0
        ;;

        mov       ar.unat = loc1
        mov       ar.pfs = rT1
        .restore
        add       sp = SwitchFrameLength, sp
        br.ret.sptk brp

        NESTED_EXIT(KiDispatchInterrupt)