#include "kxia64.h"
#include "regia64.h"
 
//++
//
// VOID
// KiZeroPages (
//    IN PVOID PageBase,
//    IN SIZE_T NumberOfBytes
//    )
//
//--

//
// Based on the original code assumption, NumberOfBytes >= 2048 and
// is a multiple of 128.
//
// This code is optimized for McKinley CPU.
//

     LEAF_ENTRY(KiZeroPages)
     .prologue
     .regstk                 2,0,0,0

//
// Note: Do not delete the nop bundle below. It seemed to improve the performance
// by 150 cycles with this extra bundle. But the reason for it is unexplanable at
// this time... we're in the process of investigating it.
//

{    .mmi
     nop.m                   0
     nop.m                   0
     nop.i                   0
}

//
// Do 16 lfetch.fault.excl.nt1 to ensure that the L2 cache line is ready to receive the store data.
// The .fault is to ensure that the data enters into the cache hierarchy.
// The .nt1 is to ensure that the data will not displace data residing in the L1D.
// The .excl is to ensure that the data is ready to be modified.
// 16 lfetches seemed to be an optimal value for McKinley.
//

     .save                   ar.lc, r31

{    .mmi
     add                     r14 = r0, r32			// pointer to 0th cache line
     add                     r15 = 0x400, r32			// pointer to 8th cache line
     mov.i                   r31 = ar.lc			// save ar.lc; to be restored at the end
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80			// Note: lfetch increment must be in
     lfetch.fault.excl.nt1   [r15], 0x80			// this range (-256 to 255).
     add                     r16 = r0, r32                      // r16 == 1st store pointer
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     add                     r17 = 0x10, r32                    // r17 == 2nd store pointer
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     shr.u                   r18 = r33, 7			// number of 128-byte blocks
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     adds                    r18 = -1, r18			// Loop Count
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     mov.i                   ar.lc = r18
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     add                     r19 = r32, r33                     // r19 == lfetch stop address
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80
     nop.i                   0
     ;;
}
{    .mmi
     lfetch.fault.excl.nt1   [r14], 0x80
     lfetch.fault.excl.nt1   [r15], 0x80			// r15 will continue to be used for lfetch below
     nop.i                   0
     ;;
}

Mizp10:
{    .mmi
     stf.spill.nta           [r16] = f0, 0x20			// store 16 bytes at 1st pointer
     stf.spill.nta           [r17] = f0, 0x20			// store 16 bytes at 2nd pointer
     cmp.lt                  p8, p0 = r15, r19			// if r15 >= r32+r33, don't lfetch
     ;;
}
{    .mmi
     stf.spill.nta           [r16] = f0, 0x20
     stf.spill.nta           [r17] = f0, 0x20
     nop.i                   0
     ;;
}
{    .mmi
     stf.spill.nta           [r16] = f0, 0x20
     stf.spill.nta           [r17] = f0, 0x20
     nop.i                   0
     ;;
}
{    .mmi
     stf.spill.nta           [r16] = f0, 0x20
     stf.spill.nta           [r17] = f0, 0x20
     nop.i                   0
}

//
// Note: On McKinley, this added lfetch instruction below does not add any extra cycle.
// Since the bundle above and this bundle can be issued in one cycle (since no stop bits
// in between). Without the lfetch, the br instr could be combined with the above bundle,
// but only one bundle can be issued in this case.
//
{    .mib
(p8) lfetch.fault.excl.nt1   [r15], 0x80
     nop.i                   0
     br.cloop.sptk           Mizp10
     ;;
}
{    .mib
     nop.m                   0
     mov.i                   ar.lc = r31			// restore ar.lc for the caller
     br.ret.sptk             b0
}
     LEAF_EXIT(KiZeroPages)