Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

144 lines
4.2 KiB

#include "kxia64.h"
#include "regia64.h"
//++
//
// VOID
// KiZeroPages (
// IN PVOID PageBase,
// IN SIZE_T NumberOfBytes
// )
//
// Routine Description:
//
// Zero-fills NumberOfBytes bytes starting at PageBase. The fill is done
// with 16-byte stf.spill.nta stores of f0 (the constant-zero floating
// point register), 128 bytes per loop iteration, while lfetch.fault.excl.nt1
// prefetches run 2048 bytes ahead of the stores.
//
// Arguments:
//
// PageBase (r32) - address of the first byte to zero.
// NOTE(review): the stores are 16-byte spills, so PageBase is presumably
// at least 16-byte (in practice page) aligned - confirm against callers.
//
// NumberOfBytes (r33) - number of bytes to zero. Not validated here.
//
// Return Value:
//
// None.
//
// Constraints (from the code below):
//
// - NumberOfBytes >= 2048: the 16 unconditional prefetches before the loop
// touch the first 2048 bytes of the buffer regardless of NumberOfBytes.
// - NumberOfBytes is a multiple of 128: the loop count is
// (NumberOfBytes >> 7) - 1, so any remainder below 128 bytes is not
// zeroed, and a value below 128 would make the loop count wrap.
//
// Register usage:
//
// r14, r15 - prefetch pointers (r15 is the only one used inside the loop)
// r16, r17 - store pointers, 16 bytes apart, each advancing by 32
// r18 - loop count loaded into ar.lc
// r19 - PageBase + NumberOfBytes, the prefetch stop address
// r31 - caller's ar.lc, restored before returning
// p8 - true while r15 is still below r19
//
//--
//
// Based on the original code assumption, NumberOfBytes >= 2048 and
// is a multiple of 128.
//
// This code is optimized for McKinley CPU.
//
LEAF_ENTRY(KiZeroPages)
.prologue
.regstk 2,0,0,0
//
// Note: Do not delete the nop bundle below. It seemed to improve the performance
// by 150 cycles with this extra bundle. But the reason for it is unexplainable at
// this time... we're in the process of investigating it.
//
{ .mmi
nop.m 0
nop.m 0
nop.i 0
}
//
// Do 16 lfetch.fault.excl.nt1 to ensure that the L2 cache line is ready to receive the store data.
// The .fault is to ensure that the data enters into the cache hierarchy.
// The .nt1 is to ensure that the data will not displace data residing in the L1D.
// The .excl is to ensure that the data is ready to be modified.
// 16 lfetches seemed to be an optimal value for McKinley.
//
// The 16 prefetches are issued as 8 bundles of two: r14 walks offsets
// 0x000-0x380 and r15 walks offsets 0x400-0x780, both in 0x80 steps. The
// I slot of each bundle is used to set up the store pointers, the loop
// count and the prefetch stop address, so that setup shares bundles with
// the prefetches.
//
.save ar.lc, r31
{ .mmi
add r14 = r0, r32 // pointer to 0th cache line
add r15 = 0x400, r32 // pointer to 8th cache line
mov.i r31 = ar.lc // save ar.lc; to be restored at the end
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80 // Note: lfetch increment must be in
lfetch.fault.excl.nt1 [r15], 0x80 // this range (-256 to 255).
add r16 = r0, r32 // r16 == 1st store pointer
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
add r17 = 0x10, r32 // r17 == 2nd store pointer
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
shr.u r18 = r33, 7 // number of 128-byte blocks
;;
}
//
// br.cloop branches while ar.lc is non-zero and decrements it, so the loop
// body runs ar.lc + 1 times; hence the count is (blocks - 1).
//
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
adds r18 = -1, r18 // Loop Count
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
mov.i ar.lc = r18
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
add r19 = r32, r33 // r19 == lfetch stop address
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80
nop.i 0
;;
}
{ .mmi
lfetch.fault.excl.nt1 [r14], 0x80
lfetch.fault.excl.nt1 [r15], 0x80 // r15 will continue to be used for lfetch below
nop.i 0
;;
}
//
// Main loop: each iteration zeroes one 128-byte block with eight 16-byte
// stores. r16 covers offsets 0x00/0x20/0x40/0x60 of the block and r17 covers
// 0x10/0x30/0x50/0x70. On entry r15 == PageBase + 0x800, so the single
// prefetch per iteration stays 2048 bytes ahead of the stores.
//
// NOTE(review): cmp.lt is a signed compare applied to addresses. That gives
// the expected result as long as r15 and r19 have the same sign bit; an
// unsigned compare (cmp.ltu) would be the conventional choice - confirm
// before changing.
//
Mizp10:
{ .mmi
stf.spill.nta [r16] = f0, 0x20 // store 16 bytes at 1st pointer
stf.spill.nta [r17] = f0, 0x20 // store 16 bytes at 2nd pointer
cmp.lt p8, p0 = r15, r19 // if r15 >= r32+r33, don't lfetch
;;
}
{ .mmi
stf.spill.nta [r16] = f0, 0x20
stf.spill.nta [r17] = f0, 0x20
nop.i 0
;;
}
{ .mmi
stf.spill.nta [r16] = f0, 0x20
stf.spill.nta [r17] = f0, 0x20
nop.i 0
;;
}
{ .mmi
stf.spill.nta [r16] = f0, 0x20
stf.spill.nta [r17] = f0, 0x20
nop.i 0
}
//
// Note: On McKinley, the added lfetch instruction below does not cost an extra cycle,
// because the bundle above and this bundle can be issued in the same cycle (there is
// no stop bit between them). Without the lfetch, the br instr could be combined with
// the above bundle, but then only one bundle would be issued in that cycle.
//
// The lfetch is predicated on p8 so that nothing at or beyond
// PageBase + NumberOfBytes is prefetched.
//
{ .mib
(p8) lfetch.fault.excl.nt1 [r15], 0x80
nop.i 0
br.cloop.sptk Mizp10
;;
}
//
// Loop finished: put the caller's loop count back and return.
//
{ .mib
nop.m 0
mov.i ar.lc = r31 // restore ar.lc for the caller
br.ret.sptk b0
}
LEAF_EXIT(KiZeroPages)