You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
144 lines
4.2 KiB
144 lines
4.2 KiB
#include "kxia64.h"
|
|
#include "regia64.h"
|
|
|
|
//++
|
|
//
|
|
// VOID
|
|
// KiZeroPages (
|
|
// IN PVOID PageBase,
|
|
// IN SIZE_T NumberOfBytes
|
|
// )
|
|
//
|
|
//--
|
|
|
|
//
|
|
// Based on the original code assumption, NumberOfBytes >= 2048 and
|
|
// is a multiple of 128.
|
|
//
|
|
// This code is optimized for McKinley CPU.
|
|
//
|
|
|
|
LEAF_ENTRY(KiZeroPages)
|
|
.prologue
|
|
.regstk 2,0,0,0
|
|
|
|
//
|
|
// Note: Do not delete the nop bundle below. It seemed to improve the performance
|
|
// by 150 cycles with this extra bundle. But the reason for it is unexplanable at
|
|
// this time... we're in the process of investigating it.
|
|
//
|
|
|
|
{ .mmi
|
|
nop.m 0
|
|
nop.m 0
|
|
nop.i 0
|
|
}
|
|
|
|
//
|
|
// Do 16 lfetch.fault.excl.nt1 to ensure that the L2 cache line is ready to receive the store data.
|
|
// The .fault is to ensure that the data enters into the cache hierarchy.
|
|
// The .nt1 is to ensure that the data will not displace data residing in the L1D.
|
|
// The .excl is to ensure that the data is ready to be modified.
|
|
// 16 lfetches seemed to be an optimal value for McKinley.
|
|
//
|
|
|
|
.save ar.lc, r31
|
|
|
|
{ .mmi
|
|
add r14 = r0, r32 // pointer to 0th cache line
|
|
add r15 = 0x400, r32 // pointer to 8th cache line
|
|
mov.i r31 = ar.lc // save ar.lc; to be restored at the end
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80 // Note: lfetch increment must be in
|
|
lfetch.fault.excl.nt1 [r15], 0x80 // this range (-256 to 255).
|
|
add r16 = r0, r32 // r16 == 1st store pointer
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
add r17 = 0x10, r32 // r17 == 2nd store pointer
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
shr.u r18 = r33, 7 // number of 128-byte blocks
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
adds r18 = -1, r18 // Loop Count
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
mov.i ar.lc = r18
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
add r19 = r32, r33 // r19 == lfetch stop address
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80
|
|
nop.i 0
|
|
;;
|
|
}
|
|
{ .mmi
|
|
lfetch.fault.excl.nt1 [r14], 0x80
|
|
lfetch.fault.excl.nt1 [r15], 0x80 // r15 will continue to be used for lfetch below
|
|
nop.i 0
|
|
;;
|
|
}
|
|
|
|
Mizp10:
|
|
{ .mmi
|
|
stf.spill.nta [r16] = f0, 0x20 // store 16 bytes at 1st pointer
|
|
stf.spill.nta [r17] = f0, 0x20 // store 16 bytes at 2nd pointer
|
|
cmp.lt p8, p0 = r15, r19 // if r15 >= r32+r33, don't lfetch
|
|
;;
|
|
}
|
|
{ .mmi
|
|
stf.spill.nta [r16] = f0, 0x20
|
|
stf.spill.nta [r17] = f0, 0x20
|
|
nop.i 0
|
|
;;
|
|
}
|
|
{ .mmi
|
|
stf.spill.nta [r16] = f0, 0x20
|
|
stf.spill.nta [r17] = f0, 0x20
|
|
nop.i 0
|
|
;;
|
|
}
|
|
{ .mmi
|
|
stf.spill.nta [r16] = f0, 0x20
|
|
stf.spill.nta [r17] = f0, 0x20
|
|
nop.i 0
|
|
}
|
|
|
|
//
|
|
// Note: On McKinley, this added lfetch instruction below does not add any extra cycle.
|
|
// Since the bundle above and this bundle can be issued in one cycle (since no stop bits
|
|
// in between). Without the lfetch, the br instr could be combined with the above bundle,
|
|
// but only one bundle can be issued in this case.
|
|
//
|
|
{ .mib
|
|
(p8) lfetch.fault.excl.nt1 [r15], 0x80
|
|
nop.i 0
|
|
br.cloop.sptk Mizp10
|
|
;;
|
|
}
|
|
{ .mib
|
|
nop.m 0
|
|
mov.i ar.lc = r31 // restore ar.lc for the caller
|
|
br.ret.sptk b0
|
|
}
|
|
LEAF_EXIT(KiZeroPages)
|