#include "kxia64.h" #include "regia64.h" //++ // // VOID // KiZeroPages ( // IN PVOID PageBase, // IN SIZE_T NumberOfBytes // ) // //-- // // Based on the original code assumption, NumberOfBytes >= 2048 and // is a multiple of 128. // // This code is optimized for McKinley CPU. // LEAF_ENTRY(KiZeroPages) .prologue .regstk 2,0,0,0 // // Note: Do not delete the nop bundle below. It seemed to improve the performance // by 150 cycles with this extra bundle. But the reason for it is unexplanable at // this time... we're in the process of investigating it. // { .mmi nop.m 0 nop.m 0 nop.i 0 } // // Do 16 lfetch.fault.excl.nt1 to ensure that the L2 cache line is ready to receive the store data. // The .fault is to ensure that the data enters into the cache hierarchy. // The .nt1 is to ensure that the data will not displace data residing in the L1D. // The .excl is to ensure that the data is ready to be modified. // 16 lfetches seemed to be an optimal value for McKinley. // .save ar.lc, r31 { .mmi add r14 = r0, r32 // pointer to 0th cache line add r15 = 0x400, r32 // pointer to 8th cache line mov.i r31 = ar.lc // save ar.lc; to be restored at the end ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 // Note: lfetch increment must be in lfetch.fault.excl.nt1 [r15], 0x80 // this range (-256 to 255). add r16 = r0, r32 // r16 == 1st store pointer ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 add r17 = 0x10, r32 // r17 == 2nd store pointer ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 shr.u r18 = r33, 7 // number of 128-byte blocks ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 adds r18 = -1, r18 // Loop Count ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 mov.i ar.lc = r18 ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 add r19 = r32, r33 // r19 == lfetch stop address ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 nop.i 0 ;; } { .mmi lfetch.fault.excl.nt1 [r14], 0x80 lfetch.fault.excl.nt1 [r15], 0x80 // r15 will continue to be used for lfetch below nop.i 0 ;; } Mizp10: { .mmi stf.spill.nta [r16] = f0, 0x20 // store 16 bytes at 1st pointer stf.spill.nta [r17] = f0, 0x20 // store 16 bytes at 2nd pointer cmp.lt p8, p0 = r15, r19 // if r15 >= r32+r33, don't lfetch ;; } { .mmi stf.spill.nta [r16] = f0, 0x20 stf.spill.nta [r17] = f0, 0x20 nop.i 0 ;; } { .mmi stf.spill.nta [r16] = f0, 0x20 stf.spill.nta [r17] = f0, 0x20 nop.i 0 ;; } { .mmi stf.spill.nta [r16] = f0, 0x20 stf.spill.nta [r17] = f0, 0x20 nop.i 0 } // // Note: On McKinley, this added lfetch instruction below does not add any extra cycle. // Since the bundle above and this bundle can be issued in one cycle (since no stop bits // in between). Without the lfetch, the br instr could be combined with the above bundle, // but only one bundle can be issued in this case. // { .mib (p8) lfetch.fault.excl.nt1 [r15], 0x80 nop.i 0 br.cloop.sptk Mizp10 ;; } { .mib nop.m 0 mov.i ar.lc = r31 // restore ar.lc for the caller br.ret.sptk b0 } LEAF_EXIT(KiZeroPages)