Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

144 lines
4.2 KiB

  1. #include "kxia64.h"
  2. #include "regia64.h"
  3. //++
  4. // Routine: zero NumberOfBytes bytes starting at PageBase, 128 bytes per loop pass.
  5. // VOID
  6. // KiZeroPages (
  7. // IN PVOID PageBase,
  8. // IN SIZE_T NumberOfBytes
  9. // )
  10. // Arguments: r32 = PageBase, r33 = NumberOfBytes. No return value is produced.
  11. //--
  12. // Scratch: r14/r15 lfetch pointers, r16/r17 store pointers, r18 loop count, r19 end address, r31 saved ar.lc, p8.
  13. // Assumes (per the original code) that NumberOfBytes >= 2048 and is a multiple
  14. // of 128: the preamble unconditionally prefetches 16 lines of 128 bytes (2048 bytes).
  15. // The store loop runs NumberOfBytes / 128 times (ar.lc = (NumberOfBytes >> 7) - 1).
  16. // This code is optimized for McKinley CPU.
  17. //
  18. LEAF_ENTRY(KiZeroPages)
  19. .prologue
  20. .regstk 2,0,0,0 // 2 input registers (r32, r33); no locals, outputs or rotating registers
  21. //
  22. // Note: Do not delete the nop bundle below. The extra bundle appeared to improve
  23. // performance by 150 cycles. The reason for this is unexplained at
  24. // this time... we're in the process of investigating it.
  25. //
  26. { .mmi
  27. nop.m 0
  28. nop.m 0
  29. nop.i 0
  30. }
  31. //
  32. // Issue 16 lfetch.fault.excl.nt1 so that the L2 cache lines are ready to receive the store data.
  33. // The .fault is to ensure that the data enters into the cache hierarchy.
  34. // The .nt1 is to ensure that the data will not displace data residing in the L1D.
  35. // The .excl is to ensure that the data is ready to be modified.
  36. // 16 lfetches seemed to be an optimal value for McKinley.
  37. // r14 walks lines 0-7 and r15 walks lines 8-15, one pair per bundle (8 bundles below).
  38. .save ar.lc, r31 // unwind annotation: ar.lc is preserved in r31
  39. { .mmi
  40. add r14 = r0, r32 // r14 = PageBase: lfetch pointer starting at cache line 0
  41. add r15 = 0x400, r32 // r15 = PageBase + 0x400: lfetch pointer starting at cache line 8
  42. mov.i r31 = ar.lc // save ar.lc; restored just before returning
  43. ;;
  44. }
  45. { .mmi
  46. lfetch.fault.excl.nt1 [r14], 0x80 // Note: the lfetch post-increment must be in
  47. lfetch.fault.excl.nt1 [r15], 0x80 // the range -256 to 255; 0x80 steps one 128-byte line.
  48. add r16 = r0, r32 // r16 = 1st store pointer (PageBase + 0x00)
  49. ;;
  50. }
  51. { .mmi
  52. lfetch.fault.excl.nt1 [r14], 0x80
  53. lfetch.fault.excl.nt1 [r15], 0x80
  54. add r17 = 0x10, r32 // r17 = 2nd store pointer (PageBase + 0x10)
  55. ;;
  56. }
  57. { .mmi
  58. lfetch.fault.excl.nt1 [r14], 0x80
  59. lfetch.fault.excl.nt1 [r15], 0x80
  60. shr.u r18 = r33, 7 // r18 = NumberOfBytes / 128 = number of 128-byte blocks
  61. ;;
  62. }
  63. { .mmi
  64. lfetch.fault.excl.nt1 [r14], 0x80
  65. lfetch.fault.excl.nt1 [r15], 0x80
  66. adds r18 = -1, r18 // loop count for ar.lc: block count - 1
  67. ;;
  68. }
  69. { .mmi
  70. lfetch.fault.excl.nt1 [r14], 0x80
  71. lfetch.fault.excl.nt1 [r15], 0x80
  72. mov.i ar.lc = r18 // load the loop counter consumed by br.cloop below
  73. ;;
  74. }
  75. { .mmi
  76. lfetch.fault.excl.nt1 [r14], 0x80
  77. lfetch.fault.excl.nt1 [r15], 0x80
  78. add r19 = r32, r33 // r19 = PageBase + NumberOfBytes: lfetch stop address
  79. ;;
  80. }
  81. { .mmi
  82. lfetch.fault.excl.nt1 [r14], 0x80
  83. lfetch.fault.excl.nt1 [r15], 0x80
  84. nop.i 0
  85. ;;
  86. }
  87. { .mmi
  88. lfetch.fault.excl.nt1 [r14], 0x80 // last use of r14
  89. lfetch.fault.excl.nt1 [r15], 0x80 // r15 (now PageBase + 0x800) continues to be used for lfetch below
  90. nop.i 0
  91. ;;
  92. }
  93. Mizp10: // top of the store loop: one pass zeroes one 128-byte block
  94. { .mmi
  95. stf.spill.nta [r16] = f0, 0x20 // store 16 bytes of f0 at 1st pointer, then advance it by 0x20
  96. stf.spill.nta [r17] = f0, 0x20 // store 16 bytes of f0 at 2nd pointer, then advance it by 0x20
  97. cmp.lt p8, p0 = r15, r19 // p8 = (r15 < r32+r33); if r15 >= r32+r33, don't lfetch. NOTE(review): signed compare on addresses - confirm cmp.ltu is not needed
  98. ;;
  99. }
  100. { .mmi
  101. stf.spill.nta [r16] = f0, 0x20
  102. stf.spill.nta [r17] = f0, 0x20
  103. nop.i 0
  104. ;;
  105. }
  106. { .mmi
  107. stf.spill.nta [r16] = f0, 0x20
  108. stf.spill.nta [r17] = f0, 0x20
  109. nop.i 0
  110. ;;
  111. }
  112. { .mmi
  113. stf.spill.nta [r16] = f0, 0x20
  114. stf.spill.nta [r17] = f0, 0x20 // 8 stores x 16 bytes = 128 bytes written this pass
  115. nop.i 0
  116. }
  117. //
  118. // Note: On McKinley, the added lfetch instruction below does not cost an extra cycle,
  119. // because the bundle above and this bundle can be issued in one cycle (there is no stop bit
  120. // in between). Without the lfetch, the br instr could be combined with the above bundle,
  121. // but only one bundle can be issued in this case.
  122. //
  123. { .mib
  124. (p8) lfetch.fault.excl.nt1 [r15], 0x80 // prefetch the next line only while r15 is below the stop address
  125. nop.i 0
  126. br.cloop.sptk Mizp10 // counted loop: repeat until ar.lc is exhausted
  127. ;;
  128. }
  129. { .mib
  130. nop.m 0
  131. mov.i ar.lc = r31 // restore ar.lc for the caller
  132. br.ret.sptk b0 // return to the caller
  133. }
  134. LEAF_EXIT(KiZeroPages)