Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

170 lines
4.8 KiB

  1. // memset_ita.s: function to set a number of bytes to a char value
  2. // Copyright (C) 1998 Intel Corporation.
  3. //
  4. // The information and source code contained herein is the exclusive property
  5. // of Intel Corporation and may not be disclosed, examined, or
  6. // reproduced in whole or in part without explicit written authorization from
  7. // the Company.
  8. // Author: Steve Skedzielewski
  9. // Date: June, 2000
  10. //
  11. .section .text
  12. // -- Begin _memset_ita: store the byte in r33 (c) to r34 (n) bytes starting at r32 (s); r8 returns the original s
  13. .proc _memset_ita#
  14. .align 32
  15. // Entry: replicate c into all 8 bytes of r33 using the mmx broadcast (mux1 @brcst)
  16. // live out: r21 (s & 7, alignment), r11 (saved ar.lc), r33 (replicated c),
  17. // r32 (s), r34 (n), r8 (return value)
  18. .global _memset_ita#
  19. .prologue
  20. _memset_ita:
  21. and r21=7,r32 // r21 = s & 7, offset of s within its 8-byte word
  22. .save ar.lc,r11,t01
  23. [t01:] mov r11=ar.lc //0: 2 MS  (save ar.lc in r11; put back at Loopexit)
  24. brp.dptk.imp Longloop, Longloop_br // prediction hint for the loop-back branch of the store loop
  25. mov r8=r32 //0: return value = original s
  26. mux1 r33=r33,@brcst // copy the low byte of r33 into all 8 byte lanes
  27. ;;
  28. // If we're not on an 8-byte boundary, move to one
  29. // live out: r11(ar.lc), r33(replicated c), r32(s), r34(n)
  30. // p14 (n>=MINIMUM_LONG), computed here from n BEFORE any alignment bytes are stored
  31. .body
  32. MINIMUM_LONG=0x4f
  33. Check_align:
  34. cmp.le p14,p0=MINIMUM_LONG,r34 //0: MINIMUM_LONG <= n?
  35. cmp.ne p15,p0=0,r21 //0: Low 3 bits nonzero? (s not 8-byte aligned)
  36. (p15) br.cond.dpnt Align //0: unaligned: store leading bytes one at a time first
  37. ;;
  38. // Now that s is aligned,
  39. // use straight-line code when p14 is clear (original n < MINIMUM_LONG), a loop otherwise
  40. // Exit if n<=0 (NOTE(review): cmp.ge/cmp.le are signed compares, so n with bit 63 set exits here too)
  41. // live out: r11(ar.lc), r33(replicated c), r32(s), r34(n)
  42. // r17(s+8), p13(n>=16), p12(n>=32), p14 (n>=MINIMUM_LONG, from Check_align)
  43. Is_aligned:
  44. cmp.ge p15,p0=0,r34 //0: n <= 0?
  45. cmp.le p13,p0=0x10,r34 //0: 16 <= n?
  46. cmp.le p12,p0=0x20,r34 //0: 32 <= n?
  47. add r17=8,r32 //0: second pointer
  48. (p15) br.cond.dpnt Exit //0: 21 MS
  49. (p14) br.cond.dpnt Long //0: 21 MS
  50. ;;
  51. // Short memsets are done with predicated straightline code
  52. // live out: r8 (return value, original value of r32)
  53. ;; // stall 1 cycle for MMX to complete
  54. (p13) st8 [r32]=r33,16 //0: n>=16: bytes 0-7
  55. (p13) st8 [r17]=r33,16 //0: n>=16: bytes 8-15 (r17 = r32+8)
  56. cmp.le p11,p0=0x30,r34 //0: 48 <= n?
  57. ;;
  58. (p12) st8 [r32]=r33,16 //1: n>=32: next 16 bytes
  59. (p12) st8 [r17]=r33,16 //1:
  60. cmp.le p10,p0=0x40,r34 //1: 64 <= n?
  61. ;;
  62. (p11) st8 [r32]=r33,16 //2: n>=48: next 16 bytes
  63. (p11) st8 [r17]=r33,16 //2:
  64. tbit.nz p9,p0=r34,3 //2: odd number of st8s? (bit 3 of n)
  65. ;;
  66. (p10) st8 [r32]=r33,16 //3: n>=64: next 16 bytes
  67. (p10) st8 [r17]=r33 //3: no post-increment: r17 is not used again on this path
  68. tbit.nz p8,p0=r34,2 //3: bit 2 on?
  69. ;;
  70. (p9) st8 [r32]=r33,8 //4: n&8: one more 8-byte store
  71. tbit.nz p7,p0=r34,1 //4: bit 1 on?
  72. and r18=1,r34 //4: bit 0 on?
  73. ;;
  74. //
  75. // Clean up any partial word stores.
  76. //
  77. (p8) st4 [r32]=r33,4 //5: n&4
  78. ;;
  79. (p7) st2 [r32]=r33,2 //6: n&2
  80. cmp.ne p6,p0=0,r18 //6: p6 = bit 0 of n set
  81. ;;
  82. (p6) st1 [r32]=r33,1 //7: n&1
  83. br.ret.sptk.many b0 //7: ar.lc was never written on this path, so no restore is needed
  84. ;;
  85. // Cycles = 8 , Instr = 21
  86. //
  87. // Block 11: Bchanged Pred: 8 Succ: 15
  88. // Counted loop setup. We know n>0 (exit above otherwise),
  89. // so we can just shift n right 4 bits (2 st8/iteration). n >= MINIMUM_LONG-7 = 72 here, so ar.lc = n/16-2 >= 2
  90. // live out: r8(return value), r11(ar.lc), r17(s+8), r32(s)
  91. // r33(replicated c), r34(n), p15(n&4), p6(n&8)
  92. Long:
  93. add r17=8,r32 //0: second pointer
  94. shr.u r30=r34,4 //0: 29 MS  (r30 = n / 16 = number of 16-byte chunks)
  95. and r18=0x8,r34 //0: isolate bit 3 of n
  96. and r19=0x4,r34 //0: isolate bit 2 of n
  97. ;;
  98. cmp.ne p6,p0=0,r18 //1: p6 = (n & 8) != 0
  99. add r30=-2,r30 //1: -1 for the st8 pair issued before the loop, -1 because br.cloop runs ar.lc+1 iterations
  100. cmp.ne p15,p0=0,r19 //1: p15 = (n & 4) != 0
  101. ;;
  102. st8 [r32]=r33,16 //2: Use the otherwise empty
  103. st8 [r17]=r33,16 //2: m slots
  104. mov ar.lc=r30 //2: loop count for br.cloop
  105. ;;
  106. // Cycles = 2, Instr = 9
  107. // Block 15: lentry lexit Bchanged Pred: 15 11 Succ: 15 13
  108. // Counted loop storing 16 bytes/iteration, TAR hinted.
  109. // live out: r11(ar.lc), r17(s+8), r32(s)
  110. // r33(replicated c), r34(n), p15(n&4), p6(n&8)
  111. Longloop:
  112. Longloop_br:
  113. st8 [r32]=r33,16 //0: 30 MS
  114. st8 [r17]=r33,16 //0: 31 MS
  115. br.cloop.sptk Longloop //0: 0 MS
  116. ;;
  117. // Cycles = 1, Instr = 3
  118. // Block 13: Pred: 11 15 Succ: 12 24
  119. // Exit, or cleanup and exit
  120. // live out: r17(s+8), r32(s), r33(replicated c), r34(n)
  121. // p14(n&2), p15(n&4)
  122. Loopdone:
  123. (p6) st8 [r32]=r33,8 //0: n&8: one more 8-byte store
  124. tbit.nz p14,p0=r34,1 //0: p14 reused: now bit 1 of n
  125. ;;
  126. // Block 24: Bchanged Pred: 13 Succ:
  127. // Cleanup partial words after loop
  128. (p15) st4 [r32]=r33,4 //0: n&4
  129. ;;
  130. (p14) st2 [r32]=r33,2 //1: n&2
  131. tbit.nz p13,p0=r34,0 //1: p13 reused: now bit 0 of n
  132. ;;
  133. Loopexit:
  134. (p13) st1 [r32]=r33 //2: n&1
  135. mov ar.lc=r11 //2: restore the ar.lc value saved at entry
  136. br.ret.sptk.many b0 //2:
  137. ;;
  138. // Cycles = 6, Instr = 12
  139. ///
  140. /// Align the input pointer to an 8-byte boundary by storing single bytes
  141. // Block 5: lentry lexit Bchanged Pred: 3 6 Succ: 8 6
  142. // Freq 0, prob 0
  143. .b1_5:
  144. Align:
  145. cmp.ge p9,p0=0,r34 //0: 18 MS  (n <= 0?)
  146. sub r22=8,r21 //0: 16 B6 MS S  (r22 = 8 - (s&7) = bytes up to the next 8-byte boundary, 1..7)
  147. (p9) br.cond.dpnt Exit //0: 18 MS
  148. ;;
  149. // Cycles = 1, Instr = 3
  150. // Block 6: lexit Bchanged Pred: 5 Succ: 5 8
  151. Align_loop:
  152. st1 [r32]=r33,1 //0: 19 MS
  153. cmp.ge p10,p0=1,r22 //0: r22 <= 1: this store was the last alignment byte
  154. add r34=-1,r34 //0: n--
  155. (p10) br.cond.dpnt Is_aligned //0: s is now aligned; Is_aligned rechecks n<=0
  156. ;;
  157. add r22=-1,r22
  158. cmp.lt p9,p0=0,r34 //0: 16 MS  (0 < n: bytes remain)
  159. (p9) br.cond.dptk Align_loop //0: 16 MS
  160. ;;
  161. // Cycles = 2, Instr = 6
  162. Exit:
  163. br.ret.sptk.many b0 //0: nothing (more) to store; ar.lc was never written on paths reaching here
  164. ;;
  165. // Cycles = 1, Instr = 3
  166. // -- End _memset_ita
  167. .endp _memset_ita#
  168. // End