Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

344 lines
7.5 KiB

  1. #include "ksia64.h"
  // memcpy - forward copy of a2 bytes from the address in a1 to the
  // address in a0. The incoming a0 is copied to v0 before any pointer is
  // advanced (presumably the return-value register - see ksia64.h).
  //
  // Dispatch performed by the entry code below:
  //   a2 == 0                                   -> return at once
  //   a2 >= 8, a0 and a1 both 8-byte aligned    -> QwordMoveUp
  //   a2 >= 8, a0 and a1 share low 3 addr bits  -> AlignedMove
  //   a2 > 16, low 3 address bits differ        -> UnalignedMove
  //   anything else                             -> ByteMoveUpLoop
  //
  // pr and ar.lc are saved in r64 and r65. They are restored only at
  // UnalignedByteDone, the one path that writes pr.rot and ar.lc.
  // NOTE(review): .regstk (3,7,0,8) and alloc (3,31,0,32) describe
  // different register frames - confirm which one is intended.
  2. LEAF_ENTRY(memcpy)
  3. .prologue
  4. .regstk 3,7,0,8
  5. alloc t17 = ar.pfs,3,31,0,32 // 32 rotating registers (r32-r63), t17 is reused as data later
  6. .save pr, r64
  7. mov r64 = pr // save predicates, restored at UnalignedByteDone
  8. and t3 = -32, a1 // t3 = source rounded down to 32 bytes, used as prefetch cursor
  9. ;;
  10. lfetch [t3], 32 // prefetch source base + 0, then t3 += 32
  11. .save ar.lc, r65
  12. mov.i r65 = ar.lc // save loop counter, restored at UnalignedByteDone
  13. and t1 = 7, a1 // t1 = low 3 bits of the source address
  14. ;;
  15. .body
  16. lfetch [t3], 32 // prefetch source base + 32
  17. mov v0 = a0 // keep the original destination pointer in v0
  18. and t0 = 7, a0 // t0 = low 3 bits of the destination address
  19. ;;
  20. or t2 = t0, t1 // zero only when both pointers are 8-byte aligned
  21. cmp.eq pt1 = zero, a2 // pt1 = (a2 == 0)
  22. (pt1) br.ret.spnt brp // nothing to copy
  23. ;;
  24. lfetch [t3], 32 // prefetch source base + 64
  25. cmp.lt pt2 = 16, a2 // pt2 = (a2 > 16)
  26. nop.b 0
  27. ;;
  28. lfetch [t3], 32 // prefetch source base + 96
  29. cmp.lt pt6 = 127, a2 // pt6 = (a2 > 127), enables the further prefetches
  30. cmp.le pt4 = 8, a2 // pt4 = (a2 >= 8)
  31. ;;
  32. (pt6) lfetch [t3], 32 // prefetch source base + 128
  33. (pt4) cmp.eq.unc pt3 = 0, t2 // pt3 = pt4 and both aligned (.unc clears pt3 when pt4 is false)
  34. (pt4) cmp.eq.unc pt5 = t0, t1 // pt5 = pt4 and same low 3 address bits
  35. (pt3) br.cond.sptk QwordMoveUp
  36. (pt5) br.cond.spnt AlignedMove
  37. (pt2) br.cond.sptk UnalignedMove
  //
  // Byte-at-a-time copy of a2 bytes through a0 and a1. a2 is nonzero on
  // every entry: the fall-through above follows the a2 == 0 return, and
  // AlignedMove returns on zero before branching here.
  //
  38. ByteMoveUpLoop:
  39. ld1 t10 = [a1], 1
  40. nop.f 0
  41. add a2 = -1, a2
  42. ;;
  43. st1 [a0] = t10, 1
  44. cmp.ne pt1 = zero, a2 // pt1 = bytes remain
  45. (pt1) br.cond.sptk ByteMoveUpLoop
  46. nop.m 0
  47. nop.f 0
  48. br.ret.sptk brp
  //
  // Source and destination have different low 3 address bits and a2 > 16.
  // Step 1: copy single bytes until a1 is 8-byte aligned.
  // Step 2: run a software-pipelined loop that loads aligned source qwords
  //         and merges neighbouring pairs with shrp into aligned
  //         destination qwords.
  // Step 3: copy the remaining 0..7 bytes singly (UnalignedByteDone).
  //
  49. UnalignedMove:
  50. cmp.eq pt0 = 0, t1 // pt0 = source already 8-byte aligned
  51. sub t1 = 8, t1 // t1 = bytes needed to align the source
  52. (pt0) br.cond.spnt SkipUnalignedMoveByteLoop
  53. ;;
  54. UnalignedMoveByteLoop:
  55. ld1 t10 = [a1], 1
  56. add t1 = -1, t1
  57. add a2 = -1, a2
  58. ;;
  59. st1 [a0] = t10, 1
  60. cmp.eq p0, pt1 = zero, t1 // pt1 = (t1 != 0), the complement target of cmp.eq
  61. (pt1) br.cond.sptk UnalignedMoveByteLoop
  62. ;;
  63. SkipUnalignedMoveByteLoop:
  64. and t0 = 7, a0 // t0 = destination misalignment now that a1 is aligned
  65. mov pr.rot = 3<<16 // set p16 and p17, clear the other rotating predicates
  66. or t1 = a1, r0 // t1 = a1, source cursor for the pipelined loop
  67. ;;
  68. add t2 = a2, t0 // byte span from the aligned destination start to the end of the copy
  69. mov.i ar.ec = 32 // epilog count for br.ctop
  70. sub t21 = 8, t0
  71. ;;
  72. sub t4 = a0, t0 // t4 = destination rounded down to 8 bytes, store cursor
  73. shr t10 = t2, 3 // number of whole destination qwords to store
  74. shl t21 = t21, 3 // t21 = (8 - t0) * 8, a shift count in bits
  75. ;;
  76. ld8 r33 = [t4], 0 // current destination qword, supplies the t0 leading bytes fed to the first merge
  77. add t10 = -1,t10 // ar.lc takes the iteration count minus one
  78. and t2 = 7, t2 // t2 = tail bytes left after the qword loop
  79. ;;
  80. cmp.eq pt0 = 2, t0
  81. cmp.eq pt3 = 4, t0
  82. cmp.eq pt5 = 6, t0
  83. ;;
  84. nop.m 0
  85. shl r33 = r33,t21 // prime r33: move the t0 bytes to keep into its top bytes
  86. mov.i ar.lc = t10
  87. (pt0) br.cond.spnt SpecialLoop2
  88. (pt3) br.cond.spnt SpecialLoop4
  89. (pt5) br.cond.spnt SpecialLoop6
  90. cmp.eq pt1 = 3, t0
  91. cmp.eq pt4 = 5, t0
  92. cmp.eq pt6 = 7, t0
  93. (pt1) br.cond.spnt SpecialLoop3
  94. (pt4) br.cond.spnt SpecialLoop5
  95. (pt6) br.cond.spnt SpecialLoop7
  96. ;;
  //
  // SpecialLoopN handles destination misalignment t0 == N. The seven loops
  // differ only in the shrp count, which is (8 - N) * 8 bits. SpecialLoop1
  // is reached by fall-through when none of the branches above is taken.
  //   stage p16: load the next aligned source qword into r32
  //   stage p47: r10 = low 64 bits of (r62:r63) shifted right by the count,
  //              where r63 is the older qword and r62 the one loaded after it
  //   stage p48: store r10 at t4. The store sits in the same instruction
  //              group as the shrp, so it writes the r10 value produced
  //              one iteration earlier.
  // NOTE(review): brp looks like a branch-prediction hint for the br.ctop
  // that follows - confirm against the assembler documentation.
  //
  97. SpecialLoop1:
  98. (p16) ld8 r32 = [t1], 8
  99. nop.f 0
  100. brp.sptk.imp SpecialLoop1E, SpecialLoop1
  101. SpecialLoop1E:
  102. (p48) st8 [t4] = r10, 8
  103. (p47) shrp r10 = r62,r63,56 // t0 == 1: 1 byte from r63, 7 bytes from r62
  104. br.ctop.sptk.many SpecialLoop1
  105. br UnalignedByteDone
  106. SpecialLoop2:
  107. (p16) ld8 r32 = [t1], 8
  108. nop.f 0
  109. brp.sptk.imp SpecialLoop2E, SpecialLoop2
  110. SpecialLoop2E:
  111. (p48) st8 [t4] = r10, 8
  112. (p47) shrp r10 = r62,r63,48 // t0 == 2
  113. br.ctop.sptk.many SpecialLoop2
  114. br UnalignedByteDone
  115. SpecialLoop3:
  116. (p16) ld8 r32 = [t1], 8
  117. nop.f 0
  118. brp.sptk.imp SpecialLoop3E, SpecialLoop3
  119. SpecialLoop3E:
  120. (p48) st8 [t4] = r10, 8
  121. (p47) shrp r10 = r62,r63,40 // t0 == 3
  122. br.ctop.sptk.many SpecialLoop3
  123. br UnalignedByteDone
  124. SpecialLoop4:
  125. (p16) ld8 r32 = [t1], 8
  126. nop.f 0
  127. brp.sptk.imp SpecialLoop4E, SpecialLoop4
  128. SpecialLoop4E:
  129. (p48) st8 [t4] = r10, 8
  130. (p47) shrp r10 = r62,r63,32 // t0 == 4
  131. br.ctop.sptk.many SpecialLoop4
  132. br UnalignedByteDone
  133. SpecialLoop5:
  134. (p16) ld8 r32 = [t1], 8
  135. nop.f 0
  136. brp.sptk.imp SpecialLoop5E, SpecialLoop5
  137. SpecialLoop5E:
  138. (p48) st8 [t4] = r10, 8
  139. (p47) shrp r10 = r62,r63,24 // t0 == 5
  140. br.ctop.sptk.many SpecialLoop5
  141. br UnalignedByteDone
  142. SpecialLoop6:
  143. (p16) ld8 r32 = [t1], 8
  144. nop.f 0
  145. brp.sptk.imp SpecialLoop6E, SpecialLoop6
  146. SpecialLoop6E:
  147. (p48) st8 [t4] = r10, 8
  148. (p47) shrp r10 = r62,r63,16 // t0 == 6
  149. br.ctop.sptk.many SpecialLoop6
  150. br UnalignedByteDone
  151. SpecialLoop7:
  152. (p16) ld8 r32 = [t1], 8
  153. nop.f 0
  154. brp.sptk.imp SpecialLoop7E, SpecialLoop7
  155. SpecialLoop7E:
  156. (p48) st8 [t4] = r10, 8
  157. (p47) shrp r10 = r62,r63,8 // t0 == 7
  158. br.ctop.sptk.many SpecialLoop7;;
  //
  // Common exit of the SpecialLoops (SpecialLoop7 falls through to here).
  // Restores pr and ar.lc, then copies the t2 tail bytes singly from t1
  // to t4.
  //
  159. UnalignedByteDone:
  160. sub t1 = t1, t0 // back the load cursor up by t0 to the first source byte not yet stored
  161. mov pr = r64 // restore the predicates saved at entry
  162. mov.i ar.lc = r65 // restore the loop counter saved at entry
  163. ;;
  164. cmp.eq pt0 = zero, t2 // pt0 = no tail bytes
  165. (pt0) br.ret.spnt brp
  166. UnAlignedByteDoneLoop:
  167. ld1 t10 = [t1], 1
  168. add t2 = -1, t2
  169. ;;
  170. cmp.ne pt1 = zero, t2
  171. st1 [t4] = t10, 1
  172. (pt1) br.cond.sptk UnAlignedByteDoneLoop
  173. br.ret.spnt brp
  //
  // Source and destination share the same nonzero low 3 address bits and
  // a2 >= 8. Copy 8 - t0 bytes singly so both pointers become 8-byte
  // aligned, then return, finish in ByteMoveUpLoop, or fall through to
  // QwordMoveUp depending on what is left. The lfetch comments give the
  // prefetched offset from the 32-byte-aligned source base.
  //
  174. AlignedMove:
  175. add t4 = 64, t3 // second prefetch cursor, 64 bytes ahead of t3
  176. (pt6) lfetch [t3], 32 // prefetch source base + 160
  177. sub t22 = 8, t0 // t22 = bytes needed to reach alignment
  178. ;;
  179. (pt6) lfetch [t3], 64 // prefetch source base + 192
  180. (pt6) lfetch [t4], 96 // prefetch source base + 224
  181. sub a2 = a2, t22 // account for the alignment bytes up front
  182. ;;
  183. AlignedMoveByteLoop:
  184. ld1 t10 = [a1], 1
  185. nop.f 0
  186. add t22 = -1, t22
  187. ;;
  188. st1 [a0] = t10, 1
  189. cmp.ne pt1 = zero, t22
  190. (pt1) br.cond.sptk AlignedMoveByteLoop
  191. ;;
  192. (pt6) lfetch [t3], 32 // prefetch source base + 256
  193. cmp.eq.unc pt0 = zero, a2 // pt0 = nothing left
  194. cmp.gt pt2 = 8, a2 // pt2 = (a2 < 8), too short for a qword
  195. (pt6) lfetch [t4], 128 // prefetch source base + 320
  196. (pt0) br.ret.spnt brp
  197. (pt2) br.cond.sptk ByteMoveUpLoop
  198. ;;
  199. //
  200. // both src & dest are now 8-byte aligned
  201. //
  202. QwordMoveUp:
  203. add t3 = 128, a1 // prefetch cursor
  204. add t4 = 288, a1 // second prefetch cursor
  205. add t7 = 8, a1 // second load cursor, one qword ahead of a1
  206. add t8 = 8, a0 // second store cursor, one qword ahead of a0
  207. cmp.gt pt3 = 64, a2 // pt3 = (a2 < 64), skip the unrolled loop
  208. (pt3) br.cond.spnt QwordMoveUpLoop
  209. ;;
  //
  // Copies 64 bytes per iteration: eight qwords loaded through the a1/t7
  // cursors (each stepping 16) and stored through the a0/t8 cursors.
  // NOTE(review): the loop repeats only while at least 128 bytes remain
  // after the subtraction, so a remainder of 64..127 bytes is finished by
  // QwordMoveUpLoop - confirm that 128 rather than 64 is intended.
  //
  210. UnrolledQwordMoveUpLoop:
  211. ld8 t10 = [a1], 16
  212. ld8 t11 = [t7], 16
  213. add a2 = -64, a2
  214. ;;
  215. ld8 t12 = [a1], 16
  216. ld8 t13 = [t7], 16
  217. cmp.le pt3 = 128, a2 // pt3 = (a2 >= 128), gates the prefetches and the loop-back
  218. ;;
  219. ld8 t14 = [a1], 16
  220. ld8 t15 = [t7], 16
  221. cmp.gt pt2 = 8, a2 // pt2 = (a2 < 8), only bytes remain
  222. ;;
  223. ld8 t16 = [a1], 16
  224. ld8 t17 = [t7], 16
  225. ;;
  226. (pt3) lfetch [t3], 64
  227. (pt3) lfetch [t4], 64
  228. st8 [a0] = t10, 16
  229. st8 [t8] = t11, 16
  230. ;;
  231. st8 [a0] = t12, 16
  232. st8 [t8] = t13, 16
  233. ;;
  234. st8 [a0] = t14, 16
  235. st8 [t8] = t15, 16
  236. ;;
  237. st8 [a0] = t16, 16
  238. st8 [t8] = t17, 16
  239. (pt3) br.cond.dptk UnrolledQwordMoveUpLoop
  240. (pt2) br.cond.spnt ByteMoveUp
  241. ;;
  //
  // One qword per iteration while at least 8 bytes remain. a2 >= 8 on
  // every entry to this loop.
  //
  242. QwordMoveUpLoop:
  243. ld8 t10 = [a1], 8
  244. add a2 = -8, a2
  245. ;;
  246. cmp.le pt1 = 8, a2 // pt1 = (a2 >= 8)
  247. st8 [a0] = t10, 8
  248. (pt1) br.cond.sptk QwordMoveUpLoop
  249. ;;
  //
  // Tail of the aligned paths: copy the last 0..7 bytes singly.
  //
  250. ByteMoveUp:
  251. cmp.eq pt0 = zero, a2
  252. (pt0) br.ret.spnt brp
  253. ;;
  254. AlignedByteDoneLoop:
  255. ld1 t10 = [a1], 1
  256. add a2 = -1, a2
  257. ;;
  258. cmp.ne pt1 = zero, a2
  259. st1 [a0] = t10, 1
  260. (pt1) br.cond.sptk AlignedByteDoneLoop
  261. br.ret.spnt brp
  262. ;;
  263. LEAF_EXIT(memcpy)