Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

433 lines
9.5 KiB

  1. .section .text
  2. .proc memcpy#
  3. .global memcpy#
  4. .align 64
  5. .prologue
  //
  // memcpy entry point and dispatch.
  //
  // Register use visible in this procedure:
  //   r32 - address that is stored to (destination)
  //   r33 - address that is loaded from (source)
  //   r34 - byte count, compared here and decremented by the copy loops
  //   r8  - copy of the incoming r32 (presumably the return value, to be
  //         confirmed against the IA-64 calling convention)
  //   r10 / r11 - source / destination prefetch addresses, started 0x80 ahead
  //   r2 = r32 & 7, r3 = r33 & 7, r9 = r2 | r3 (zero only if both are 8-byte aligned)
  //
  // Dispatch, in the order the branches are tested:
  //   count <= 0 (signed compare) or r32 == r33   -> return
  //   count < 8                                    -> ByteMoveUp
  //   count < 0x40 and both pointers aligned       -> QwordMoveUpLoop
  //   count >= 8 and both pointers aligned         -> QwordMoveUp
  //   count >= 8 and r2 == r3 (same misalignment)  -> AlignedMove
  //   count >= 0x18                                -> UnalignedMove
  //   otherwise                                    -> falls through into ByteMoveUp
  //
  6. memcpy:
  7. { .mmi
  8. add r10 = 0x80, r33
  9. add r11 = 0x80, r32
  10. and r3 = 7, r33
  11. } { .mmi
  // p9 = count > 0, p7 = count <= 0
  12. cmp.gt p9, p7 = r34, r0
  13. mov r8 = r32
  14. and r2 = 7, r32
  15. ;;
  16. } { .mmi
  17. (p9) lfetch [r10], 0x40
  // p14 = count < 0x40, p15 = count >= 0x80 (p15 gates the extra prefetches below)
  18. cmp.gt p14 = 0x40, r34
  19. cmp.le p15 = 0x80, r34
  20. } { .mmb
  21. or r9 = r2, r3
  // For a positive count p7 is additionally set when both addresses are equal,
  // so the return below covers "nothing to copy" and "copy onto itself".
  22. (p9) cmp.eq p7 = r32, r33
  23. (p7) br.ret.spnt b0
  24. ;;
  25. } { .mmi
  26. lfetch [r10], 0x40
  27. lfetch.excl.nt1 [r11], 0x80
  // p10 = count >= 8, p11 = count < 8
  28. cmp.le p10, p11 = 8, r34
  29. } {
  30. .mbb
  // p9 is recomputed: count < 0x40 and both pointers 8-byte aligned
  31. (p14) cmp.eq.unc p9 = 0, r9
  32. (p11) br.cond.spnt ByteMoveUp // len < 8
  33. (p9) br.cond.spnt QwordMoveUpLoop // len < 64 and both src and dst 8-byte aligned
  34. ;;
  35. } { .mmi
  36. (p15) lfetch [r10], 0x40
  37. (p15) lfetch.excl.nt1 [r11], 0x80
  // r31 = bytes needed to bring the destination up to an 8-byte boundary
  38. sub r31 = 8, r2 // for AlignedMove
  39. } { .mmi
  // With count >= 8: p9 = both aligned, p11 = same low three address bits.
  // p8 = count >= 0x18.
  40. (p10) cmp.eq.unc p9 = 0, r9
  41. (p10) cmp.eq.unc p11 = r2, r3
  42. cmp.le p8 = 0x18, r34
  43. ;;
  44. } { .mmi
  45. (p15) lfetch [r10], 0x40
  46. (p15) lfetch.excl.nt1 [r11], 0x80
  // r3 = number of leading bytes UnalignedMove copies one at a time (9..16)
  47. sub r3 = 0x10, r3 // for UnalignedMove
  48. } { .bbb
  49. (p9) br.cond.sptk QwordMoveUp // len >= 8 and src and dst are 8-byte aligned
  50. (p11) br.cond.spnt AlignedMove // len >= 8 and src and dst have same alignment
  51. (p8) br.cond.sptk UnalignedMove // len >= 24
  52. ;;
  53. }
  54. // Byte-at-a-time copy. Reached for len < 8, for the short tail left by the
  // other paths, and by fall-through from the entry dispatch when
  // 8 <= len < 0x18 and the two pointers have different alignment.
  55. ByteMoveUp:
  // r20 / r21 run one byte ahead of r33 / r32. Both pointer pairs advance by 2,
  // so r33/r32 move the even bytes and r20/r21 the odd bytes.
  56. { .mmi
  57. add r20 = 1, r33
  58. add r21 = 1, r32
  // p6 = at least two bytes remain
  59. cmp.le p6 = 2, r34
  60. ;;
  61. }
  // Each pass moves up to four bytes.
  62. ByteMoveUpLoop:
  63. { .mmi
  64. ld1 r2 = [r33], 2
  65. (p6) ld1 r3 = [r20], 2
  66. nop.i 0
  67. } { .mmi
  // p7 = at least three bytes remain (p10 is its complement), p8 = at least four
  68. cmp.le p7,p10 = 3, r34
  69. cmp.le p8 = 4, r34
  70. nop.i 0
  71. ;;
  72. } { .mmi
  73. (p7) ld1 r28 = [r33], 2
  74. (p8) ld1 r29 = [r20], 2
  // p9 = more than four bytes remain, so another pass is needed
  75. (p8) cmp.lt.unc p9 = 4, r34
  76. } { .mmb
  77. st1 [r32] = r2, 2
  78. (p6) st1 [r21] = r3, 2
  // fewer than three bytes: everything has been stored, return
  79. (p10) br.ret.dptk b0
  80. ;;
  81. } { .mmi
  82. (p7) st1 [r32] = r28, 2
  83. (p8) st1 [r21] = r29, 2
  // Uses r34 before the subtraction below, so p6 for the next pass means
  // "at least two bytes remain after this pass".
  84. cmp.le p6 = 6, r34
  85. } { .mbb
  86. add r34 = -4, r34
  87. (p9) br.cond.dpnt ByteMoveUpLoop
  88. br.ret.dptk b0
  89. ;;
  90. }
  91. //
  92. // src & dest have same alignment
  // (same non-zero low three address bits). Copy single bytes until both
  // pointers reach an 8-byte boundary, then continue with the qword paths.
  // r31 is the byte count for that, set at entry to 8 - (r32 & 7).
  93. //
  94. AlignedMove:
  95. AlignedMoveByteLoop:
  96. { .mmi
  97. ld1 r19 = [r33], 1
  98. add r31 = -1, r31
  99. add r34 = -1, r34
  100. ;;
  101. } { .mmb
  102. st1 [r32] = r19, 1
  // loop until the alignment counter reaches zero
  103. cmp.ne p7 = r0, r31
  104. (p7) br.cond.sptk AlignedMoveByteLoop
  105. } { .mmi
  // p6 = nothing left, p8 = fewer than 8 bytes left,
  // p15 = at least 0x80 bytes left (gates the prefetches in QwordMoveUp)
  106. cmp.eq.unc p6 = r0, r34
  107. cmp.gt p8 = 8, r34
  108. cmp.le p15 = 0x80, r34
  109. } { .mbb
  110. nop.m 0
  111. (p6) br.ret.spnt b0
  112. (p8) br.cond.sptk ByteMoveUp
  113. ;;
  114. }
  // otherwise execution falls through into QwordMoveUp
  115. // both src & dest are 8-byte aligned
  // Fewer than 0x80 bytes go to the simple one-qword loop. Otherwise the
  // unrolled loop below moves 0x40 bytes per iteration.
  116. QwordMoveUp:
  117. { .mmi
  118. (p15) lfetch [r10], 0x40
  119. ;;
  120. (p15) lfetch [r10], 0x40
  // p14 = count < 0x80
  121. cmp.le p0, p14 = 0x80, r34
  122. } { .mmb
  // r25 / r22 are second source / destination pointers 8 bytes ahead of
  // r33 / r32. Every pointer advances by 0x10, so the two pairs interleave.
  123. add r22 = 8, r32
  124. add r25 = 8, r33
  125. (p14) br.cond.spnt QwordMoveUpLoop
  126. ;;
  127. }
  128. .align 32
  // Eight ld8 / st8 pairs per iteration. Loads into r20/r30 and r21/r31
  // alternate so that each store uses data loaded one instruction group earlier.
  129. UnrolledQwordMoveUpLoop:
  130. { .mmi
  131. ld8 r20 = [r25], 0x10
  132. ld8 r30 = [r33], 0x10
  133. add r34 = -0x40, r34
  134. ;;
  135. } { .mmi
  136. ld8 r21 = [r25], 0x10
  137. ld8 r31 = [r33], 0x10
  // p9 = at least another 0x40 bytes remain after this iteration
  138. cmp.le p9 = 0x40, r34
  139. } { .mmi
  140. st8 [r22] = r20, 0x10
  141. st8 [r32] = r30, 0x10
  // p8 = fewer than 8 bytes remain after this iteration
  142. cmp.gt p8 = 8, r34
  143. ;;
  144. } { .mmi
  145. ld8 r20 = [r25], 0x10
  146. ld8 r30 = [r33], 0x10
  // p15 = bit 6 of r10 is clear. r10 advances by 0x40 per iteration, so this
  // toggles and the destination prefetch (step 0x80) runs on alternate iterations.
  147. tbit.z p15 = r10, 6
  148. } { .mmi
  149. st8 [r22] = r21, 0x10
  150. st8 [r32] = r31, 0x10
  151. nop.i 0
  152. ;;
  153. } { .mmi
  154. ld8 r21 = [r25], 0x10
  155. ld8 r31 = [r33], 0x10
  156. nop.i 0
  157. } { .mmi
  158. st8 [r22] = r20, 0x10
  159. st8 [r32] = r30, 0x10
  160. nop.i 0
  161. ;;
  162. } { .mmi
  163. lfetch [r10], 0x40
  164. (p15) lfetch.excl.nt1 [r11], 0x80
  165. nop.i 0
  166. } { .mmb
  167. st8 [r22] = r21, 0x10
  168. st8 [r32] = r31, 0x10
  169. (p9) br.cond.sptk UnrolledQwordMoveUpLoop
  170. ;;
  171. } { .mbb
  // Done if nothing is left. A remainder below 8 goes to the byte copy.
  // A remainder of 8..0x3f falls through into QwordMoveUpLoop.
  172. cmp.eq p6 = r0, r34
  173. (p6) br.ret.spnt b0
  174. (p8) br.cond.spnt ByteMoveUp
  175. ;;
  176. }
  // Copies one 8-byte word per iteration from [r33] to [r32] while at least
  // 8 bytes remain. Any 1..7 byte remainder is handed to ByteMoveUp.
  177. QwordMoveUpLoop:
  178. { .mii
  179. ld8 r19 = [r33], 8
  180. add r34 = -8, r34
  181. nop.i 0
  182. ;;
  183. } { .mmi
  184. st8 [r32] = r19, 8
  // p7 = at least 8 bytes remain (unsigned compare), p6 = count is not zero
  185. cmp.leu p7 = 8, r34
  186. cmp.ne p6 = r0, r34
  187. } { .bbb
  188. (p7) br.cond.sptk QwordMoveUpLoop
  189. (p6) br.cond.spnt ByteMoveUp
  190. br.ret.sptk b0
  191. ;;
  192. }
  193. //
  194. // Copy long unaligned region
  // (count >= 0x18, source and destination with different alignment).
  //
  // Outline:
  //   1. Copy 0x10 - (src & 7) bytes one at a time. That leaves the source
  //      pointer on an 8-byte boundary, with the last bytes copied kept in r28.
  //   2. Run one of the SpecialLoopN software-pipelined loops, chosen by the
  //      destination misalignment N = r32 & 7. It loads aligned qwords, merges
  //      neighbouring ones with shrp and stores aligned qwords.
  //   3. UnalignedByteDone copies the remaining 0..7 bytes.
  //
  // RP1 / RP2 are the stage predicates and RR1 / RR2 the rotating registers
  // used by the SpecialLoopN kernels.
  195. //
  196. NUMBER_OF_ROTATING_REGISTERS = 24 //40
  197. RP1 = p39 //p55
  198. RP2 = p40 //p56
  199. RR1 = r54 //r70
  200. RR2 = r55 //r71
  201. UnalignedMove:
  202. { .mmi
  // New frame: 3 inputs, 21 locals, no outputs, 24 rotating registers.
  // The previous ar.pfs is copied to r26, which is not read again in this procedure.
  203. .regstk 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
  204. alloc r26 = ar.pfs, 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
  // NOTE(review): p13 is not written anywhere in this procedure, so whether the
  // three (p13) prefetches issue depends on the value p13 had on entry. Prefetches
  // are hints only, but confirm whether another predicate (p15?) was intended.
  205. (p13) lfetch [r10], 0x40
  // predicates are saved in r18 and restored after the pipelined loop
  206. .save pr, r18
  207. mov r18 = pr
  208. ;;
  209. } { .mmi
  210. (p13) lfetch [r10], 0x40
  211. (p13) lfetch.excl.nt1 [r11], 0x80
  // ar.lc is saved in r27 and restored in UnalignedByteDone
  212. .save ar.lc, r27
  213. mov.i r27 = ar.lc
  214. } { .mmi
  // r28 accumulates the bytes copied by the loop below
  215. mov r28 = r0
  216. ;;
  217. }
  218. .body
  // Copies r3 bytes (set at entry to 0x10 - (src & 7), i.e. 9..16) one at a time.
  219. UnalignedMoveByteLoop:
  220. { .mmi
  221. ld1 r19 = [r33], 1
  // p6 = this is not the last byte (tested on r3 before it is decremented)
  222. cmp.ne p6 = 1, r3
  // Loads the rotating predicates from the immediate: p16 and p17 set, the
  // rest clear. Repeated on every pass, only the final state matters.
  223. mov pr.rot = 3<<0x10
  224. ;;
  225. } { .mib
  226. add r3 = -1, r3
  // r28 = (r28 >> 8) | (r19 << 56): the new byte enters at the top, so after
  // eight or more passes r28 holds the last eight bytes read.
  227. shrp r28 = r19, r28, 8
  228. nop.b 0
  229. } { .mib
  230. st1 [r32] = r19, 1
  231. add r34 = -1, r34
  232. (p6) br.cond.sptk UnalignedMoveByteLoop
  233. ;;
  234. } { .mmi
  // From here r3 is the (now 8-byte aligned) source pointer, r2 the destination
  // misalignment, and r33 is seeded with the last eight bytes already copied so
  // the first merge in the pipelined loop has its older operand.
  235. mov r3 = r33
  236. and r2 = 7, r32
  237. mov r33 = r28
  238. ;;
  239. } { .mmi
  // r9 = bytes to write counted from the aligned-down destination,
  // r29 = destination rounded down to an 8-byte boundary
  240. add r9 = r34, r2
  241. sub r29 = r32, r2
  242. cmp.eq p6 = 2, r2
  243. ;;
  244. } { .mii
  245. cmp.eq p9 = 4, r2
  // r19 = number of whole qwords to store
  246. shr r19 = r9, 3
  247. cmp.eq p11 = 6, r2
  248. ;;
  249. } { .mii
  // loop count register takes (qwords - 1), r9 keeps the 0..7 byte tail
  250. add r19 = -1, r19
  251. and r9 = 7, r9
  252. mov.i ar.ec = NUMBER_OF_ROTATING_REGISTERS
  253. ;;
  254. } { .mmi
  255. lfetch [r10], 0x40
  256. lfetch.excl.nt1 [r11], 0x40
  257. mov.i ar.lc = r19
  258. } { .bbb
  // Dispatch on the destination misalignment r2. A value of 1 falls through to
  // SpecialLoop1. r2 is not expected to be 0 here, because equal source and
  // destination alignment was routed elsewhere by the entry code.
  259. (p6) br.cond.spnt SpecialLoop2
  260. (p9) br.cond.spnt SpecialLoop4
  261. (p11) br.cond.spnt SpecialLoop6
  262. ;;
  263. } { .mii
  264. cmp.eq p7 = 3, r2
  265. cmp.eq p10 = 5, r2
  266. cmp.eq p12 = 7, r2
  267. } { .bbb
  268. (p7) br.cond.spnt SpecialLoop3
  269. (p10) br.cond.spnt SpecialLoop5
  270. (p12) br.cond.spnt SpecialLoop7
  271. ;;
  272. }
  //
  // SpecialLoop1 .. SpecialLoop7: one software-pipelined kernel per destination
  // misalignment N (1..7). They differ only in the shrp count, 0x40 - 8*N.
  //
  // Each kernel iteration, driven by br.ctop with ar.lc / ar.ec set up before
  // the dispatch:
  //   (p16) loads the next aligned source qword into rotating register r32
  //   (RP1) merges two neighbouring source qwords (RR1 newer, RR2 older)
  //         into r28, shifted for the destination alignment
  //   (RP2) stores the r28 produced by the previous iteration to the aligned
  //         destination pointer r29
  //
  // After each loop r3 is moved back by r2, so it addresses the first source
  // byte not yet written, and the predicates saved in r18 are restored before
  // control reaches UnalignedByteDone.
  //
  273. .align 32
  // destination misaligned by 1 byte
  274. SpecialLoop1:
  275. { .mmi
  276. (p16) ld8 r32 = [r3], 8
  277. (RP2) st8 [r29] = r28, 8
  278. (RP1) shrp r28 = RR1, RR2, 0x38
  279. } { .mib
  280. br.ctop.sptk.many SpecialLoop1
  281. ;;
  282. } { .mib
  283. sub r3 = r3, r2
  284. mov pr = r18
  285. br UnalignedByteDone
  286. ;;
  287. }
  288. .align 32
  // destination misaligned by 2 bytes
  289. SpecialLoop2:
  290. { .mmi
  291. (p16) ld8 r32 = [r3], 8
  292. (RP2) st8 [r29] = r28, 8
  293. (RP1) shrp r28 = RR1, RR2, 0x30
  294. } { .mib
  295. br.ctop.sptk.many SpecialLoop2
  296. ;;
  297. } { .mib
  298. sub r3 = r3, r2
  299. mov pr = r18
  300. br UnalignedByteDone
  301. ;;
  302. }
  303. .align 32
  // destination misaligned by 3 bytes
  304. SpecialLoop3:
  305. { .mmi
  306. (p16) ld8 r32 = [r3], 8
  307. (RP2) st8 [r29] = r28, 8
  308. (RP1) shrp r28 = RR1, RR2, 0x28
  309. } { .mib
  310. br.ctop.sptk.many SpecialLoop3
  311. ;;
  312. } { .mib
  313. sub r3 = r3, r2
  314. mov pr = r18
  315. br UnalignedByteDone
  316. ;;
  317. }
  318. .align 32
  // destination misaligned by 4 bytes
  319. SpecialLoop4:
  320. { .mmi
  321. (p16) ld8 r32 = [r3], 8
  322. (RP2) st8 [r29] = r28, 8
  323. (RP1) shrp r28 = RR1, RR2, 0x20
  324. } { .mib
  325. br.ctop.sptk.many SpecialLoop4
  326. ;;
  327. } { .mib
  328. sub r3 = r3, r2
  329. mov pr = r18
  330. br UnalignedByteDone
  331. ;;
  332. }
  333. .align 32
  // destination misaligned by 5 bytes
  334. SpecialLoop5:
  335. { .mmi
  336. (p16) ld8 r32 = [r3], 8
  337. (RP2) st8 [r29] = r28, 8
  338. (RP1) shrp r28 = RR1, RR2, 0x18
  339. } { .mib
  340. br.ctop.sptk.many SpecialLoop5
  341. ;;
  342. } { .mib
  343. sub r3 = r3, r2
  344. mov pr = r18
  345. br UnalignedByteDone
  346. ;;
  347. }
  348. .align 32
  // destination misaligned by 6 bytes
  349. SpecialLoop6:
  350. { .mmi
  351. (p16) ld8 r32 = [r3], 8
  352. (RP2) st8 [r29] = r28, 8
  353. (RP1) shrp r28 = RR1, RR2, 0x10
  354. } { .mib
  355. br.ctop.sptk.many SpecialLoop6
  356. ;;
  357. } { .mib
  358. sub r3 = r3, r2
  359. mov pr = r18
  360. br UnalignedByteDone
  361. ;;
  362. }
  363. .align 32
  // destination misaligned by 7 bytes. This last kernel has no branch after it
  // and falls through into UnalignedByteDone.
  364. SpecialLoop7:
  365. { .mmi
  366. (p16) ld8 r32 = [r3], 8
  367. (RP2) st8 [r29] = r28, 8
  368. (RP1) shrp r28 = RR1, RR2, 0x8
  369. } { .mib
  370. br.ctop.sptk.many SpecialLoop7
  371. ;;
  372. } { .mii
  373. sub r3 = r3, r2
  374. mov pr = r18
  375. nop.i 0
  376. ;;
  377. }
  // Common exit of the SpecialLoopN kernels. Restores ar.lc from r27, then
  // copies the r9 (0..7) tail bytes from [r3] to [r29] one at a time.
  378. UnalignedByteDone:
  379. { .mib
  // p6 = no tail bytes, return straight away
  380. cmp.eq p6 = r0, r9
  381. mov.i ar.lc = r27
  382. (p6) br.ret.spnt b0
  383. ;;
  384. }
  385. UnAlignedByteDoneLoop:
  386. { .mii
  387. ld1 r19 = [r3], 1
  388. add r9 = -1, r9
  389. ;;
  // p7 = tail bytes still remain after this one
  390. cmp.ne p7 = r0, r9
  391. } { .mbb
  392. st1 [r29] = r19, 1
  393. (p7) br.cond.sptk UnAlignedByteDoneLoop
  394. br.ret.spnt b0
  395. ;;
  396. }
  397. .endp memcpy#