Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

603 lines
14 KiB

  1. #include "ksia64.h"
  //
  // memmove: copy a2 bytes from the buffer at a1 to the buffer at a0.
  //
  // Register roles below are read off this routine's own loads and stores.
  // The aliases (a0-a2, v0, t0-t22, pt0-pt6, zero, brp) and the LEAF_ENTRY /
  // LEAF_EXIT macros are not defined in this file - presumably ksia64.h.
  //
  //   a0 = destination pointer (copied to v0 before any byte is moved)
  //   a1 = source pointer
  //   a2 = byte count
  //
  // Direction: if a0 > a1 and a0 < a1 + a2 (unsigned compares) the copy runs
  // from the top of both buffers downward (CopyDown). Otherwise it runs upward.
  //
  // Upward paths:
  //   QwordMoveUp    - count >= 8 and both pointers 8-byte aligned
  //   AlignedMove    - count >= 8 and both pointers share the same nonzero
  //                    offset within a qword: byte-copy up to alignment first
  //   UnalignedMove  - count > 16 and the two offsets differ: aligned ld8/st8
  //                    merged with shrp in a software-pipelined loop
  //   ByteMoveUpLoop - everything else, one byte at a time
  //
  // pr is saved in r64 and ar.lc in r65. They are restored only on the two
  // unaligned paths, which are the only places here that write pr.rot / ar.lc.
  //
  // The //N tags on the lfetch lines are the byte offset of the prefetched
  // line from the initial value of t3.
  //
  2. LEAF_ENTRY(memmove)
  3. .prologue
  // NOTE(review): .regstk below (3,7,0,8) does not match the alloc that
  // follows (3,31,0,32) - confirm which frame description is intended.
  4. .regstk 3,7,0,8
  // Frame: 3 inputs, 31 locals, 0 outputs, 32 rotating registers. The old
  // ar.pfs lands in t17, which is later reused as a plain data temporary.
  5. alloc t17 = ar.pfs,3,31,0,32
  6. .save pr, r64
  7. mov r64 = pr // keep the incoming predicates
  8. and t3 = -32, a1 // t3 = source rounded down to 32 bytes (prefetch pointer)
  9. ;;
  10. lfetch [t3], 32 //0
  11. .save ar.lc, r65
  12. mov.i r65 = ar.lc // keep the incoming loop count
  13. and t1 = 7, a1 // t1 = source offset within its qword
  14. ;;
  15. .body
  16. lfetch [t3], 32 //32
  17. mov v0 = a0 // v0 = original destination (presumably the return value)
  18. and t0 = 7, a0 // t0 = destination offset within its qword
  19. ;;
  20. add t21 = a1, a2 // t21 = one past the last source byte
  21. cmp.gtu pt0 = a0, a1 // pt0 = destination above source
  22. or t2 = t0, t1 // t2 == 0 only when both pointers are 8-byte aligned
  23. ;;
  // pt0 stays set only if a0 > a1 AND a0 < a1 + a2, i.e. the destination
  // starts inside the source range and an upward copy would clobber it.
  24. (pt0) cmp.ltu.unc pt0 = a0, t21
  25. cmp.eq pt1 = zero, a2
  26. (pt1) br.ret.spnt brp // nothing to copy
  27. lfetch [t3], 32 //64
  28. cmp.lt pt2 = 16, a2 // pt2 = count > 16
  29. (pt0) br.cond.spnt CopyDown
  30. ;;
  31. lfetch [t3], 32 //96
  32. cmp.lt pt6 = 127, a2 // pt6 = count > 127, gates the extra prefetches
  33. cmp.le pt4 = 8, a2 // pt4 = count >= 8
  34. ;;
  35. (pt6) lfetch [t3], 32 //128
  36. (pt4) cmp.eq.unc pt3 = 0, t2 // pt3 = count >= 8 and both pointers aligned
  37. (pt4) cmp.eq.unc pt5 = t0, t1 // pt5 = count >= 8 and equal offsets
  38. (pt3) br.cond.sptk QwordMoveUp
  39. (pt5) br.cond.spnt AlignedMove
  40. (pt2) br.cond.sptk UnalignedMove
  //
  // Byte-at-a-time upward copy of a2 bytes. Reached by fall-through for short
  // or small mismatched copies, and from AlignedMove when fewer than 8 bytes
  // remain after alignment. a2 is nonzero on every entry (test is at bottom).
  //
  41. ByteMoveUpLoop:
  42. ld1 t10 = [a1], 1
  43. nop.f 0
  44. add a2 = -1, a2
  45. ;;
  46. st1 [a0] = t10, 1
  47. cmp.ne pt1 = zero, a2
  48. (pt1) br.cond.sptk ByteMoveUpLoop
  49. nop.m 0
  50. nop.f 0
  51. br.ret.sptk brp
  //
  // Upward copy, count > 16, source and destination offsets differ.
  // Step 1: byte-copy until the source pointer is 8-byte aligned.
  //
  52. UnalignedMove:
  53. cmp.eq pt0 = 0, t1 // source already aligned
  54. sub t1 = 8, t1 // t1 = bytes needed to align the source
  55. (pt0) br.cond.spnt SkipUnalignedMoveByteLoop
  56. ;;
  57. UnalignedMoveByteLoop:
  58. ld1 t10 = [a1], 1
  59. add t1 = -1, t1
  60. add a2 = -1, a2
  61. ;;
  62. st1 [a0] = t10, 1
  63. cmp.eq p0, pt1 = zero, t1 // pt1 = more alignment bytes to copy
  64. (pt1) br.cond.sptk UnalignedMoveByteLoop
  65. ;;
  //
  // Step 2: set up the pipelined loop. The source is now qword aligned and
  // the destination is t0 bytes past a qword boundary.
  //   t1  = source pointer for the loop
  //   t4  = destination rounded down to a qword boundary
  //   t2  = a2 + t0, split into t10 = whole qwords to store and t2 = leftover
  //         bytes (0-7) handled after the loop
  //   r33 = the qword currently at [t4], shifted left by (8 - t0) * 8 bits so
  //         it can stand in as the qword "before" the first source qword
  // NOTE(review): r32/r33 are presumably the stacked registers behind a0/a1.
  // From the ld8 into r33 onward this path only uses t1 and t4 as pointers.
  //
  66. SkipUnalignedMoveByteLoop:
  67. and t0 = 7, a0 // destination offset after the alignment bytes
  68. mov pr.rot = 3<<16 // rotating predicates: p16 = p17 = 1, the rest 0
  69. or t1 = a1, r0 // t1 = a1
  70. ;;
  71. add t2 = a2, t0
  72. mov.i ar.ec = 32 // epilog count for the 32-stage pipeline
  73. sub t21 = 8, t0
  74. ;;
  75. sub t4 = a0, t0
  76. shr t10 = t2, 3
  77. shl t21 = t21, 3 // t21 = (8 - t0) * 8, a bit count
  78. ;;
  79. ld8 r33 = [t4], 0
  80. add t10 = -1,t10 // ar.lc takes iterations minus one
  81. and t2 = 7, t2
  82. ;;
  83. cmp.eq pt0 = 2, t0
  84. cmp.eq pt3 = 4, t0
  85. cmp.eq pt5 = 6, t0
  86. ;;
  87. nop.m 0
  88. shl r33 = r33,t21 // Prime r33 (the original comment said r39)
  89. mov.i ar.lc = t10
  // Dispatch on t0. The loops differ only in the shrp shift, (8 - t0) * 8.
  90. (pt0) br.cond.spnt SpecialLoop2
  91. (pt3) br.cond.spnt SpecialLoop4
  92. (pt5) br.cond.spnt SpecialLoop6
  93. cmp.eq pt1 = 3, t0
  94. cmp.eq pt4 = 5, t0
  95. cmp.eq pt6 = 7, t0
  96. (pt1) br.cond.spnt SpecialLoop3
  97. (pt4) br.cond.spnt SpecialLoop5
  98. (pt6) br.cond.spnt SpecialLoop7
  99. ;;
  //
  // SpecialLoopN (N = t0): software-pipelined loop driven by br.ctop.
  //   stage p16: load the next source qword into r32
  //   stage p47: by now rotation has carried the older qword to r63 and the
  //              next one to r62. shrp takes 64 bits out of the 128-bit
  //              value r62:r63 (r62 is the high half) after shifting right
  //              by (8 - N) * 8 bits and leaves them in r10
  //   stage p48: store the r10 produced one iteration earlier to [t4]
  // p17 was preset so the primed r33 is treated like a loaded qword.
  // brp.sptk.imp is a branch-prediction hint only.
  // This is the fall-through case, t0 == 1, shift 56.
  //
  100. SpecialLoop1:
  101. (p16) ld8 r32 = [t1], 8
  102. nop.f 0
  103. brp.sptk.imp SpecialLoop1E, SpecialLoop1
  104. SpecialLoop1E:
  105. (p48) st8 [t4] = r10, 8
  106. (p47) shrp r10 = r62,r63,56
  107. br.ctop.sptk.many SpecialLoop1
  108. br UnalignedByteDone
  // t0 == 2, shift 48
  109. SpecialLoop2:
  110. (p16) ld8 r32 = [t1], 8
  111. nop.f 0
  112. brp.sptk.imp SpecialLoop2E, SpecialLoop2
  113. SpecialLoop2E:
  114. (p48) st8 [t4] = r10, 8
  115. (p47) shrp r10 = r62,r63,48
  116. br.ctop.sptk.many SpecialLoop2
  117. br UnalignedByteDone
  // t0 == 3, shift 40
  118. SpecialLoop3:
  119. (p16) ld8 r32 = [t1], 8
  120. nop.f 0
  121. brp.sptk.imp SpecialLoop3E, SpecialLoop3
  122. SpecialLoop3E:
  123. (p48) st8 [t4] = r10, 8
  124. (p47) shrp r10 = r62,r63,40
  125. br.ctop.sptk.many SpecialLoop3
  126. br UnalignedByteDone
  // t0 == 4, shift 32
  127. SpecialLoop4:
  128. (p16) ld8 r32 = [t1], 8
  129. nop.f 0
  130. brp.sptk.imp SpecialLoop4E, SpecialLoop4
  131. SpecialLoop4E:
  132. (p48) st8 [t4] = r10, 8
  133. (p47) shrp r10 = r62,r63,32
  134. br.ctop.sptk.many SpecialLoop4
  135. br UnalignedByteDone
  // t0 == 5, shift 24
  136. SpecialLoop5:
  137. (p16) ld8 r32 = [t1], 8
  138. nop.f 0
  139. brp.sptk.imp SpecialLoop5E, SpecialLoop5
  140. SpecialLoop5E:
  141. (p48) st8 [t4] = r10, 8
  142. (p47) shrp r10 = r62,r63,24
  143. br.ctop.sptk.many SpecialLoop5
  144. br UnalignedByteDone
  // t0 == 6, shift 16
  145. SpecialLoop6:
  146. (p16) ld8 r32 = [t1], 8
  147. nop.f 0
  148. brp.sptk.imp SpecialLoop6E, SpecialLoop6
  149. SpecialLoop6E:
  150. (p48) st8 [t4] = r10, 8
  151. (p47) shrp r10 = r62,r63,16
  152. br.ctop.sptk.many SpecialLoop6
  153. br UnalignedByteDone
  // t0 == 7, shift 8. Falls through into UnalignedByteDone.
  154. SpecialLoop7:
  155. (p16) ld8 r32 = [t1], 8
  156. nop.f 0
  157. brp.sptk.imp SpecialLoop7E, SpecialLoop7
  158. SpecialLoop7E:
  159. (p48) st8 [t4] = r10, 8
  160. (p47) shrp r10 = r62,r63,8
  161. br.ctop.sptk.many SpecialLoop7;;
  //
  // Pipelined loop finished. The loop loaded t0 more source bytes than it
  // stored, so the source pointer is backed up by t0 before the leftover t2
  // bytes are copied one at a time. pr and ar.lc are restored first.
  //
  162. UnalignedByteDone:
  163. sub t1 = t1, t0
  164. mov pr = r64
  165. mov.i ar.lc = r65
  166. ;;
  167. cmp.eq pt0 = zero, t2
  168. (pt0) br.ret.spnt brp // no leftover bytes
  169. UnAlignedByteDoneLoop:
  170. ld1 t10 = [t1], 1
  171. add t2 = -1, t2
  172. ;;
  173. cmp.ne pt1 = zero, t2
  174. st1 [t4] = t10, 1
  175. (pt1) br.cond.sptk UnAlignedByteDoneLoop
  176. br.ret.spnt brp
  //
  // Upward copy, count >= 8, source and destination share the same nonzero
  // offset t0. Byte-copy 8 - t0 bytes so both pointers become qword aligned,
  // then continue in QwordMoveUp (or ByteMoveUpLoop if fewer than 8 remain).
  // t4 is used here as a second prefetch pointer, 64 bytes ahead of t3.
  //
  177. AlignedMove:
  178. add t4 = 64, t3
  179. (pt6) lfetch [t3], 32 //160
  180. sub t22 = 8, t0 // t22 = bytes needed to reach alignment
  181. ;;
  182. (pt6) lfetch [t3], 64 //192
  183. (pt6) lfetch [t4], 96 //224
  184. sub a2 = a2, t22 // a2 = count left once the alignment bytes are done
  185. ;;
  186. AlignedMoveByteLoop:
  187. ld1 t10 = [a1], 1
  188. nop.f 0
  189. add t22 = -1, t22
  190. ;;
  191. st1 [a0] = t10, 1
  192. cmp.ne pt1 = zero, t22
  193. (pt1) br.cond.sptk AlignedMoveByteLoop
  194. ;;
  195. (pt6) lfetch [t3], 32 //256
  196. cmp.eq.unc pt0 = zero, a2
  197. cmp.gt pt2 = 8, a2 // pt2 = fewer than 8 bytes left
  198. (pt6) lfetch [t4], 128 //320
  199. (pt0) br.ret.spnt brp
  200. (pt2) br.cond.sptk ByteMoveUpLoop
  201. ;;
  202. //
  203. // both src & dest are now 8-byte aligned
  204. //
  // a2 >= 8 on entry. t3/t4 become prefetch pointers 128 and 288 bytes past
  // the source. t7/t8 are second source/destination pointers 8 bytes ahead of
  // a1/a0 so that two qwords can be moved per instruction group.
  205. QwordMoveUp:
  206. add t3 = 128, a1
  207. add t4 = 288, a1
  208. add t7 = 8, a1
  209. add t8 = 8, a0
  210. cmp.gt pt3 = 64, a2 // fewer than 64 bytes: skip the unrolled loop
  211. (pt3) br.cond.spnt QwordMoveUpLoop
  212. ;;
  //
  // 64 bytes per iteration: eight loads, then eight stores. After a2 is
  // reduced by 64, pt3 = remaining >= 128 (loop again and prefetch) and
  // pt2 = remaining < 8 (go straight to the byte tail). Anything in between
  // falls through to QwordMoveUpLoop. t17 is used as a data temporary here.
  //
  213. UnrolledQwordMoveUpLoop:
  214. ld8 t10 = [a1], 16
  215. ld8 t11 = [t7], 16
  216. add a2 = -64, a2
  217. ;;
  218. ld8 t12 = [a1], 16
  219. ld8 t13 = [t7], 16
  220. cmp.le pt3 = 128, a2
  221. ;;
  222. ld8 t14 = [a1], 16
  223. ld8 t15 = [t7], 16
  224. cmp.gt pt2 = 8, a2
  225. ;;
  226. ld8 t16 = [a1], 16
  227. ld8 t17 = [t7], 16
  228. ;;
  229. (pt3) lfetch [t3], 64
  230. (pt3) lfetch [t4], 64
  231. st8 [a0] = t10, 16
  232. st8 [t8] = t11, 16
  233. ;;
  234. st8 [a0] = t12, 16
  235. st8 [t8] = t13, 16
  236. ;;
  237. st8 [a0] = t14, 16
  238. st8 [t8] = t15, 16
  239. ;;
  240. st8 [a0] = t16, 16
  241. st8 [t8] = t17, 16
  242. (pt3) br.cond.dptk UnrolledQwordMoveUpLoop
  243. (pt2) br.cond.spnt ByteMoveUp
  244. ;;
  // One qword per iteration while at least 8 bytes remain (test at bottom).
  245. QwordMoveUpLoop:
  246. ld8 t10 = [a1], 8
  247. add a2 = -8, a2
  248. ;;
  249. cmp.le pt1 = 8, a2
  250. st8 [a0] = t10, 8
  251. (pt1) br.cond.sptk QwordMoveUpLoop
  252. ;;
  // Byte tail of the aligned upward copy: 0-7 bytes left in a2.
  253. ByteMoveUp:
  254. cmp.eq pt0 = zero, a2
  255. (pt0) br.ret.spnt brp
  256. ;;
  257. AlignedByteDoneLoop:
  258. ld1 t10 = [a1], 1
  259. add a2 = -1, a2
  260. ;;
  261. cmp.ne pt1 = zero, a2
  262. st1 [a0] = t10, 1
  263. (pt1) br.cond.sptk AlignedByteDoneLoop
  264. br.ret.spnt brp
  265. ;;
  //
  // Downward copy, used when the destination starts inside the source range.
  //   t20 = a0 + a2, one past the last destination byte
  //   t21 = a1 + a2, one past the last source byte
  // Count < 16 goes to ByteMoveDown, differing start offsets (t0 != t1) go
  // to UnalignedMoveDown, equal offsets fall through to TailMove/Block8Move.
  // The zero-count test repeats one already made before the branch here.
  //
  266. CopyDown:
  267. cmp.eq pt0 = zero, a2
  268. cmp.ne pt6 = t0, t1
  269. (pt0) br.ret.spnt brp // return if length is zero
  270. cmp.gt pt4 = 16, a2
  271. add t20 = a2, a0
  272. add t21 = a2, a1
  273. nop.m 0
  274. (pt4) br.cond.sptk ByteMoveDown // less than 16 bytes to copy
  275. (pt6) br.cond.spnt UnalignedMoveDown // incompatible alignment
  276. ;;
  277. nop.m 0
  278. nop.m 0
  279. and t22 = 0x7, t21 // t22 = bytes above the last qword boundary
  280. ;;
  281. add t20 = -1, t20 // point at the last destination byte
  282. add t21 = -1, t21 // point at the last source byte
  283. sub a2 = a2, t22 // a2 = count left after the tail bytes
  284. ;;
  // Copy the t22 tail bytes downward so both end pointers land on a boundary.
  285. TailMove:
  286. cmp.eq pt0, pt1 = zero, t22
  287. ;;
  288. (pt1) ld1 t10 = [t21], -1
  289. (pt1) add t22 = -1, t22
  290. ;;
  291. (pt1) st1 [t20] = t10, -1
  292. (pt1) br.cond.sptk TailMove
  // Step both pointers back 7 more bytes so they address whole qwords.
  293. Block8Move:
  294. nop.m 0
  295. add t20 = -7, t20
  296. add t21 = -7, t21
  297. ;;
  // One qword per iteration, moving downward, while a2 >= 8.
  298. Block8MoveLoop:
  299. cmp.gt pt5, pt6 = 8, a2 // pt6 = at least 8 bytes left
  300. ;;
  301. (pt6) ld8 t10 = [t21], -8
  302. (pt6) add a2 = -8, a2
  303. ;;
  304. (pt6) st8 [t20] = t10, -8
  305. (pt6) br.cond.sptk Block8MoveLoop
  // Put both pointers back to one past the remaining bytes, which is what
  // ByteMoveDown expects on entry.
  306. add t20 = 8, t20 // adjust dest
  307. add t21 = 8, t21 // adjust source
  308. br.cond.sptk ByteMoveDown
  309. ;;
  //
  // Downward copy, count >= 16, source and destination offsets differ.
  // Step 1: byte-copy downward until the source end is qword aligned.
  // t1 = number of bytes above the last source qword boundary.
  //
  310. UnalignedMoveDown:
  311. and t1 = 7, t21
  312. ;;
  313. cmp.eq pt0 = 0, t1
  314. (pt0) br.cond.spnt SkipUnalignedMoveDownByteLoop
  315. ;;
  316. add t20 = -1, t20 // point at the last destination byte
  317. add t21 = -1, t21 // point at the last source byte
  318. ;;
  319. UnalignedMoveDownByteLoop:
  320. ld1 t10 = [t21], -1
  321. add t1 = -1, t1
  322. add a2 = -1, a2
  323. ;;
  324. st1 [t20] = t10, -1
  325. cmp.eq p0, pt1 = zero, t1
  326. (pt1) br.cond.sptk UnalignedMoveDownByteLoop
  327. ;;
  328. add t20 = 1, t20 // back to one past the remaining bytes
  329. add t21 = 1, t21
  330. ;;
  //
  // Step 2: set up the downward pipelined loop.
  //   t1  = address of the last whole source qword (t21 - 8)
  //   t0  = offset of the destination end within its qword
  //   t4  = destination end rounded down to a qword boundary
  //   t7  = 8 - t0
  //   t2  = a2 + t7, split into t10 = whole qwords to store and t2 = leftover
  //         bytes (0-7) handled after the loop
  //   r33 = the qword currently at [t4], shifted right by t0 * 8 bits so it
  //         can stand in as the qword "above" the first source qword
  // NOTE(review): r32/r33 are presumably the stacked registers behind a0/a1.
  // From the ld8 into r33 onward this path only uses t1 and t4 as pointers.
  //
  331. SkipUnalignedMoveDownByteLoop:
  332. add t21 = -8, t21
  333. ;;
  334. and t0 = 7, t20
  335. mov pr.rot = 3<<16 // rotating predicates: p16 = p17 = 1, the rest 0
  336. or t1 = t21, r0 // t1 = t21
  337. ;;
  338. sub t7 = 8, t0
  339. ;;
  340. add t2 = a2, t7
  341. mov.i ar.ec = 32 // epilog count for the 32-stage pipeline
  342. ;;
  343. sub t4 = t20, t0
  344. shr t10 = t2, 3
  345. shl t6 = t0, 3 // t6 = t0 * 8, a bit count
  346. ;;
  347. ld8 r33 = [t4], 0
  348. add t10 = -1,t10 // ar.lc takes iterations minus one
  349. and t2 = 7, t2
  350. ;;
  351. cmp.eq pt0 = 2, t0
  352. cmp.eq pt3 = 4, t0
  353. cmp.eq pt5 = 6, t0
  354. ;;
  355. shr r33 = r33,t6 // Prime r33 (the original comment said r39)
  356. mov.i ar.lc = t10
  // Dispatch on t0. The loops differ only in the shrp shift, (8 - t0) * 8.
  357. (pt0) br.cond.spnt SpecialLoopDown2
  358. (pt3) br.cond.spnt SpecialLoopDown4
  359. (pt5) br.cond.spnt SpecialLoopDown6
  360. cmp.eq pt1 = 3, t0
  361. cmp.eq pt4 = 5, t0
  362. cmp.eq pt6 = 7, t0
  363. (pt1) br.cond.spnt SpecialLoopDown3
  364. (pt4) br.cond.spnt SpecialLoopDown5
  365. (pt6) br.cond.spnt SpecialLoopDown7
  366. ;;
  //
  // SpecialLoopDownN (N = t0): same pipeline shape as the upward loops but
  // walking downward (post-decrement by 8). The shrp operands are swapped:
  // r63, the qword loaded earlier (higher address), is the high half and r62,
  // the next qword loaded, is the low half. Shift is (8 - N) * 8 bits.
  // This is the fall-through case, t0 == 1, shift 56.
  //
  367. SpecialLoopDown1:
  368. (p16) ld8 r32 = [t1], -8
  369. nop.f 0
  370. brp.sptk.imp SpecialLoopDown1E, SpecialLoopDown1
  371. SpecialLoopDown1E:
  372. (p48) st8 [t4] = r10, -8
  373. (p47) shrp r10 = r63,r62,56
  374. br.ctop.sptk.many SpecialLoopDown1
  375. br UnalignedByteDownDone
  // t0 == 2, shift 48
  376. SpecialLoopDown2:
  377. (p16) ld8 r32 = [t1], -8
  378. nop.f 0
  379. brp.sptk.imp SpecialLoopDown2E, SpecialLoopDown2
  380. SpecialLoopDown2E:
  381. (p48) st8 [t4] = r10, -8
  382. (p47) shrp r10 = r63,r62,48
  383. br.ctop.sptk.many SpecialLoopDown2
  384. br UnalignedByteDownDone
  // t0 == 3, shift 40
  385. SpecialLoopDown3:
  386. (p16) ld8 r32 = [t1], -8
  387. nop.f 0
  388. brp.sptk.imp SpecialLoopDown3E, SpecialLoopDown3
  389. SpecialLoopDown3E:
  390. (p48) st8 [t4] = r10, -8
  391. (p47) shrp r10 = r63,r62,40
  392. br.ctop.sptk.many SpecialLoopDown3
  393. br UnalignedByteDownDone
  // t0 == 4, shift 32
  394. SpecialLoopDown4:
  395. (p16) ld8 r32 = [t1], -8
  396. nop.f 0
  397. brp.sptk.imp SpecialLoopDown4E, SpecialLoopDown4
  398. SpecialLoopDown4E:
  399. (p48) st8 [t4] = r10, -8
  400. (p47) shrp r10 = r63,r62,32
  401. br.ctop.sptk.many SpecialLoopDown4
  402. br UnalignedByteDownDone
  // t0 == 5, shift 24
  403. SpecialLoopDown5:
  404. (p16) ld8 r32 = [t1], -8
  405. nop.f 0
  406. brp.sptk.imp SpecialLoopDown5E, SpecialLoopDown5
  407. SpecialLoopDown5E:
  408. (p48) st8 [t4] = r10, -8
  409. (p47) shrp r10 = r63,r62,24
  410. br.ctop.sptk.many SpecialLoopDown5
  411. br UnalignedByteDownDone
  // t0 == 6, shift 16
  412. SpecialLoopDown6:
  413. (p16) ld8 r32 = [t1], -8
  414. nop.f 0
  415. brp.sptk.imp SpecialLoopDown6E, SpecialLoopDown6
  416. SpecialLoopDown6E:
  417. (p48) st8 [t4] = r10, -8
  418. (p47) shrp r10 = r63,r62,16
  419. br.ctop.sptk.many SpecialLoopDown6
  420. br UnalignedByteDownDone
  // t0 == 7, shift 8. Falls through into UnalignedByteDownDone.
  421. SpecialLoopDown7:
  422. (p16) ld8 r32 = [t1], -8
  423. nop.f 0
  424. brp.sptk.imp SpecialLoopDown7E, SpecialLoopDown7
  425. SpecialLoopDown7E:
  426. (p48) st8 [t4] = r10, -8
  427. (p47) shrp r10 = r63,r62,8
  428. br.ctop.sptk.many SpecialLoopDown7;;
  //
  // Pipelined loop finished. Both pointers address the qword below the last
  // one processed. Adding 7 moves them to that qword's highest byte, and the
  // source gets a further t7 = 8 - t0 because the loop loaded that many more
  // source bytes than it stored. pr and ar.lc are restored, then the leftover
  // t2 bytes are copied downward one at a time.
  //
  429. UnalignedByteDownDone:
  430. add t1 = 7, t1
  431. add t4 = 7, t4
  432. ;;
  433. add t1 = t1, t7
  434. mov pr = r64
  435. mov.i ar.lc = r65
  436. ;;
  437. cmp.eq pt0 = zero, t2
  438. (pt0) br.ret.spnt brp // no leftover bytes
  439. ;;
  440. UnAlignedByteDoneDownLoop:
  441. ld1 t10 = [t1], -1
  442. add t2 = -1, t2
  443. ;;
  444. cmp.ne pt1 = zero, t2
  445. st1 [t4] = t10, -1
  446. (pt1) br.cond.sptk UnAlignedByteDoneDownLoop
  447. br.ret.spnt brp
  //
  // Byte-at-a-time downward copy of a2 bytes (a2 may be zero). On entry t20
  // and t21 are one past the remaining destination and source bytes, so both
  // are stepped back by one first.
  //
  448. ByteMoveDown:
  449. nop.m 0
  450. add t20 = -1, t20 // adjust destination (t20 = a0 + a2 side)
  451. add t21 = -1, t21 // adjust source (t21 = a1 + a2 side)
  452. ;;
  453. ByteMoveDownLoop:
  454. cmp.ne pt1 = zero, a2
  455. ;;
  456. (pt1) ld1 t10 = [t21], -1
  457. (pt1) add a2 = -1, a2
  458. ;;
  459. (pt1) st1 [t20] = t10, -1
  460. (pt1) br.cond.sptk ByteMoveDownLoop
  461. br.ret.spnt brp
  462. ;;
  463. LEAF_EXIT(memmove)