Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

618 lines
14 KiB

  1. //depot/Lab01_N/Base/crts/crtw32/string/ia64/memmove.s#4 - integrate change 31248 (text)
  2. #include "ksia64.h"
  //
  // memmove - copy a2 bytes from the buffer at a1 to the buffer at a0.
  //
  //   a0 = destination pointer (every store in this routine goes through a0 or a copy of it)
  //   a1 = source pointer      (every load in this routine goes through a1 or a copy of it)
  //   a2 = byte count
  //   v0 = receives the original a0 before a0 is advanced
  //
  // The register aliases (a0-a2, v0, t*, pt*, zero, brp) and the LEAF_ENTRY /
  // LEAF_EXIT macros are not defined in this file; presumably they come from ksia64.h.
  //
  // Dispatch performed by the entry code:
  //   - a2 == 0                               -> return immediately
  //   - a1 < a0 < a1 + a2 (unsigned)          -> CopyDown (copy from the high end towards the low end)
  //   - otherwise the copy ascends:
  //       a2 >= 8 and both pointers 8-aligned -> QwordMoveUp
  //       a2 >= 8 and (a0 & 7) == (a1 & 7)    -> AlignedMove (byte-copy to alignment, then QwordMoveUp)
  //       a2 > 24                             -> UnalignedMove (aligned ld8 / shrp merge / aligned st8)
  //       anything else                       -> ByteMoveUpLoop
  //
  // r64 holds the incoming pr and r65 the incoming ar.lc; they are written back
  // only at UnalignedByteDone and UnalignedByteDownDone, the two places reached
  // after pr.rot and ar.lc have been modified.
  //
  3. LEAF_ENTRY(memmove)
  4. .prologue
  5. .regstk 3,7,0,8
  // NOTE(review): the .regstk operands above differ from the alloc operands below - confirm this is intended.
  6. alloc t17 = ar.pfs,3,31,0,32
  7. .save pr, r64
  8. mov r64 = pr
  9. and t3 = -32, a1 // t3 = source address rounded down to a multiple of 32; used as the lfetch pointer
  10. ;;
  11. lfetch [t3], 32 //0
  12. .save ar.lc, r65
  13. mov.i r65 = ar.lc
  14. and t1 = 7, a1 // t1 = source misalignment within an 8-byte word
  15. ;;
  16. .body
  17. lfetch [t3], 32 //32
  18. mov v0 = a0 // capture the destination pointer in v0 before a0 is advanced
  19. and t0 = 7, a0 // t0 = destination misalignment within an 8-byte word
  20. ;;
  21. add t21 = a1, a2 // t21 = one past the last source byte
  22. cmp.gtu pt0 = a0, a1 // pt0 = destination above source (unsigned)
  23. or t2 = t0, t1 // t2 == 0 only when both pointers are 8-byte aligned
  24. ;;
  25. (pt0) cmp.ltu.unc pt0 = a0, t21 // pt0 = a1 < a0 < a1 + a2: destination starts inside the source range
  26. cmp.eq pt1 = zero, a2
  27. (pt1) br.ret.spnt brp // nothing to copy
  28. lfetch [t3], 32 //64
  29. cmp.lt pt2 = 24, a2 // pt2 = more than 24 bytes; consumed by UnalignedMove and by UnalignedMoveDown
  30. (pt0) br.cond.spnt CopyDown
  31. ;;
  32. lfetch [t3], 32 //96
  33. cmp.lt pt6 = 127, a2 // pt6 = at least 128 bytes; gates the additional lfetch instructions
  34. cmp.le pt4 = 8, a2 // pt4 = at least 8 bytes
  35. ;;
  36. (pt6) lfetch [t3], 32 //128
  37. (pt4) cmp.eq.unc pt3 = 0, t2 // pt3 = at least 8 bytes and both pointers 8-byte aligned
  38. (pt4) cmp.eq.unc pt5 = t0, t1 // pt5 = at least 8 bytes and both pointers share the same misalignment
  39. (pt3) br.cond.sptk QwordMoveUp
  40. (pt5) br.cond.spnt AlignedMove
  41. (pt2) br.cond.sptk UnalignedMove
  //
  // Ascending byte-at-a-time copy of a2 bytes. Reached by fall-through from the
  // dispatch above and from AlignedMove when fewer than 8 bytes remain.
  // The loop body runs before a2 is tested.
  //
  42. ByteMoveUpLoop:
  43. ld1 t10 = [a1], 1
  44. nop.f 0
  45. add a2 = -1, a2
  46. ;;
  47. st1 [a0] = t10, 1
  48. cmp.ne pt1 = zero, a2
  49. (pt1) br.cond.sptk ByteMoveUpLoop
  50. nop.m 0
  51. nop.f 0
  52. br.ret.sptk brp
  //
  // Ascending copy for pointers whose misalignments differ (entered with pt2 set).
  // Bytes are copied until the source is 8-byte aligned, then 8 more bytes are
  // copied, and the remainder goes through one of the SpecialLoopN loops.
  //
  53. UnalignedMove:
  54. cmp.eq pt0, pt1 = 0, t1 // pt0 = source already 8-byte aligned
  55. sub t1 = 8, t1 // t1 = bytes needed to bring the source to an 8-byte boundary
  56. (pt0) br.cond.spnt SkipUnalignedMoveByteLoop
  57. ;;
  // Copy t1 bytes one at a time, charging each against a2.
  58. UnalignedMoveByteLoop:
  59. ld1 t10 = [a1], 1
  60. add t1 = -1, t1
  61. add a2 = -1, a2
  62. ;;
  63. st1 [a0] = t10, 1
  64. cmp.eq p0, pt1 = zero, t1 // pt1 = more bytes to go in this pass
  65. (pt1) br.cond.sptk UnalignedMoveByteLoop
  66. ;;
  // pt1 is false on every arrival here, so the compare below is what triggers
  // one additional pass of the byte loop with t1 = 8. It also clears pt2, so the
  // second arrival falls through.
  67. SkipUnalignedMoveByteLoop:
  68. (pt2) cmp.eq pt1, pt2 = r0, r0 // if pt2: pt1 <- true, pt2 <- false (r0 == r0 always holds)
  69. mov t1 = 8
  70. (pt1) br.cond.dptk UnalignedMoveByteLoop
  71. ;;
  // Set-up for the pipelined loops.
  72. and t0 = 7, a0 // t0 = current destination misalignment; selects the loop variant
  73. mov pr.rot = 3<<16 // load the rotating predicates from the immediate (bits 16 and 17 set)
  74. or t1 = a1, r0 // t1 = working source pointer (copy of a1)
  75. ;;
  76. add t2 = a2, t0 // remaining count plus the destination misalignment
  77. mov.i ar.ec = 32 // epilog count for br.ctop
  78. sub t21 = 8, t0
  79. ;;
  80. sub t4 = a0, t0 // t4 = destination rounded down to an 8-byte boundary
  81. shr t10 = t2, 3 // number of 8-byte words in t2
  82. shl t21 = t21, 3 // (8 - t0) converted to a bit count
  83. ;;
  84. ld8 r33 = [t4], 0 // read the destination word that already holds the t0 bytes written below a0
  85. add t10 = -1,t10 // value loaded into ar.lc below
  86. and t2 = 7, t2 // t2 = byte count handled by UnAlignedByteDoneLoop afterwards
  87. ;;
  88. cmp.eq pt0 = 2, t0
  89. cmp.eq pt3 = 4, t0
  90. cmp.eq pt5 = 6, t0
  91. ;;
  92. nop.m 0
  // NOTE(review): the comment on the next line says r39 but the instruction writes r33 - confirm how this maps onto the rotating registers.
  93. shl r33 = r33,t21 // Prime r39
  94. mov.i ar.lc = t10
  95. (pt0) br.cond.spnt SpecialLoop2
  96. (pt3) br.cond.spnt SpecialLoop4
  97. (pt5) br.cond.spnt SpecialLoop6
  98. cmp.eq pt1 = 3, t0
  99. cmp.eq pt4 = 5, t0
  100. cmp.eq pt6 = 7, t0
  101. (pt1) br.cond.spnt SpecialLoop3
  102. (pt4) br.cond.spnt SpecialLoop5
  103. (pt6) br.cond.spnt SpecialLoop7
  104. ;;
  //
  // SpecialLoop1..SpecialLoop7: one br.ctop loop per value of t0; falling through
  // the branches above selects SpecialLoop1. In every variant:
  //   (p16) loads the next source word into rotating register r32,
  //   (p47) shrp builds r10 from the r62/r63 pair,
  //   (p48) stores r10 to the aligned destination pointer t4.
  // The shrp shift count is 64 - 8*N for SpecialLoopN.
  //
  105. SpecialLoop1:
  106. (p16) ld8 r32 = [t1], 8
  107. nop.f 0
  108. brp.sptk.imp SpecialLoop1, SpecialLoop1E
  109. SpecialLoop1E:
  110. (p48) st8 [t4] = r10, 8
  111. (p47) shrp r10 = r62,r63,56
  112. br.ctop.sptk.many SpecialLoop1
  113. br UnalignedByteDone
  114. SpecialLoop2:
  115. (p16) ld8 r32 = [t1], 8
  116. nop.f 0
  117. brp.sptk.imp SpecialLoop2, SpecialLoop2E
  118. SpecialLoop2E:
  119. (p48) st8 [t4] = r10, 8
  120. (p47) shrp r10 = r62,r63,48
  121. br.ctop.sptk.many SpecialLoop2
  122. br UnalignedByteDone
  123. SpecialLoop3:
  124. (p16) ld8 r32 = [t1], 8
  125. nop.f 0
  126. brp.sptk.imp SpecialLoop3, SpecialLoop3E
  127. SpecialLoop3E:
  128. (p48) st8 [t4] = r10, 8
  129. (p47) shrp r10 = r62,r63,40
  130. br.ctop.sptk.many SpecialLoop3
  131. br UnalignedByteDone
  132. SpecialLoop4:
  133. (p16) ld8 r32 = [t1], 8
  134. nop.f 0
  135. brp.sptk.imp SpecialLoop4, SpecialLoop4E
  136. SpecialLoop4E:
  137. (p48) st8 [t4] = r10, 8
  138. (p47) shrp r10 = r62,r63,32
  139. br.ctop.sptk.many SpecialLoop4
  140. br UnalignedByteDone
  141. SpecialLoop5:
  142. (p16) ld8 r32 = [t1], 8
  143. nop.f 0
  144. brp.sptk.imp SpecialLoop5, SpecialLoop5E
  145. SpecialLoop5E:
  146. (p48) st8 [t4] = r10, 8
  147. (p47) shrp r10 = r62,r63,24
  148. br.ctop.sptk.many SpecialLoop5
  149. br UnalignedByteDone
  150. SpecialLoop6:
  151. (p16) ld8 r32 = [t1], 8
  152. nop.f 0
  153. brp.sptk.imp SpecialLoop6, SpecialLoop6E
  154. SpecialLoop6E:
  155. (p48) st8 [t4] = r10, 8
  156. (p47) shrp r10 = r62,r63,16
  157. br.ctop.sptk.many SpecialLoop6
  158. br UnalignedByteDone
  159. SpecialLoop7:
  160. (p16) ld8 r32 = [t1], 8
  161. nop.f 0
  162. brp.sptk.imp SpecialLoop7, SpecialLoop7E
  163. SpecialLoop7E:
  164. (p48) st8 [t4] = r10, 8
  165. (p47) shrp r10 = r62,r63,8
  166. br.ctop.sptk.many SpecialLoop7;;
  //
  // Common exit of the SpecialLoopN loops (SpecialLoop7 falls through into it):
  // restore pr and ar.lc, then copy the t2 leftover bytes one at a time.
  //
  167. UnalignedByteDone:
  168. sub t1 = t1, t0 // step the source pointer back by the destination misalignment
  169. mov pr = r64 // restore the predicates saved at entry
  170. mov.i ar.lc = r65 // restore the loop counter saved at entry
  171. ;;
  172. cmp.eq pt0 = zero, t2
  173. (pt0) br.ret.spnt brp // no leftover bytes
  174. UnAlignedByteDoneLoop:
  175. ld1 t10 = [t1], 1
  176. add t2 = -1, t2
  177. ;;
  178. cmp.ne pt1 = zero, t2
  179. st1 [t4] = t10, 1
  180. (pt1) br.cond.sptk UnAlignedByteDoneLoop
  181. br.ret.spnt brp
  //
  // Ascending copy for pointers that share the same non-zero misalignment
  // (entered via pt5 after the pt3 branch was not taken): copy 8 - t0 bytes to
  // reach an 8-byte boundary, then continue in QwordMoveUp or ByteMoveUpLoop.
  //
  182. AlignedMove:
  183. add t4 = 64, t3 // second prefetch pointer, 64 bytes ahead of t3
  184. (pt6) lfetch [t3], 32 //160
  185. sub t22 = 8, t0 // t22 = bytes needed to reach an 8-byte boundary
  186. ;;
  187. (pt6) lfetch [t3], 64 //192
  188. (pt6) lfetch [t4], 96 //224
  189. sub a2 = a2, t22 // charge the alignment bytes against the count up front
  190. ;;
  191. AlignedMoveByteLoop:
  192. ld1 t10 = [a1], 1
  193. nop.f 0
  194. add t22 = -1, t22
  195. ;;
  196. st1 [a0] = t10, 1
  197. cmp.ne pt1 = zero, t22
  198. (pt1) br.cond.sptk AlignedMoveByteLoop
  199. ;;
  200. (pt6) lfetch [t3], 32 //256
  201. cmp.eq.unc pt0 = zero, a2
  202. cmp.gt pt2 = 8, a2 // pt2 = fewer than 8 bytes remain
  203. (pt6) lfetch [t4], 128 //320
  204. (pt0) br.ret.spnt brp // everything was copied by the alignment loop
  205. (pt2) br.cond.sptk ByteMoveUpLoop
  206. ;;
  207. //
  208. // both src & dest are now 8-byte aligned
  209. //
  210. QwordMoveUp:
  211. add t3 = 128, a1 // prefetch pointer 128 bytes ahead of the source
  212. add t4 = 288, a1 // prefetch pointer 288 bytes ahead of the source
  213. add t7 = 8, a1 // second source pointer, one word ahead of a1
  214. add t8 = 8, a0 // second destination pointer, one word ahead of a0
  215. cmp.gt pt3 = 64, a2
  216. (pt3) br.cond.spnt QwordMoveUpLoop // fewer than 64 bytes: use the one-word-per-iteration loop
  217. ;;
  // 64 bytes per iteration: eight loads through a1/t7 (each stepping by 16),
  // then eight stores through a0/t8. The loop repeats only while at least 128
  // bytes remain after the current iteration (pt3).
  218. UnrolledQwordMoveUpLoop:
  219. ld8 t10 = [a1], 16
  220. ld8 t11 = [t7], 16
  221. add a2 = -64, a2
  222. ;;
  223. ld8 t12 = [a1], 16
  224. ld8 t13 = [t7], 16
  225. cmp.le pt3 = 128, a2 // pt3 = at least 128 bytes still remain
  226. ;;
  227. ld8 t18 = [a1], 16
  228. ld8 t19 = [t7], 16
  229. cmp.gt pt2 = 8, a2 // pt2 = fewer than 8 bytes remain
  230. ;;
  231. ld8 t16 = [a1], 16
  232. ld8 t17 = [t7], 16
  233. ;;
  234. (pt3) lfetch [t3], 64
  235. (pt3) lfetch [t4], 64
  236. st8 [a0] = t10, 16
  237. st8 [t8] = t11, 16
  238. ;;
  239. st8 [a0] = t12, 16
  240. st8 [t8] = t13, 16
  241. ;;
  242. st8 [a0] = t18, 16
  243. st8 [t8] = t19, 16
  244. ;;
  245. st8 [a0] = t16, 16
  246. st8 [t8] = t17, 16
  247. (pt3) br.cond.dptk UnrolledQwordMoveUpLoop
  248. (pt2) br.cond.spnt ByteMoveUp
  249. ;;
  // One 8-byte word per iteration while at least 8 bytes remain after the decrement.
  250. QwordMoveUpLoop:
  251. ld8 t10 = [a1], 8
  252. add a2 = -8, a2
  253. ;;
  254. cmp.le pt1 = 8, a2
  255. st8 [a0] = t10, 8
  256. (pt1) br.cond.sptk QwordMoveUpLoop
  257. ;;
  // Ascending byte copy of whatever is left (0 to 7 bytes on the paths above).
  258. ByteMoveUp:
  259. cmp.eq pt0 = zero, a2
  260. (pt0) br.ret.spnt brp
  261. ;;
  262. AlignedByteDoneLoop:
  263. ld1 t10 = [a1], 1
  264. add a2 = -1, a2
  265. ;;
  266. cmp.ne pt1 = zero, a2
  267. st1 [a0] = t10, 1
  268. (pt1) br.cond.sptk AlignedByteDoneLoop
  269. br.ret.spnt brp
  270. ;;
  //
  // Descending copy, taken when the destination starts inside the source range.
  // t20 / t21 are the destination / source pointers used on this path; both are
  // initialised to one past the end of their buffers.
  //
  271. CopyDown:
  272. cmp.eq pt0 = zero, a2
  273. cmp.ne pt6 = t0, t1 // pt6 = the two pointers have different misalignments
  274. (pt0) br.ret.spnt brp // return if length is zero
  275. cmp.gt pt4 = 24, a2
  276. add t20 = a2, a0 // t20 = one past the last destination byte
  277. add t21 = a2, a1 // t21 = one past the last source byte
  278. nop.m 0
  279. (pt4) br.cond.sptk ByteMoveDown // less than 24 bytes to copy
  280. (pt6) br.cond.spnt UnalignedMoveDown // incompatible alignment
  281. ;;
  282. nop.m 0
  283. nop.m 0
  284. and t22 = 0x7, t21 // t22 = bytes lying above the last 8-byte boundary of the source
  285. ;;
  286. add t20 = -1, t20 // point at the last destination byte
  287. add t21 = -1, t21 // point at the last source byte
  288. sub a2 = a2, t22 // charge the tail bytes against the count up front
  289. ;;
  // Copy the t22 tail bytes one at a time, moving downwards.
  290. TailMove:
  291. cmp.eq pt0, pt1 = zero, t22
  292. ;;
  293. (pt1) ld1 t10 = [t21], -1
  294. (pt1) add t22 = -1, t22
  295. ;;
  296. (pt1) st1 [t20] = t10, -1
  297. (pt1) br.cond.sptk TailMove
  // Move both pointers from "last byte" to "start of the 8-byte word containing it".
  298. Block8Move:
  299. nop.m 0
  300. add t20 = -7, t20
  301. add t21 = -7, t21
  302. ;;
  // One 8-byte word per iteration, moving downwards, while at least 8 bytes remain.
  303. Block8MoveLoop:
  304. cmp.gt pt5, pt6 = 8, a2 // pt6 = at least 8 bytes remain
  305. ;;
  306. (pt6) ld8 t10 = [t21], -8
  307. (pt6) add a2 = -8, a2
  308. ;;
  309. (pt6) st8 [t20] = t10, -8
  310. (pt6) br.cond.sptk Block8MoveLoop
  311. add t20 = 8, t20 // adjust dest
  312. add t21 = 8, t21 // adjust source
  313. br.cond.sptk ByteMoveDown
  314. ;;
  //
  // Descending copy for pointers whose misalignments differ. Mirrors
  // UnalignedMove: bytes are copied until the source end is 8-byte aligned, an
  // extra 8 bytes are copied when pt2 is set, and the remainder goes through one
  // of the SpecialLoopDownN loops.
  // NOTE(review): pt2 was computed at entry as (24 < a2), while this path is
  // entered for a2 >= 24, so the extra 8-byte pass is skipped when a2 == 24.
  // The ascending UnalignedMove path is only entered with pt2 set - confirm the
  // a2 == 24 case here is intended.
  //
  315. UnalignedMoveDown:
  316. and t1 = 7, t21 // t1 = bytes lying above the last 8-byte boundary of the source
  317. ;;
  318. cmp.eq pt0, pt1 = 0, t1
  319. (pt0) br.cond.spnt SkipUnalignedMoveDownByteLoop
  320. ;;
  321. add t20 = -1, t20 // point at the last destination byte
  322. add t21 = -1, t21 // point at the last source byte
  323. ;;
  // Copy t1 bytes one at a time, moving downwards, charging each against a2.
  324. UnalignedMoveDownByteLoop:
  325. ld1 t10 = [t21], -1
  326. add t1 = -1, t1
  327. add a2 = -1, a2
  328. ;;
  329. st1 [t20] = t10, -1
  330. cmp.eq p0, pt1 = zero, t1 // pt1 = more bytes to go in this pass
  331. (pt1) br.cond.sptk UnalignedMoveDownByteLoop
  332. ;;
  333. add t20 = 1, t20 // back to "one past" form
  334. add t21 = 1, t21
  335. ;;
  336. SkipUnalignedMoveDownByteLoop:
  337. (pt2) cmp.eq pt1, pt2 = r0, r0 // if pt2: pt1 <- true, pt2 <- false (r0 == r0 always holds)
  338. mov t1 = 8
  339. ;;
  340. (pt1) add t20 = -1, t20
  341. (pt1) add t21 = -1, t21
  342. (pt1) br.cond.dptk UnalignedMoveDownByteLoop
  343. ;;
  // Set-up for the pipelined loops.
  344. add t21 = -8, t21 // source pointer now addresses the 8-byte word just below the copied region
  345. ;;
  346. and t0 = 7, t20 // t0 = destination misalignment; selects the loop variant
  347. mov pr.rot = 3<<16 // load the rotating predicates from the immediate (bits 16 and 17 set)
  348. or t1 = t21, r0 // t1 = working source pointer (copy of t21)
  349. ;;
  350. sub t7 = 8, t0 // t7 = 8 - t0; used again at UnalignedByteDownDone
  351. ;;
  352. add t2 = a2, t7
  353. mov.i ar.ec = 32 // epilog count for br.ctop
  354. ;;
  355. sub t4 = t20, t0 // t4 = destination rounded down to an 8-byte boundary
  356. shr t10 = t2, 3 // number of 8-byte words in t2
  357. shl t6 = t0, 3 // t0 converted to a bit count
  358. ;;
  359. ld8 r33 = [t4], 0 // read the destination word at t4
  360. add t10 = -1,t10 // value loaded into ar.lc below
  361. and t2 = 7, t2 // t2 = byte count handled by UnAlignedByteDoneDownLoop afterwards
  362. ;;
  363. cmp.eq pt0 = 2, t0
  364. cmp.eq pt3 = 4, t0
  365. cmp.eq pt5 = 6, t0
  366. ;;
  // NOTE(review): the comment on the next line says r39 but the instruction writes r33 - confirm how this maps onto the rotating registers.
  367. shr r33 = r33,t6 // Prime r39
  368. mov.i ar.lc = t10
  369. (pt0) br.cond.spnt SpecialLoopDown2
  370. (pt3) br.cond.spnt SpecialLoopDown4
  371. (pt5) br.cond.spnt SpecialLoopDown6
  372. cmp.eq pt1 = 3, t0
  373. cmp.eq pt4 = 5, t0
  374. cmp.eq pt6 = 7, t0
  375. (pt1) br.cond.spnt SpecialLoopDown3
  376. (pt4) br.cond.spnt SpecialLoopDown5
  377. (pt6) br.cond.spnt SpecialLoopDown7
  378. ;;
  //
  // SpecialLoopDown1..SpecialLoopDown7: descending counterparts of the
  // SpecialLoopN loops; falling through the branches above selects
  // SpecialLoopDown1. Loads and stores step by -8 and the shrp operands are
  // r63,r62 (the reverse of the ascending loops). The shrp shift count is
  // 64 - 8*N for SpecialLoopDownN.
  //
  379. SpecialLoopDown1:
  380. (p16) ld8 r32 = [t1], -8
  381. nop.f 0
  382. brp.sptk.imp SpecialLoopDown1, SpecialLoopDown1E
  383. SpecialLoopDown1E:
  384. (p48) st8 [t4] = r10, -8
  385. (p47) shrp r10 = r63,r62,56
  386. br.ctop.sptk.many SpecialLoopDown1
  387. br UnalignedByteDownDone
  388. SpecialLoopDown2:
  389. (p16) ld8 r32 = [t1], -8
  390. nop.f 0
  391. brp.sptk.imp SpecialLoopDown2, SpecialLoopDown2E
  392. SpecialLoopDown2E:
  393. (p48) st8 [t4] = r10, -8
  394. (p47) shrp r10 = r63,r62,48
  395. br.ctop.sptk.many SpecialLoopDown2
  396. br UnalignedByteDownDone
  397. SpecialLoopDown3:
  398. (p16) ld8 r32 = [t1], -8
  399. nop.f 0
  400. brp.sptk.imp SpecialLoopDown3, SpecialLoopDown3E
  401. SpecialLoopDown3E:
  402. (p48) st8 [t4] = r10, -8
  403. (p47) shrp r10 = r63,r62,40
  404. br.ctop.sptk.many SpecialLoopDown3
  405. br UnalignedByteDownDone
  406. SpecialLoopDown4:
  407. (p16) ld8 r32 = [t1], -8
  408. nop.f 0
  409. brp.sptk.imp SpecialLoopDown4, SpecialLoopDown4E
  410. SpecialLoopDown4E:
  411. (p48) st8 [t4] = r10, -8
  412. (p47) shrp r10 = r63,r62,32
  413. br.ctop.sptk.many SpecialLoopDown4
  414. br UnalignedByteDownDone
  415. SpecialLoopDown5:
  416. (p16) ld8 r32 = [t1], -8
  417. nop.f 0
  418. brp.sptk.imp SpecialLoopDown5, SpecialLoopDown5E
  419. SpecialLoopDown5E:
  420. (p48) st8 [t4] = r10, -8
  421. (p47) shrp r10 = r63,r62,24
  422. br.ctop.sptk.many SpecialLoopDown5
  423. br UnalignedByteDownDone
  424. SpecialLoopDown6:
  425. (p16) ld8 r32 = [t1], -8
  426. nop.f 0
  427. brp.sptk.imp SpecialLoopDown6, SpecialLoopDown6E
  428. SpecialLoopDown6E:
  429. (p48) st8 [t4] = r10, -8
  430. (p47) shrp r10 = r63,r62,16
  431. br.ctop.sptk.many SpecialLoopDown6
  432. br UnalignedByteDownDone
  433. SpecialLoopDown7:
  434. (p16) ld8 r32 = [t1], -8
  435. nop.f 0
  436. brp.sptk.imp SpecialLoopDown7, SpecialLoopDown7E
  437. SpecialLoopDown7E:
  438. (p48) st8 [t4] = r10, -8
  439. (p47) shrp r10 = r63,r62,8
  440. br.ctop.sptk.many SpecialLoopDown7;;
  //
  // Common exit of the SpecialLoopDownN loops (SpecialLoopDown7 falls through
  // into it): turn the word pointers back into byte pointers, restore pr and
  // ar.lc, then copy the t2 leftover bytes one at a time, moving downwards.
  //
  441. UnalignedByteDownDone:
  442. add t1 = 7, t1 // address the highest byte of the word t1 points at
  443. add t4 = 7, t4 // address the highest byte of the word t4 points at
  444. ;;
  445. add t1 = t1, t7 // advance the source pointer by 8 - t0 (t7 was set before the loops)
  446. mov pr = r64 // restore the predicates saved at entry
  447. mov.i ar.lc = r65 // restore the loop counter saved at entry
  448. ;;
  449. cmp.eq pt0 = zero, t2
  450. (pt0) br.ret.spnt brp // no leftover bytes
  451. ;;
  452. UnAlignedByteDoneDownLoop:
  453. ld1 t10 = [t1], -1
  454. add t2 = -1, t2
  455. ;;
  456. cmp.ne pt1 = zero, t2
  457. st1 [t4] = t10, -1
  458. (pt1) br.cond.sptk UnAlignedByteDoneDownLoop
  459. br.ret.spnt brp
  //
  // Descending byte-at-a-time copy of a2 bytes. On entry t20 / t21 are one past
  // the next destination / source byte to copy. a2 is tested before each byte,
  // so a2 == 0 copies nothing.
  //
  460. ByteMoveDown:
  461. nop.m 0
  462. add t20 = -1, t20 // adjust destination
  463. add t21 = -1, t21 // adjust source
  464. ;;
  465. ByteMoveDownLoop:
  466. cmp.ne pt1 = zero, a2
  467. ;;
  468. (pt1) ld1 t10 = [t21], -1
  469. (pt1) add a2 = -1, a2
  470. ;;
  471. (pt1) st1 [t20] = t10, -1
  472. (pt1) br.cond.sptk ByteMoveDownLoop
  473. br.ret.spnt brp
  474. ;;
  475. LEAF_EXIT(memmove)