Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

567 lines
13 KiB

  //
  // _memcpy_ita -- entry point and dispatch for a forward ("up") memory copy.
  //
  // Register use as seen in this code:
  //   r32  address that is stored to   (destination)
  //   r33  address that is loaded from (source)
  //   r34  number of bytes still to copy
  //   r8   copy of the original destination
  //   r18  saved predicate registers, r27 saved ar.lc
  //   r10  prefetch pointer, r2 = dest & 7, r3 = src & 7, r9 = r2 | r3
  //
  // Dispatch performed below:
  //   count == 0 or dest == src                 -> return
  //   count < 8                                 -> ByteMoveUpLoop_ita
  //   both 8-byte aligned, count < 64           -> QwordMoveUpLoop_ita
  //   both 8-byte aligned, count >= 64          -> QwordMoveUp_ita
  //   same non-zero (address & 7), count >= 8   -> AlignedMove_ita
  //   different (address & 7), count > 24       -> UnalignedMove_ita
  //   different (address & 7), count 8..24      -> falls through to ByteMoveUpLoop_ita
  //
  1. .section .text
  2. .file "memcpy.s"
  3. .proc _memcpy_ita#
  4. .global _memcpy_ita#
  5. .align 32
  6. .prologue
  7. _memcpy_ita:
  8. { .mii
  9. and r10 = -32, r33 // r10 = source address rounded down to a multiple of 32
  10. .save pr, r18
  11. mov r18 = pr // keep a copy of the predicates; restored on the paths that use pr.rot
  12. cmp.eq p7, p9 = r0, r34 // p7: count == 0, p9: count != 0
  13. } { .mmi
  14. cmp.ltu p8 = 24, r34 // p8: count > 24 (unsigned)
  15. ;;
  16. (p9) lfetch [r10], 32 //0
  17. .save ar.lc, r27
  18. mov.i r27 = ar.lc // keep a copy of the loop counter; restored on the paths that use ar.lc
  19. } { .mib
  20. mov r8 = r32 // r8 = original destination (presumably the return value -- confirm against the ABI)
  21. cmp.eq.or p7 = r32, r33 // p7 |= (dest == src)
  22. (p7) br.ret.spnt b0 // nothing to copy
  23. ;;
  24. } { .mii
  25. (p8) lfetch [r10], 32 //32 //(p8)
  26. and r2 = 7, r32 // r2 = dest & 7
  27. and r3 = 7, r33 // r3 = src & 7
  28. ;;
  29. } { .mii
  30. (p8) lfetch [r10], 32 //64 //(p8)
  31. or r9 = r2, r3 // r9 == 0 only when both addresses are 8-byte aligned
  32. cmp.gtu p14, p13 = 64, r34 // p14: count < 64, p13: count >= 64
  33. ;;
  34. } { .mii
  35. (p13) lfetch [r10], 32 //96 //(p13)
  36. cmp.ltu p12 = 127, r34 // p12: count > 127; gates the further prefetches
  37. cmp.leu p10, p11 = 8, r34 // p10: count >= 8, p11: count < 8
  38. } { .mbb
  39. (p14) cmp.eq.unc p9 = 0, r9 // This bundle is just a shortcut
  40. (p11) br.cond.spnt ByteMoveUpLoop_ita
  41. (p9) br.cond.spnt QwordMoveUpLoop_ita
  42. ;;
  43. } { .mii
  44. (p12) lfetch [r10], 32 //128
  45. (p10) cmp.eq.unc p9 = 0, r9 // p9: count >= 8 and both addresses 8-byte aligned
  46. (p10) cmp.eq.unc p11 = r2, r3 // p11: count >= 8 and dest/src share the same (address & 7)
  47. } { .bbb
  48. (p9) br.cond.sptk QwordMoveUp_ita
  49. (p11) br.cond.spnt AlignedMove_ita
  50. (p8) br.cond.sptk UnalignedMove_ita
  51. ;;
  52. }
  //
  // ByteMoveUpLoop_ita -- byte-at-a-time copy, up to four bytes per pass.
  //
  // Byte 0 is always copied; bytes 1, 2 and 3 are copied only when the
  // remaining count (r34) is >= 2, >= 3 and >= 4 (predicates p6, p7, p8).
  // Two load pointers (r33, r20) and two store pointers (r32, r21) are used
  // so the four bytes can be interleaved. The loop repeats only when a full
  // four bytes were copied and bytes remain (p9); otherwise it returns.
  //
  53. ByteMoveUpLoop_ita:
  54. { .mii
  55. add r20 = 2, r33 // r20 = src + 2 (address of byte 2)
  56. cmp.leu p6 = 2, r34 // p6: count >= 2
  57. cmp.leu p7 = 3, r34 // p7: count >= 3
  58. } { .mmi
  59. ld1 r19 = [r33], 1 // byte 0; r33 -> src + 1
  60. ;;
  61. (p6) ld1 r2 = [r33], 2 // byte 1; r33 -> src + 3
  62. add r21 = 1, r32 // r21 = dest + 1
  63. } { .mii
  64. (p7) ld1 r3 = [r20] // byte 2
  65. cmp.leu p8 = 4, r34 // p8: count >= 4
  66. ;;
  67. add r34 = -4, r34 // count -= 4 (only meaningful when p8 was set)
  68. } { .mmi
  69. (p8) ld1 r22 = [r33], 1 // byte 3; r33 -> src + 4
  70. ;;
  71. st1 [r32] = r19, 2 // store byte 0; r32 -> dest + 2
  72. (p8) cmp.ltu.unc p9 = r0, r34 // p9: four bytes copied and count still > 0; cleared when p8 is false
  73. } { .mmi
  74. (p6) st1 [r21] = r2, 2 // store byte 1; r21 -> dest + 3
  75. ;;
  76. (p7) st1 [r32] = r3, 2 // store byte 2; r32 -> dest + 4
  77. nop.i 0
  78. } { .mbb
  79. (p8) st1 [r21] = r22 // store byte 3
  80. (p9) br.cond.dpnt ByteMoveUpLoop_ita
  81. br.ret.dptk b0
  82. ;;
  83. }
  84. //
  85. // src & dest have same alignment, 0 != (align mod 8)
  86. //
  // Copies 8 - (dest & 7) bytes one at a time (count kept in r31) so that both
  // pointers reach an 8-byte boundary, then:
  //   remaining == 0 -> return
  //   remaining <  8 -> ByteMoveUpLoop_ita
  //   otherwise      -> falls through into QwordMoveUp_ita
  // p14 (remaining < 64) is recomputed here for use by QwordMoveUp_ita.
  // The lfetch instructions are prefetches gated by p12 (set at entry when
  // count > 127); r11 is a second prefetch pointer 64 bytes ahead of r10.
  //
  87. AlignedMove_ita:
  88. { .mmi
  89. add r11 = 64, r10
  90. (p12) lfetch [r10], 32 //160
  91. sub r31 = 8, r2 // r31 = bytes needed to reach the next 8-byte boundary
  92. ;;
  93. } { .mmi
  94. (p12) lfetch [r10], 64 //192
  95. (p12) lfetch [r11], 96 //224
  96. sub r34 = r34, r31 // r34 = bytes left once the head bytes are copied
  97. ;;
  98. }
  99. AlignedMoveByteLoop_ita:
  100. { .mii
  101. ld1 r19 = [r33], 1
  102. add r31 = -1, r31
  103. cmp.gtu p14 = 64, r34 // p14: remaining < 64
  104. ;;
  105. } { .mib
  106. st1 [r32] = r19, 1
  107. cmp.ne p7 = r0, r31 // p7: head bytes still left
  108. (p7) br.cond.sptk AlignedMoveByteLoop_ita
  109. ;;
  110. } { .mii
  111. (p12) lfetch [r10], 32 //256
  112. cmp.eq.unc p6 = r0, r34 // p6: nothing left
  113. cmp.gtu p8 = 8, r34 // p8: fewer than 8 bytes left
  114. } { .mbb
  115. (p12) lfetch [r11], 128 //320
  116. (p6) br.ret.spnt b0
  117. (p8) br.cond.sptk ByteMoveUpLoop_ita
  118. ;;
  119. }
  120. //
  121. // both src & dest are now 8-byte aligned
  122. //
  // Sets up the unrolled qword copy that follows:
  //   r16 = src + 8, r17 = dest + 8   second load/store pointers
  //   r10 = src + 128, r11 = src + 288  prefetch pointers
  // p14 (remaining < 64) comes from the entry code or from AlignedMove_ita;
  // when set, control goes to the simple QwordMoveUpLoop_ita instead.
  // With USE_HIGH_FP_REGISTERS defined, counts of 1536 bytes or more go to
  // LargeAlignedUp_ita, and p6 records bit 3 of the source address for it.
  // Otherwise control falls through into UnrolledQwordMoveUpLoop_ita.
  //
  123. QwordMoveUp_ita:
  124. #if defined (USE_HIGH_FP_REGISTERS)
  125. { .mii
  126. add r16 = 8, r33
  127. add r10 = 128, r33
  128. add r11 = 288, r33
  129. } { .mmi
  130. mov r19 = 1536 // threshold for the FP-register copy path
  131. ;;
  132. add r17 = 8, r32
  133. tbit.nz p6 = r33, 3 // p6: source address has bit 3 set (8-byte but not 16-byte aligned)
  134. } { .mbb
  135. cmp.leu p9 = r19, r34 // p9: count >= 1536
  136. (p9) br.cond.spnt LargeAlignedUp_ita
  137. (p14) br.cond.spnt QwordMoveUpLoop_ita
  138. ;;
  139. }
  140. #else
  141. { .mii
  142. add r16 = 8, r33
  143. add r10 = 128, r33
  144. add r11 = 288, r33
  145. } { .mfb
  146. add r17 = 8, r32
  147. nop.f 0
  148. (p14) br.cond.spnt QwordMoveUpLoop_ita
  149. ;;
  150. }
  151. #endif
  //
  // UnrolledQwordMoveUpLoop_ita -- copies 64 bytes per iteration.
  //
  // Eight ld8 are issued through two interleaved pointers (r33 and r16, each
  // stepping by 16), followed by eight st8 through r32 and r17. r34 is reduced
  // by 64 at the top, and the predicates computed from the new r34 are:
  //   p9  remaining >= 128 -> issue two more prefetches via r10 / r11
  //   p7  remaining >= 64  -> run another iteration
  //   p8  remaining <  8   -> after the loop, finish in ByteMoveUpLoop_ita
  // After the loop: return if nothing remains, otherwise fall through into
  // QwordMoveUpLoop_ita (8 <= remaining < 64) unless p8 is set.
  //
  152. UnrolledQwordMoveUpLoop_ita:
  153. { .mmi
  154. ld8 r19 = [r33], 16
  155. ld8 r20 = [r16], 16
  156. add r34 = -64, r34
  157. ;;
  158. } { .mmi
  159. ld8 r21 = [r33], 16
  160. ld8 r22 = [r16], 16
  161. cmp.leu p9 = 128, r34
  162. ;;
  163. } { .mmi
  164. ld8 r30 = [r33], 16
  165. ld8 r29 = [r16], 16
  166. cmp.gtu p8 = 8, r34
  167. ;;
  168. } { .mmi
  169. ld8 r25 = [r33], 16
  170. ld8 r26 = [r16], 16
  171. cmp.leu p7 = 64, r34
  172. ;;
  173. } { .mmi
  174. (p9) lfetch [r10], 64
  175. (p9) lfetch [r11], 64
  176. nop.i 0
  177. ;;
  178. } { .mmi
  179. st8 [r32] = r19, 16
  180. st8 [r17] = r20, 16
  181. nop.i 0
  182. ;;
  183. } { .mmi
  184. st8 [r32] = r21, 16
  185. st8 [r17] = r22, 16
  186. nop.i 0
  187. ;;
  188. } { .mmi
  189. st8 [r32] = r30, 16
  190. st8 [r17] = r29, 16
  191. nop.i 0
  192. ;;
  193. } { .mmb
  194. st8 [r32] = r25, 16
  195. st8 [r17] = r26, 16
  196. (p7) br.cond.dptk UnrolledQwordMoveUpLoop_ita
  197. ;;
  198. } { .mbb
  199. cmp.eq p6 = r0, r34 // p6: nothing left
  200. (p6) br.ret.spnt b0
  201. (p8) br.cond.spnt ByteMoveUpLoop_ita
  202. ;;
  203. }
  //
  // QwordMoveUpLoop_ita -- copies 8 bytes per iteration from [r33] to [r32].
  //
  // Repeats while at least 8 bytes remain (p7). When 1..7 bytes are left (p6
  // set, p7 clear) the tail is handed to ByteMoveUpLoop_ita; when nothing is
  // left the routine returns.
  //
  204. QwordMoveUpLoop_ita:
  205. { .mii
  206. ld8 r19 = [r33], 8
  207. add r34 = -8, r34
  208. nop.i 0
  209. ;;
  210. } { .mmi
  211. st8 [r32] = r19, 8
  212. cmp.leu p7 = 8, r34 // p7: remaining >= 8
  213. cmp.ne p6 = r0, r34 // p6: remaining != 0
  214. } { .bbb
  215. (p7) br.cond.sptk QwordMoveUpLoop_ita
  216. (p6) br.cond.spnt ByteMoveUpLoop_ita
  217. br.ret.sptk b0
  218. ;;
  219. }
  220. #if defined (USE_HIGH_FP_REGISTERS)
  221. .align 32
  222. //
  223. // Copy large aligned region -- we can use FP registers for that
  224. // NOTE: still use unrolled loop for *very* large blocks,
  225. // as there are good chances that data is not in cache.
  226. //
  // Reached from QwordMoveUp_ita when count >= 1536. Counts above 48*1024 are
  // sent back to UnrolledQwordMoveUpLoop_ita. Otherwise:
  //   1. if p6 is set (source bit 3, from QwordMoveUp_ita) one qword is copied
  //      first -- presumably so the ldfp8 pair loads start 16-byte aligned;
  //      confirm against the ldfp8 alignment rules
  //   2. Move32UpLoop_ita copies 32 bytes per iteration as a software-pipelined
  //      br.ctop loop (ar.lc = count/32 - 1, ar.ec = 23): loads run in stage
  //      p16, stores in stage p38, and the stored registers f54/f77/f100/f123
  //      are the loaded f32/f55/f78/f101 shifted by 22 rotating positions
  //   3. pr and ar.lc are restored from r18 / r27
  //   4. the remaining count & 31 bytes (r9) are copied by LargeMoveUpLoop_ita
  //      (8 at a time) and LargeByteDoneUpLoop_ita (1 at a time)
  // r31 = count & 7 is kept so the qword tail loop knows whether bytes follow.
  //
  227. LargeAlignedUp_ita:
  228. { .mmi
  229. mov r20 = 48*1024 // upper size limit for this path
  230. and r31 = 7, r34 // r31 = number of trailing single bytes
  231. mov.i ar.ec = 23 // epilog count for the pipelined loop
  232. ;;
  233. } { .mbb
  234. cmp.ltu p8 = r20, r34 // p8: count > 48K
  235. (p8) br.cond.spnt UnrolledQwordMoveUpLoop_ita
  236. brp.sptk.imp Move32UpLoop_ita, Move32UpLoopE_ita
  237. ;;
  238. } { .mii
  239. (p6) ld8 r9 = [r33], 8 // optional leading qword
  240. mov pr.rot = 1<<16 // rotating predicates: only p16 set
  241. (p6) add r34 = -8, r34
  242. ;;
  243. } { .mii
  244. (p6) st8 [r32] = r9, 8
  245. shr.u r30 = r34, 5 // r30 = number of 32-byte chunks
  246. mov r10 = r33 // r10 / r11: load pointers, 16 bytes apart
  247. } { .mmi
  248. add r11 = 16, r33
  249. ;;
  250. mov r20 = r32 // r20 / r21: store pointers, 8 bytes apart
  251. add r30 = -1, r30
  252. } { .mii
  253. and r9 = 31, r34 // r9 = bytes left after the 32-byte chunks
  254. add r21 = 8, r32
  255. ;;
  256. mov.i ar.lc = r30
  257. }
  258. Move32UpLoop_ita:
  259. { .mmi
  260. (p16) ldfp8 f32, f55 = [r10]
  261. (p16) ldfp8 f78, f101 = [r11]
  262. (p16) add r10 = 32, r10
  263. ;;
  264. } { .mmi
  265. (p38) stf8 [r20] = f54, 16
  266. (p38) stf8 [r21] = f77, 16
  267. (p16) add r11 = 32, r11
  268. ;;
  269. } { .mmb
  270. Move32UpLoopE_ita:
  271. (p38) stf8 [r20] = f100, 16
  272. (p38) stf8 [r21] = f123, 16
  273. br.ctop.sptk.many Move32UpLoop_ita
  274. ;;
  275. } { .mii
  276. nop.m 0
  277. mov pr = r18 // restore the predicates saved at entry
  278. nop.i 0
  279. ;;
  280. } { .mii
  281. cmp.eq p6 = r0, r9 // p6: nothing left
  282. mov.i ar.lc = r27 // restore the loop counter saved at entry
  283. cmp.gt p8 = 8, r9 // p8: fewer than 8 bytes left
  284. } { .mbb
  285. cmp.eq p9 = r0, r31 // p9: no trailing single bytes
  286. (p6) br.ret.spnt b0
  287. (p8) br.cond.spnt LargeByteDoneUpLoop_ita
  288. ;;
  289. }
  // Qword tail: 8 bytes per iteration while r9 >= 8; returns when p9 says no
  // single bytes follow, otherwise falls through to the byte tail.
  290. LargeMoveUpLoop_ita:
  291. { .mii
  292. ld8 r19 = [r10], 8
  293. add r9 = -8, r9
  294. ;;
  295. cmp.le p7 = 8, r9
  296. } { .mbb
  297. st8 [r20] = r19, 8
  298. (p7) br.cond.sptk LargeMoveUpLoop_ita
  299. (p9) br.ret.spnt b0
  300. ;;
  301. }
  // Byte tail: one byte per iteration until r9 reaches zero.
  302. LargeByteDoneUpLoop_ita:
  303. { .mii
  304. ld1 r19 = [r10], 1
  305. add r9 = -1, r9
  306. ;;
  307. cmp.ne p7 = r0, r9
  308. } { .mbb
  309. st1 [r20] = r19, 1
  310. (p7) br.cond.sptk LargeByteDoneUpLoop_ita
  311. br.ret.spnt b0
  312. ;;
  313. }
  314. #endif
  315. //
  316. // Copy long unaligned region
  317. //
  // Reached from the entry code when source and destination have different
  // (address & 7) and count > 24.
  //
  // Phase 1 (UnalignedMoveByteLoop_ita): copies 16 - (src & 7) bytes one at a
  // time, which leaves the source pointer 8-byte aligned. Each byte is also
  // shifted into the top of r10 with shrp, so r10 ends up holding the last
  // eight bytes read.
  //
  // Phase 2 setup:
  //   r3  = aligned source pointer
  //   r2  = dest & 7 after phase 1
  //   r33 = r10 (the last eight source bytes, seeding the rotating registers)
  //   r11 = destination rounded down to an 8-byte boundary
  //   ar.lc = (remaining + r2) / 8 - 1, r9 = (remaining + r2) & 7 (tail bytes)
  // and control is dispatched on r2 to SpecialLoop<r2>_ita; SpecialLoop1_ita is
  // the fall-through case (r2 is presumably never 0 here because the two
  // alignments differ -- confirm).
  //
  318. .align 32
  319. UnalignedMove_ita:
  320. { .mii
  321. .regstk 3, 29, 0, 32
  322. alloc r26 = ar.pfs, 3, 29, 0, 32
  323. mov.i ar.ec = 32 // epilog count for the SpecialLoop pipelines
  324. sub r3 = 16, r3 // r3 = number of head bytes to copy singly (src & 7 was in r3)
  325. ;;
  326. }
  327. .body
  328. UnalignedMoveByteLoop_ita:
  329. { .mmi
  330. ld1 r19 = [r33], 1
  331. cmp.ne p6 = 1, r3 // p6: this is not the last head byte
  332. mov pr.rot = 3<<16 // rotating predicates: p16 and p17 set
  333. ;;
  334. } { .mib
  335. add r3 = -1, r3
  336. shrp r10 = r19, r10, 8 // shift r10 right one byte and insert the new byte at the top
  337. nop.b 0
  338. } { .mib
  339. st1 [r32] = r19, 1
  340. add r34 = -1, r34
  341. (p6) br.cond.sptk UnalignedMoveByteLoop_ita
  342. ;;
  343. } { .mmi
  344. mov r3 = r33 // r3 = source pointer for the qword loops
  345. and r2 = 7, r32 // r2 = destination misalignment
  346. mov r33 = r10 // seed the pipeline with the last eight bytes read
  347. ;;
  348. } { .mmi
  349. add r9 = r34, r2 // bytes to produce counted from the rounded-down destination
  350. sub r11 = r32, r2 // r11 = destination rounded down to 8 bytes
  351. cmp.eq p6 = 2, r2
  352. ;;
  353. } { .mii
  354. cmp.eq p9 = 4, r2
  355. shr r19 = r9, 3 // number of whole qwords
  356. cmp.eq p11 = 6, r2
  357. ;;
  358. } { .mii
  359. add r19 = -1, r19
  360. and r9 = 7, r9 // r9 = trailing bytes for UnalignedByteDone_ita
  361. ;;
  362. mov.i ar.lc = r19
  363. } { .bbb
  364. (p6) br.cond.spnt SpecialLoop2_ita
  365. (p9) br.cond.spnt SpecialLoop4_ita
  366. (p11) br.cond.spnt SpecialLoop6_ita
  367. ;;
  368. } { .mii
  369. cmp.eq p7 = 3, r2
  370. cmp.eq p10 = 5, r2
  371. cmp.eq p12 = 7, r2
  372. } { .bbb
  373. (p7) br.cond.spnt SpecialLoop3_ita
  374. (p10) br.cond.spnt SpecialLoop5_ita
  375. (p12) br.cond.spnt SpecialLoop7_ita
  376. ;;
  377. }
  //
  // SpecialLoop1_ita .. SpecialLoop7_ita -- one software-pipelined br.ctop loop
  // per destination misalignment N = dest & 7 (held in r2). The seven loops are
  // identical except for the shrp shift count, which is 64 - 8*N.
  //
  // Per iteration:
  //   stage p16  ld8 the next aligned source qword into r32 (rotating)
  //   stage p47  that qword is now visible as r62 and its predecessor as r63;
  //              shrp merges the two into r10
  //   stage p48  st8 r10 to the 8-byte-aligned destination pointer r11
  // ar.lc, ar.ec, pr.rot and the initial r33 are set up in UnalignedMove_ita.
  //
  // After each loop r3 is moved back by r2 (presumably so the byte tail re-reads
  // the source bytes that were loaded but not yet stored -- confirm), the
  // predicates are restored from r18, and control continues at
  // UnalignedByteDone_ita (SpecialLoop7_ita reaches it by falling through).
  //
  378. .align 32
  379. SpecialLoop1_ita:
  380. { .mfb
  381. (p16) ld8 r32 = [r3], 8
  382. nop.f 0
  383. brp.sptk.imp SpecialLoop1_ita, SpecialLoop1E_ita
  384. } { .mib
  385. SpecialLoop1E_ita:
  386. (p48) st8 [r11] = r10, 8
  387. (p47) shrp r10 = r62, r63, 56 // N = 1: shift by 64 - 8
  388. br.ctop.sptk.many SpecialLoop1_ita
  389. ;;
  390. } { .mib
  391. sub r3 = r3, r2
  392. mov pr = r18
  393. br UnalignedByteDone_ita
  394. ;;
  395. }
  396. .align 32
  397. SpecialLoop2_ita:
  398. { .mfb
  399. (p16) ld8 r32 = [r3], 8
  400. nop.f 0
  401. brp.sptk.imp SpecialLoop2_ita, SpecialLoop2E_ita
  402. } { .mib
  403. SpecialLoop2E_ita:
  404. (p48) st8 [r11] = r10, 8
  405. (p47) shrp r10 = r62, r63, 48 // N = 2: shift by 64 - 16
  406. br.ctop.sptk.many SpecialLoop2_ita
  407. ;;
  408. } { .mib
  409. sub r3 = r3, r2
  410. mov pr = r18
  411. br UnalignedByteDone_ita
  412. ;;
  413. }
  414. .align 32
  415. SpecialLoop3_ita:
  416. { .mfb
  417. (p16) ld8 r32 = [r3], 8
  418. nop.f 0
  419. brp.sptk.imp SpecialLoop3_ita, SpecialLoop3E_ita
  420. } { .mib
  421. SpecialLoop3E_ita:
  422. (p48) st8 [r11] = r10, 8
  423. (p47) shrp r10 = r62, r63, 40 // N = 3: shift by 64 - 24
  424. br.ctop.sptk.many SpecialLoop3_ita
  425. ;;
  426. } { .mib
  427. sub r3 = r3, r2
  428. mov pr = r18
  429. br UnalignedByteDone_ita
  430. ;;
  431. }
  432. .align 32
  433. SpecialLoop4_ita:
  434. { .mfb
  435. (p16) ld8 r32 = [r3], 8
  436. nop.f 0
  437. brp.sptk.imp SpecialLoop4_ita, SpecialLoop4E_ita
  438. } { .mib
  439. SpecialLoop4E_ita:
  440. (p48) st8 [r11] = r10, 8
  441. (p47) shrp r10 = r62, r63, 32 // N = 4: shift by 64 - 32
  442. br.ctop.sptk.many SpecialLoop4_ita
  443. ;;
  444. } { .mib
  445. sub r3 = r3, r2
  446. mov pr = r18
  447. br UnalignedByteDone_ita
  448. ;;
  449. }
  450. .align 32
  451. SpecialLoop5_ita:
  452. { .mfb
  453. (p16) ld8 r32 = [r3], 8
  454. nop.f 0
  455. brp.sptk.imp SpecialLoop5_ita, SpecialLoop5E_ita
  456. } { .mib
  457. SpecialLoop5E_ita:
  458. (p48) st8 [r11] = r10, 8
  459. (p47) shrp r10 = r62, r63, 24 // N = 5: shift by 64 - 40
  460. br.ctop.sptk.many SpecialLoop5_ita
  461. ;;
  462. } { .mib
  463. sub r3 = r3, r2
  464. mov pr = r18
  465. br UnalignedByteDone_ita
  466. ;;
  467. }
  468. .align 32
  469. SpecialLoop6_ita:
  470. { .mfb
  471. (p16) ld8 r32 = [r3], 8
  472. nop.f 0
  473. brp.sptk.imp SpecialLoop6_ita, SpecialLoop6E_ita
  474. } { .mib
  475. SpecialLoop6E_ita:
  476. (p48) st8 [r11] = r10, 8
  477. (p47) shrp r10 = r62, r63, 16 // N = 6: shift by 64 - 48
  478. br.ctop.sptk.many SpecialLoop6_ita
  479. ;;
  480. } { .mib
  481. sub r3 = r3, r2
  482. mov pr = r18
  483. br UnalignedByteDone_ita
  484. ;;
  485. }
  486. .align 32
  487. SpecialLoop7_ita:
  488. { .mfb
  489. (p16) ld8 r32 = [r3], 8
  490. nop.f 0
  491. brp.sptk.imp SpecialLoop7_ita, SpecialLoop7E_ita
  492. } { .mib
  493. SpecialLoop7E_ita:
  494. (p48) st8 [r11] = r10, 8
  495. (p47) shrp r10 = r62, r63, 8 // N = 7: shift by 64 - 56
  496. br.ctop.sptk.many SpecialLoop7_ita
  497. ;;
  498. } { .mii
  499. sub r3 = r3, r2
  500. mov pr = r18
  501. nop.i 0
  502. ;;
  503. }
  //
  // UnalignedByteDone_ita -- common exit of the SpecialLoopN_ita loops.
  //
  // Restores ar.lc from r27, returns immediately when no tail bytes remain
  // (r9 == 0), and otherwise copies r9 bytes one at a time from [r3] to [r11]
  // before returning.
  //
  504. UnalignedByteDone_ita:
  505. { .mib
  506. cmp.eq p6 = r0, r9 // p6: no tail bytes
  507. mov.i ar.lc = r27 // restore the loop counter saved at entry
  508. (p6) br.ret.spnt b0
  509. ;;
  510. }
  511. UnAlignedByteDoneLoop_ita:
  512. { .mii
  513. ld1 r19 = [r3], 1
  514. add r9 = -1, r9
  515. ;;
  516. cmp.ne p7 = r0, r9 // p7: more tail bytes to copy
  517. } { .mbb
  518. st1 [r11] = r19, 1
  519. (p7) br.cond.sptk UnAlignedByteDoneLoop_ita
  520. br.ret.spnt b0
  521. ;;
  522. }
  523. .endp _memcpy_ita#