Source code of Windows XP (NT5)

//
// Module Name:
//
// fillmem.s
//
// Abstract:
//
// This module implements functions to move, zero, and fill blocks
// of memory. If the memory is aligned, then these functions are
// very efficient.
//
// Author:
//
//
// Environment:
//
// User or Kernel mode.
//
//--
#include "ksia64.h"
//++
//
// VOID
// RtlFillMemory (
// IN PVOID destination,
// IN SIZE_T length,
// IN UCHAR fill
// )
//
// Routine Description:
//
// This function fills memory by first aligning the destination address to
// a qword boundary, and then filling 8-byte blocks, followed by any
// remaining bytes.
//
// Arguments:
//
// destination (a0) - Supplies a pointer to the memory to fill.
//
// length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// fill (a2) - Supplies the fill byte.
//
// N.B. The alternate entry memset expects the length and fill arguments
// to be reversed. It also returns the Destination pointer
//
// Return Value:
//
// None.
//
//--
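//
// For reference, a minimal C sketch of the behavior described above
// (an illustrative addition, not part of the original source; it mirrors
// the head/qword/tail structure of the assembly below and uses C99
// <stddef.h>/<stdint.h> types):
//
//     void FillSketch(void *Destination, size_t Length, unsigned char Fill)
//     {
//         unsigned char *p = (unsigned char *)Destination;
//         uint64_t pattern = 0x0101010101010101ULL * Fill;   // fill byte replicated, as mux1 does
//
//         while (Length != 0 && ((uintptr_t)p & 7) != 0) {   // head bytes up to a qword boundary
//             *p++ = Fill;
//             Length--;
//         }
//         while (Length >= 8) {                              // aligned 8-byte stores
//             *(uint64_t *)p = pattern;
//             p += 8;
//             Length -= 8;
//         }
//         while (Length != 0) {                              // remaining tail bytes
//             *p++ = Fill;
//             Length--;
//         }
//     }
//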
LEAF_ENTRY(RtlFillMemory)
        lfetch.excl [a0]
        mov t0 = a0
        add t4 = 64, a0
        cmp.eq pt0 = zero, a1 // length == 0 ?
        add t1 = -1, a0
        zxt1 a2 = a2
        cmp.ge pt1 = 7, a1
        mov v0 = a0
        (pt0) br.ret.spnt brp // return if length is zero
        ;;
//
// Align address on qword boundary by determining the number of bytes
// before the next qword boundary by performing an AND operation on
// the 2's complement of the address with a mask value of 0x7.
//
        lfetch.excl [t4], 64
        andcm t1 = 7, t1 // t1 = # bytes before qword boundary
        (pt1) br.cond.spnt TailSet // 1 <= length <= 7, br to TailSet
        ;;
        cmp.eq pt2 = zero, t1 // skip HeadSet if t1 is zero
        mux1 t2 = a2, @brcst // t2 = all 8 bytes = [fill]
        sub a1 = a1, t1 // a1 = adjusted length
        ;;
        lfetch.excl [t4], 64
        (pt2) br.cond.sptk SkipHeadSet
//
// Copy the leading bytes until t1 is equal to zero
//
HeadSet:
        st1 [t0] = a2, 1
        add t1 = -1, t1
        ;;
        cmp.ne pt0 = zero, t1
        (pt0) br.cond.sptk HeadSet
//
// now the address is qword aligned;
// fall into the QwordSet loop if remaining length is greater than 8;
// else skip the QwordSet loop
//
SkipHeadSet:
        cmp.gt pt1 = 16, a1
        add t4 = 64, t0
        cmp.le pt2 = 8, a1
        add t3 = 8, t0
        cmp.gt pt3 = 64, a1
        (pt1) br.cond.spnt SkipQwordSet
        ;;
        lfetch.excl [t4], 64
        (pt3) br.cond.spnt QwordSet
        nop.m 0
        nop.m 0
        nop.i 0
UnrolledQwordSet:
        st8 [t0] = t2, 16
        st8 [t3] = t2, 16
        add a1 = -64, a1
        ;;
        st8 [t0] = t2, 16
        st8 [t3] = t2, 16
        cmp.le pt0 = 64, a1
        ;;
        st8 [t0] = t2, 16
        st8 [t3] = t2, 16
        cmp.le pt2 = 8, a1
        ;;
        st8 [t0] = t2, 16
        nop.f 0
        cmp.gt pt1 = 16, a1
        st8 [t3] = t2, 16
        (pt0) br.cond.sptk UnrolledQwordSet
        (pt1) br.cond.spnt SkipQwordSet
        ;;
//
// fill 8 bytes at a time until the remaining length is less than 8
//
QwordSet:
        st8 [t0] = t2, 16
        st8 [t3] = t2, 16
        add a1 = -16, a1
        ;;
        cmp.le pt0 = 16, a1
        cmp.le pt2 = 8, a1
        (pt0) br.cond.sptk QwordSet
        ;;
SkipQwordSet:
        (pt2) st8 [t0] = t2, 8
        (pt2) add a1 = -8, a1
        ;;
        cmp.eq pt3 = zero, a1 // return now if length equals 0
        (pt3) br.ret.sptk brp
        ;;
//
// copy the remaining bytes one at a time
//
TailSet:
        st1 [t0] = a2, 1
        add a1 = -1, a1
        nop.i 0
        ;;
        cmp.ne pt0, pt3 = 0, a1
        (pt0) br.cond.dptk TailSet
        (pt3) br.ret.dpnt brp
        ;;
LEAF_EXIT(RtlFillMemory)
//++
//
// VOID
// RtlFillMemoryUlong (
// IN PVOID Destination,
// IN SIZE_T Length,
// IN ULONG Pattern
// )
//
// Routine Description:
//
// This function fills memory with the specified longword pattern
// 4 bytes at a time.
//
// N.B. This routine assumes that the destination address is aligned
// on a longword boundary and that the length is an even multiple
// of longwords.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
// None.
//
//--
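//
// Illustrative C equivalent (an addition, not part of the original source);
// the routine stores Length / 4 copies of the 32-bit pattern:
//
//     void FillUlongSketch(void *Destination, size_t Length, uint32_t Pattern)
//     {
//         uint32_t *p = (uint32_t *)Destination;
//         size_t i;
//
//         for (i = 0; i < Length / 4; i++) {   // extr.u a1 = a1, 2, 30 computes Length / 4
//             p[i] = Pattern;
//         }
//     }
//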
LEAF_ENTRY(RtlFillMemoryUlong)
        .prologue
        .save ar.lc, t22
        mov t22 = ar.lc
        extr.u a1 = a1, 2, 30
        ;;
        PROLOGUE_END
        cmp.eq pt0, pt1 = zero, a1
        add a1 = -1, a1
        ;;
        nop.m 0
        (pt1) mov ar.lc = a1
        (pt0) br.ret.spnt brp
        ;;
Rfmu10:
        st4 [a0] = a2, 4
        br.cloop.dptk.few Rfmu10
        ;;
        nop.m 0
        mov ar.lc = t22
        br.ret.sptk brp
LEAF_EXIT(RtlFillMemoryUlong)
//++
//
// VOID
// RtlFillMemoryUlonglong (
// IN PVOID Destination,
// IN SIZE_T Length,
// IN ULONGLONG Pattern
// )
//
// Routine Description:
//
// This function fills memory with the specified pattern
// 8 bytes at a time.
//
// N.B. This routine assumes that the destination address is aligned
// on a quadword boundary and that the length is an even multiple
// of quadwords.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to fill.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
// Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
// None.
//
//--
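//
// Illustrative C equivalent (an addition, not part of the original source);
// the routine stores Length / 8 copies of the 64-bit pattern:
//
//     void FillUlonglongSketch(void *Destination, size_t Length, uint64_t Pattern)
//     {
//         uint64_t *p = (uint64_t *)Destination;
//         size_t i;
//
//         for (i = 0; i < Length / 8; i++) {   // extr.u a1 = a1, 3, 29 computes Length / 8
//             p[i] = Pattern;
//         }
//     }
//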
LEAF_ENTRY(RtlFillMemoryUlonglong)
        .prologue
        .save ar.lc, t22
        mov t22 = ar.lc
        extr.u a1 = a1, 3, 29
        ;;
        PROLOGUE_END
        cmp.eq pt0, pt1 = zero, a1
        add a1 = -1, a1
        ;;
        nop.m 0
        (pt1) mov ar.lc = a1
        (pt0) br.ret.spnt brp
        ;;
Rfmul10:
        st8 [a0] = a2, 8
        br.cloop.dptk.few Rfmul10
        ;;
        nop.m 0
        mov ar.lc = t22
        br.ret.sptk brp
        ;;
LEAF_EXIT(RtlFillMemoryUlonglong)
//++
//
// VOID
// RtlZeroMemory (
// IN PVOID Destination,
// IN SIZE_T Length
// )
//
// Routine Description:
//
// This function simply sets up the fill value (out2) and branches
// directly to RtlFillMemory
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the memory to zero.
//
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
// None.
//
//--
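//
// Conceptually the stub below behaves like the following C sketch (an
// illustrative addition, not part of the original source); RtlFillMemory
// is the routine defined earlier in this module:
//
//     void ZeroSketch(void *Destination, size_t Length)
//     {
//         RtlFillMemory(Destination, Length, 0);   // done as a tail branch in the assembly
//     }
//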
LEAF_ENTRY(RtlZeroMemory)
        alloc t22 = ar.pfs, 0, 0, 3, 0
        mov out2 = 0
        br RtlFillMemory
LEAF_EXIT(RtlZeroMemory)
//++
//
// VOID
// RtlMoveMemory (
// IN PVOID Destination,
// IN PVOID Source,
// IN SIZE_T Length
// )
//
// Routine Description:
//
// This function moves memory either forward or backward, aligned or
// unaligned.
//
// Algorithm:
// 1) Length equals zero, return immediately
// 2) Source & Destination don't overlap, copy from low to high
// else copy from high to low address one byte at a time
// 3) if Source & Destination are both 8-byte aligned, copy 8 bytes
// at a time and the remaining bytes are copied one at a time.
// 4) if Source & Destination are both 4-byte aligned, copy 4 bytes
// at a time and the remaining bytes are copied one at a time.
// 5) else copy one byte at a time from low to high address.
//
// Arguments:
//
// Destination (a0) - Supplies a pointer to the destination address of
// the move operation.
//
// Source (a1) - Supplies a pointer to the source address of the move
// operation.
//
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
// None.
//
//--
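//
// Illustrative C outline of the overlap test and copy directions chosen
// below (an addition, not part of the original source; the assembly also
// special-cases 8-byte-aligned, 4-byte-aligned, and differently aligned
// operands for speed):
//
//     void MoveSketch(void *Destination, const void *Source, size_t Length)
//     {
//         unsigned char *d = (unsigned char *)Destination;
//         const unsigned char *s = (const unsigned char *)Source;
//         size_t i;
//
//         if (Length == 0) {                   // step 1: nothing to do
//             return;
//         }
//         if (d > s && d < s + Length) {       // destination overlaps the end of the source:
//             while (Length-- != 0) {          // copy from high to low addresses (CopyDown)
//                 d[Length] = s[Length];
//             }
//         } else {
//             for (i = 0; i < Length; i++) {   // otherwise copy from low to high addresses
//                 d[i] = s[i];
//             }
//         }
//     }
//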
LEAF_ENTRY(memcpy)
ALTERNATE_ENTRY(memmove)
ALTERNATE_ENTRY(RtlMoveMemory)
ALTERNATE_ENTRY(RtlCopyMemory)
ALTERNATE_ENTRY(RtlCopyMemoryNonTemporal)
        .prologue
        .regstk 3,7,0,8
        alloc t17 = ar.pfs,3,31,0,32
        .save pr, r64
        mov r64 = pr
        and t3 = -32, a1
        ;;
        lfetch [t3], 32 //0
        .save ar.lc, r65
        mov.i r65 = ar.lc
        and t1 = 7, a1
        ;;
        .body
        lfetch [t3], 32 //32
        mov v0 = a0
        and t0 = 7, a0
        ;;
        add t21 = a1, a2
        cmp.gtu pt0 = a0, a1
        or t2 = t0, t1
        ;;
        (pt0) cmp.ltu.unc pt0 = a0, t21
        cmp.eq pt1 = zero, a2
        (pt1) br.ret.spnt brp
        lfetch [t3], 32 //64
        cmp.lt pt2 = 16, a2
        (pt0) br.cond.spnt CopyDown
        ;;
        lfetch [t3], 32 //96
        cmp.lt pt6 = 127, a2
        cmp.le pt4 = 8, a2
        ;;
        (pt6) lfetch [t3], 32 //128
        (pt4) cmp.eq.unc pt3 = 0, t2
        (pt4) cmp.eq.unc pt5 = t0, t1
        (pt3) br.cond.sptk QwordMoveUp
        (pt5) br.cond.spnt AlignedMove
        (pt2) br.cond.sptk UnalignedMove
ByteMoveUpLoop:
        ld1 t10 = [a1], 1
        nop.f 0
        add a2 = -1, a2
        ;;
        st1 [a0] = t10, 1
        cmp.ne pt1 = zero, a2
        (pt1) br.cond.sptk ByteMoveUpLoop
        nop.m 0
        nop.f 0
        br.ret.sptk brp
UnalignedMove:
        cmp.eq pt0 = 0, t1
        sub t1 = 8, t1
        (pt0) br.cond.spnt SkipUnalignedMoveByteLoop
        ;;
UnalignedMoveByteLoop:
        ld1 t10 = [a1], 1
        add t1 = -1, t1
        add a2 = -1, a2
        ;;
        st1 [a0] = t10, 1
        cmp.eq p0, pt1 = zero, t1
        (pt1) br.cond.sptk UnalignedMoveByteLoop
        ;;
SkipUnalignedMoveByteLoop:
        and t0 = 7, a0
        mov pr.rot = 3<<16
        or t1 = a1, r0
        ;;
        add t2 = a2, t0
        mov.i ar.ec = 32
        sub t21 = 8, t0
        ;;
        sub t4 = a0, t0
        shr t10 = t2, 3
        shl t21 = t21, 3
        ;;
        ld8 r33 = [t4], 0
        add t10 = -1,t10
        and t2 = 7, t2
        ;;
        cmp.eq pt0 = 2, t0
        cmp.eq pt3 = 4, t0
        cmp.eq pt5 = 6, t0
        ;;
        nop.m 0
        shl r33 = r33,t21 // Prime r39
        mov.i ar.lc = t10
        (pt0) br.cond.spnt SpecialLoop2
        (pt3) br.cond.spnt SpecialLoop4
        (pt5) br.cond.spnt SpecialLoop6
        cmp.eq pt1 = 3, t0
        cmp.eq pt4 = 5, t0
        cmp.eq pt6 = 7, t0
        (pt1) br.cond.spnt SpecialLoop3
        (pt4) br.cond.spnt SpecialLoop5
        (pt6) br.cond.spnt SpecialLoop7
        ;;
SpecialLoop1:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop1E, SpecialLoop1
SpecialLoop1E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,56
        br.ctop.sptk.many SpecialLoop1
        br UnalignedByteDone
SpecialLoop2:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop2E, SpecialLoop2
SpecialLoop2E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,48
        br.ctop.sptk.many SpecialLoop2
        br UnalignedByteDone
SpecialLoop3:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop3E, SpecialLoop3
SpecialLoop3E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,40
        br.ctop.sptk.many SpecialLoop3
        br UnalignedByteDone
SpecialLoop4:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop4E, SpecialLoop4
SpecialLoop4E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,32
        br.ctop.sptk.many SpecialLoop4
        br UnalignedByteDone
SpecialLoop5:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop5E, SpecialLoop5
SpecialLoop5E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,24
        br.ctop.sptk.many SpecialLoop5
        br UnalignedByteDone
SpecialLoop6:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop6E, SpecialLoop6
SpecialLoop6E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,16
        br.ctop.sptk.many SpecialLoop6
        br UnalignedByteDone
SpecialLoop7:
        (p16) ld8 r32 = [t1], 8
        nop.f 0
        brp.sptk.imp SpecialLoop7E, SpecialLoop7
SpecialLoop7E:
        (p48) st8 [t4] = r10, 8
        (p47) shrp r10 = r62,r63,8
        br.ctop.sptk.many SpecialLoop7;;
UnalignedByteDone:
        sub t1 = t1, t0
        mov pr = r64
        mov.i ar.lc = r65
        ;;
        cmp.eq pt0 = zero, t2
        (pt0) br.ret.spnt brp
UnAlignedByteDoneLoop:
        ld1 t10 = [t1], 1
        add t2 = -1, t2
        ;;
        cmp.ne pt1 = zero, t2
        st1 [t4] = t10, 1
        (pt1) br.cond.sptk UnAlignedByteDoneLoop
        br.ret.spnt brp
AlignedMove:
        add t4 = 64, t3
        (pt6) lfetch [t3], 32 //160
        sub t22 = 8, t0
        ;;
        (pt6) lfetch [t3], 64 //192
        (pt6) lfetch [t4], 96 //224
        sub a2 = a2, t22
        ;;
AlignedMoveByteLoop:
        ld1 t10 = [a1], 1
        nop.f 0
        add t22 = -1, t22
        ;;
        st1 [a0] = t10, 1
        cmp.ne pt1 = zero, t22
        (pt1) br.cond.sptk AlignedMoveByteLoop
        ;;
        (pt6) lfetch [t3], 32 //256
        cmp.eq.unc pt0 = zero, a2
        cmp.gt pt2 = 8, a2
        (pt6) lfetch [t4], 128 //320
        (pt0) br.ret.spnt brp
        (pt2) br.cond.sptk ByteMoveUpLoop
        ;;
//
// both src & dest are now 8-byte aligned
//
QwordMoveUp:
        add t3 = 128, a1
        add t4 = 288, a1
        add t7 = 8, a1
        add t8 = 8, a0
        cmp.gt pt3 = 64, a2
        (pt3) br.cond.spnt QwordMoveUpLoop
        ;;
UnrolledQwordMoveUpLoop:
        ld8 t10 = [a1], 16
        ld8 t11 = [t7], 16
        add a2 = -64, a2
        ;;
        ld8 t12 = [a1], 16
        ld8 t13 = [t7], 16
        cmp.le pt3 = 128, a2
        ;;
        ld8 t14 = [a1], 16
        ld8 t15 = [t7], 16
        cmp.gt pt2 = 8, a2
        ;;
        ld8 t16 = [a1], 16
        ld8 t17 = [t7], 16
        ;;
        (pt3) lfetch [t3], 64
        (pt3) lfetch [t4], 64
        st8 [a0] = t10, 16
        st8 [t8] = t11, 16
        ;;
        st8 [a0] = t12, 16
        st8 [t8] = t13, 16
        ;;
        st8 [a0] = t14, 16
        st8 [t8] = t15, 16
        ;;
        st8 [a0] = t16, 16
        st8 [t8] = t17, 16
        (pt3) br.cond.dptk UnrolledQwordMoveUpLoop
        (pt2) br.cond.spnt ByteMoveUp
        ;;
QwordMoveUpLoop:
        ld8 t10 = [a1], 8
        add a2 = -8, a2
        ;;
        cmp.le pt1 = 8, a2
        st8 [a0] = t10, 8
        (pt1) br.cond.sptk QwordMoveUpLoop
        ;;
ByteMoveUp:
        cmp.eq pt0 = zero, a2
        (pt0) br.ret.spnt brp
        ;;
AlignedByteDoneLoop:
        ld1 t10 = [a1], 1
        add a2 = -1, a2
        ;;
        cmp.ne pt1 = zero, a2
        st1 [a0] = t10, 1
        (pt1) br.cond.sptk AlignedByteDoneLoop
        br.ret.spnt brp
        ;;
CopyDown:
        cmp.eq pt0 = zero, a2
        cmp.ne pt6 = t0, t1
        (pt0) br.ret.spnt brp // return if length is zero
        cmp.gt pt4 = 16, a2
        add t20 = a2, a0
        add t21 = a2, a1
        nop.m 0
        (pt4) br.cond.sptk ByteMoveDown // less than 16 bytes to copy
        (pt6) br.cond.spnt UnalignedMoveDown // incompatible alignment
        ;;
        nop.m 0
        nop.m 0
        and t22 = 0x7, t21
        ;;
        add t20 = -1, t20
        add t21 = -1, t21
        sub a2 = a2, t22
        ;;
TailMove:
        cmp.eq pt0, pt1 = zero, t22
        ;;
        (pt1) ld1 t10 = [t21], -1
        (pt1) add t22 = -1, t22
        ;;
        (pt1) st1 [t20] = t10, -1
        (pt1) br.cond.sptk TailMove
Block8Move:
        nop.m 0
        add t20 = -7, t20
        add t21 = -7, t21
        ;;
Block8MoveLoop:
        cmp.gt pt5, pt6 = 8, a2
        ;;
        (pt6) ld8 t10 = [t21], -8
        (pt6) add a2 = -8, a2
        ;;
        (pt6) st8 [t20] = t10, -8
        (pt6) br.cond.sptk Block8MoveLoop
        add t20 = 8, t20 // adjust dest
        add t21 = 8, t21 // adjust source
        br.cond.sptk ByteMoveDown
        ;;
UnalignedMoveDown:
        and t1 = 7, t21
        ;;
        cmp.eq pt0 = 0, t1
        (pt0) br.cond.spnt SkipUnalignedMoveDownByteLoop
        ;;
        add t20 = -1, t20
        add t21 = -1, t21
        ;;
UnalignedMoveDownByteLoop:
        ld1 t10 = [t21], -1
        add t1 = -1, t1
        add a2 = -1, a2
        ;;
        st1 [t20] = t10, -1
        cmp.eq p0, pt1 = zero, t1
        (pt1) br.cond.sptk UnalignedMoveDownByteLoop
        ;;
        add t20 = 1, t20
        add t21 = 1, t21
        ;;
SkipUnalignedMoveDownByteLoop:
        add t21 = -8, t21
        ;;
        and t0 = 7, t20
        mov pr.rot = 3<<16
        or t1 = t21, r0
        ;;
        sub t7 = 8, t0
        ;;
        add t2 = a2, t7
        mov.i ar.ec = 32
        ;;
        sub t4 = t20, t0
        shr t10 = t2, 3
        shl t6 = t0, 3
        ;;
        ld8 r33 = [t4], 0
        add t10 = -1,t10
        and t2 = 7, t2
        ;;
        cmp.eq pt0 = 2, t0
        cmp.eq pt3 = 4, t0
        cmp.eq pt5 = 6, t0
        ;;
        shr r33 = r33,t6 // Prime r39
        mov.i ar.lc = t10
        (pt0) br.cond.spnt SpecialLoopDown2
        (pt3) br.cond.spnt SpecialLoopDown4
        (pt5) br.cond.spnt SpecialLoopDown6
        cmp.eq pt1 = 3, t0
        cmp.eq pt4 = 5, t0
        cmp.eq pt6 = 7, t0
        (pt1) br.cond.spnt SpecialLoopDown3
        (pt4) br.cond.spnt SpecialLoopDown5
        (pt6) br.cond.spnt SpecialLoopDown7
        ;;
SpecialLoopDown1:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown1E, SpecialLoopDown1
SpecialLoopDown1E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,56
        br.ctop.sptk.many SpecialLoopDown1
        br UnalignedByteDownDone
SpecialLoopDown2:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown2E, SpecialLoopDown2
SpecialLoopDown2E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,48
        br.ctop.sptk.many SpecialLoopDown2
        br UnalignedByteDownDone
SpecialLoopDown3:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown3E, SpecialLoopDown3
SpecialLoopDown3E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,40
        br.ctop.sptk.many SpecialLoopDown3
        br UnalignedByteDownDone
SpecialLoopDown4:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown4E, SpecialLoopDown4
SpecialLoopDown4E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,32
        br.ctop.sptk.many SpecialLoopDown4
        br UnalignedByteDownDone
SpecialLoopDown5:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown5E, SpecialLoopDown5
SpecialLoopDown5E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,24
        br.ctop.sptk.many SpecialLoopDown5
        br UnalignedByteDownDone
SpecialLoopDown6:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown6E, SpecialLoopDown6
SpecialLoopDown6E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,16
        br.ctop.sptk.many SpecialLoopDown6
        br UnalignedByteDownDone
SpecialLoopDown7:
        (p16) ld8 r32 = [t1], -8
        nop.f 0
        brp.sptk.imp SpecialLoopDown7E, SpecialLoopDown7
SpecialLoopDown7E:
        (p48) st8 [t4] = r10, -8
        (p47) shrp r10 = r63,r62,8
        br.ctop.sptk.many SpecialLoopDown7;;
UnalignedByteDownDone:
        add t1 = 7, t1
        add t4 = 7, t4
        ;;
        add t1 = t1, t7
        mov pr = r64
        mov.i ar.lc = r65
        ;;
        cmp.eq pt0 = zero, t2
        (pt0) br.ret.spnt brp
        ;;
UnAlignedByteDoneDownLoop:
        ld1 t10 = [t1], -1
        add t2 = -1, t2
        ;;
        cmp.ne pt1 = zero, t2
        st1 [t4] = t10, -1
        (pt1) br.cond.sptk UnAlignedByteDoneDownLoop
        br.ret.spnt brp
ByteMoveDown:
        nop.m 0
        add t20 = -1, t20 // adjust destination
        add t21 = -1, t21 // adjust source
        ;;
ByteMoveDownLoop:
        cmp.ne pt1 = zero, a2
        ;;
        (pt1) ld1 t10 = [t21], -1
        (pt1) add a2 = -1, a2
        ;;
        (pt1) st1 [t20] = t10, -1
        (pt1) br.cond.sptk ByteMoveDownLoop
        br.ret.spnt brp
        ;;
LEAF_EXIT(RtlMoveMemory)
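//
// RtlCompareMemory (below) compares two blocks of memory byte by byte and
// returns the number of leading bytes that compare equal. A minimal C
// sketch of that behavior (an illustrative addition, not part of the
// original source):
//
//     size_t CompareSketch(const void *Source1, const void *Source2, size_t Length)
//     {
//         const unsigned char *a = (const unsigned char *)Source1;
//         const unsigned char *b = (const unsigned char *)Source2;
//         size_t i = 0;
//
//         while (i < Length && a[i] == b[i]) {
//             i++;
//         }
//         return i;   // count of matching bytes, returned in v0 by the assembly
//     }
//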
LEAF_ENTRY(RtlCompareMemory)
        cmp.eq pt0 = 0, a2
        mov v0 = 0
        (pt0) br.ret.spnt.many brp
        ;;
        add t2 = -1, a2
Rcmp10:
        ld1 t0 = [a0], 1
        ld1 t1 = [a1], 1
        ;;
        cmp4.eq pt2 = t0, t1
        ;;
        (pt2) cmp.ne.unc pt1 = v0, t2
        (pt2) add v0 = 1, v0
        (pt1) br.cond.dptk.few Rcmp10
        br.ret.sptk.many brp
LEAF_EXIT(RtlCompareMemory)
//++
//
// VOID
// RtlCopyIa64FloatRegisterContext (
// PFLOAT128 Destination,
// PFLOAT128 Source,
// ULONGLONG Length
// )
//
// Routine Description:
//
// This routine copies floating point context from one place to
// another. It assumes both the source and the destination are
// 16-byte aligned and the buffer contains only a memory image of
// floating point registers. Note that Length must be greater
// than 0 and a multiple of 16.
//
// Arguments:
//
// a0 - Destination
// a1 - Source
// a2 - Length
//
// Return Value:
//
// None.
//
//--
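//
// Illustrative C equivalent (an addition, not part of the original source);
// given the stated alignment and length rules, the routine copies
// Length / 16 FLOAT128 register images:
//
//     void CopyFloatContextSketch(FLOAT128 *Destination,
//                                 const FLOAT128 *Source,
//                                 unsigned long long Length)
//     {
//         unsigned long long i;
//
//         for (i = 0; i < Length / 16; i++) {
//             Destination[i] = Source[i];   // one 16-byte register image per iteration
//         }
//     }
//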
NESTED_ENTRY(RtlCopyIa64FloatRegisterContext)
        .prologue
        .save ar.lc, t22
        mov t22 = ar.lc
        shr t0 = a2, 4
        ;;
        cmp.gtu pt0, pt1 = 16, a2
        add t0 = -1, t0
        ;;
        PROLOGUE_END
        (pt1) mov ar.lc = t0
        (pt0) br.ret.spnt brp
Rcf10:
        ldf.fill ft0 = [a1], 16
        nop.m 0
        nop.i 0
        ;;
        stf.spill [a0] = ft0, 16
        nop.i 0
        br.cloop.dptk Rcf10
        ;;
        nop.m 0
        mov ar.lc = t22
        br.ret.sptk brp
        ;;
NESTED_EXIT(RtlCopyIa64FloatRegisterContext)
NESTED_ENTRY(RtlpCopyContextSubSet)
        .prologue
        .save ar.lc, t22
        mov t22 = ar.lc
        mov t0 = a0
        mov t1 = a1
        ;;
        PROLOGUE_END
        ld8 t3 = [t1], CxFltS0
        ;;
        st8 [t0] = t3, CxFltS0
        mov t2 = 3
        add t10 = CxFltS4, a0
        add t11 = CxFltS4, a1
        ;;
        mov ar.lc = t2
Rcc10:
        ldf.fill ft0 = [t1], 16
        ;;
        stf.spill [t0] = ft0, 16
        mov t2 = 15
        br.cloop.dptk.few Rcc10
        ;;
        mov t0 = CxStIFS
        mov t1 = CxStFPSR
        mov ar.lc = t2
Rcc20:
        ldf.fill ft0 = [t11], 16
        ;;
        stf.spill [t10] = ft0, 16
        sub t2 = t0, t1
        br.cloop.dptk.few Rcc20
        ;;
        add t11 = CxStFPSR, a1
        add t10 = CxStFPSR, a0
        shr t2 = t2, 3
        ;;
        mov ar.lc = t2
        ;;
Rcc30:
        ld8 t0 = [t11], 8
        ;;
        st8 [t10] = t0, 8
        nop.i 0
        br.cloop.dptk.few Rcc30
        ;;
        nop.m 0
        mov ar.lc = t22
        br.ret.sptk brp
NESTED_EXIT(RtlpCopyContextSubSet)
//++
//
// VOID
// RtlPrefetchMemoryNonTemporal (
// IN PVOID Source,
// IN SIZE_T Length
// )
//
// Routine Description:
//
// This routine prefetches memory at Source, for Length bytes into
// the closest cache to the processor.
//
// N.B. Currently this code assumes a line size of 32 bytes. At
// some stage it should be altered to determine and use the processor's
// actual line size.
//
// Arguments:
//
// a0 - Source
// a1 - Length
//
// Return Value:
//
// None.
//
//--
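//
// Illustrative C sketch (an addition, not part of the original source). It
// uses the GCC/Clang __builtin_prefetch intrinsic as a stand-in for the
// lfetch.nta instruction and keeps the same 32-byte line-size assumption
// noted above:
//
//     void PrefetchSketch(const void *Source, size_t Length)
//     {
//         const char *p = (const char *)Source;
//         size_t i;
//
//         for (i = 0; i < Length; i += 32) {
//             __builtin_prefetch(p + i, 0, 0);   // read prefetch, non-temporal
//         }
//     }
//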
LEAF_ENTRY(RtlPrefetchMemoryNonTemporal)
        .prologue
        lfetch.nta [a0], 32 // get first line coming
        .save ar.lc, t0
        mov.i t0 = ar.lc // save loop counter
        shr a1 = a1, 5 // determine loop count
        ;;
        .body
        add t2 = -1, a1 // subtract out already fetched line
        cmp.lt pt0, pt1 = 2, a1 // check if less than one line to fetch
        ;;
        (pt0) mov ar.lc = t2 // set loop count
        (pt1) br.ret.spnt.few brp // return if no more lines to fetch
        ;;
Rpmnt10:
        lfetch.nta [a0], 32 // fetch next line
        br.cloop.dptk.many Rpmnt10 // loop while more lines to fetch
        ;;
        mov ar.lc = t0 // restore loop counter
        br.ret.sptk.many brp // return
LEAF_EXIT(RtlPrefetchMemoryNonTemporal)