Windows NT 4.0 source code leak
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

667 lines
30 KiB

4 years ago
  1. #++
  2. # Copyright 1991, 1994, Digital Equipment Corporation
  3. #
  4. # ots_movem(char *dstptr INOUT, long dstlen INOUT,
  5. # char *srcptr, long srclen)
  6. #
  7. # Move dstlen characters from *srcptr to *dstptr, possibly overlapping
  8. #
  9. # Special conventions: No stack space, r16-r21 and r27-r28 ONLY,
  10. # no linkage pointer required, r16 is INOUT and points to the address
  11. # following the move, r17 is INOUT and has the remaining destination
  12. # length following the move.
  13. # (Warning: The auto-loader potentially takes some regs across
  14. # the call if this is being used in a shared lib. environment.)
  15. #
  16. # This is a GEM support routine for moving (possibly overlapping) memory
  17. # from one address to another. This is optimized for extremely high
  18. # performance both for small blocks and large moves. In order to reduce
  19. # overhead for small cases, they are retired as quickly as possible,
  20. # more case analysis is reserved for cases which will do more. Note
  21. # that while overlapping moves are supported, (unlike Sys V memcpy)
  22. # routines), they are not quite as fast.
  23. #
  24. # Warning - This code is basically "expanded microcode". Since it is
  25. # executed so frequently in many contexts, it has been extensively "hand-
  26. # optimized"...
  27. #
  28. # Note that this routine and ots_move are basically similar in many
  29. # respects (same basic code), so maintenance should be done both
  30. # places. This routine is primarily provided for lower overhead (for
  31. # short strings).
  32. # [Except for the first few instructions, the recipe for creating OTS_MOVEM
  33. # from OTS_MOVE is to change uses of R19->R21 and then R17->R19.]
  34. #
  35. # This version of OTS_MOVEM provides longword granularity.
  36. #
  37. # 015 1 Sep 1994 WBN Longword granularity version, based on
  38. # OTS_MOVEM_ALPHA.M64 version 014 and
  39. # OTS_MOVE_ALPHA_WNT.M64 version 015.
  40. #--
  41. #include "ots_defs.hs"
  42. # r16 = dst --> r16 = end
  43. # r19 = dst_len --> r17 = remaining
  44. # r18 = src
  45. # r19 = src_len
  46. # destroys r18-r21, r27-r28
  47. .globl _OtsMoveMinimum
  48. .ent _OtsMoveMinimum
  49. _OtsMoveMinimum:
  50. .set noat
  51. .set noreorder
  52. .frame sp,0,r26
  53. .prologue 0
  54. subq r17, r19, r20 # Which length is larger?
  55. cmovlt r20, r17, r19 # Min to r19
  56. andnot r16, 3, r21 # LW-aligned dst pointer
  57. subq r19, 4, r20 # Get length-4
  58. beq r19, done # No memory accesses if length=0
  59. ldq_u r28, (r18) # Load first QW of source
  60. addq r19, r18, r27 # Point to end of source
  61. subq r17, r19, r17 # Set remaining length for return
  62. bge r20, geq4 # Go handle lengths >= 4
  63. ldq_u r27, -1(r27) # Load last QW of source
  64. and r16, 3, r16 # Get dst alignment within LW
  65. ldl r19, (r21) # Load first LW of destination
  66. addq r20, r16, r20 # Get alignment+length-4
  67. extql r28, r18, r28 # Extract first bytes of source
  68. bgt r20, double # Go handle LW crossing
  69. extqh r27, r18, r27 # Extract last bytes of source
  70. addq r20, 4, r20 # Get ending alignment in LW
  71. or r27, r28, r28 # Combine halves of source
  72. insql r28, r16, r28 # Position low part of source
  73. mskql r19, r16, r18 # Keep low bytes of destination
  74. mskql r28, r20, r28 # Trim off high bytes of source
  75. mskqh r19, r20, r19 # Keep high bytes of destination
  76. or r18, r28, r28 # Combine source with low dest
  77. or r19, r28, r28 # Combine with high dest
  78. stl r28, (r21) # Store to destination
  79. addq r21, r20, r16 # Point to end of dest for return
  80. ret r31, (r26)
  81. double: extqh r27, r18, r27 # Extract last bytes of source
  82. ldl r18, 4(r21) # Load second LW of destination
  83. mskql r19, r16, r19 # Keep low bytes of 1st dest LW
  84. or r27, r28, r28 # Combine parts of source
  85. insql r28, r16, r27 # Position start of source
  86. addq r16, 4, r16 # Compute virtual start in LW
  87. insqh r28, r16, r28 # Position end of source
  88. addq r21, 4, r21 # Prepare to compute end address
  89. mskqh r18, r20, r18 # Keep high bytes of 2nd dest LW
  90. mskql r28, r20, r28 # Trim end of source to length
  91. or r27, r19, r19 # Combine low source with 1st LW
  92. stl r19, -4(r21)
  93. or r28, r18, r18 # Combine high source with 2nd LW
  94. stl r18, (r21)
  95. addq r21, r20, r16 # Point to end of dest for return
  96. done: ret r31, (r26)
  97. # Come here to move >= 4 bytes.
  98. #
  99. # r16-> dst
  100. # r17 = remaining length for return
  101. # r18-> src
  102. # r19 = length
  103. # r20 = len-4
  104. # r21-> LW-aligned dst
  105. # r27 = src+len
  106. # r28 = first src QW
  107. geq4: subq r20, 4, r19 # At least 8 bytes to move?
  108. subq r16, r27, r27 # Check if dst >= src+len
  109. blt r19, lss8 # Move 4..7 bytes
  110. subq r18, r16, r19 # Check if src >= dst
  111. bge r27, ok1 # Forward OK if whole src precedes dst
  112. blt r19, reverse # Go backwards if src < dst < src+len
  113. ok1: and r16, 7, r16
  114. addq r16, r20, r27 # Alignment + length - 4
  115. bne r16, part # Part of first QW to be skipped
  116. subq r20, 4, r20 # At least 8 bytes to be stored?
  117. beq r27, simple # Only low LW to be stored
  118. and r18, 7, r27 # Is src address now aligned?
  119. blt r20, shortq # Dst ends in first QW
  120. subq r20, 32, r19 # At least 4 quadwords left to move?
  121. beq r27, align # Go handle matching alignment
  122. # Src alignment differs from dst alignment.
  123. # r16 = dst alignment
  124. # r17 = remaining length for return
  125. # r18 = src-8 after 1st move
  126. # r19
  127. # r20 = initial length-8
  128. # r21 = initial dst
  129. # r27 = dst QW if dst wasn't aligned
  130. # r28 = source QW
  131. misal: or r16, r21, r21 # Put alignment back with dst ptr ***
  132. ldq_u r19, 8(r18) # Load same or next source QW
  133. extql r28, r18, r28 # Get first part of source to store
  134. addq r20, r16, r20 # Adjust length for partial move
  135. mskql r27, r21, r27 # Trim destination for merge
  136. extqh r19, r18, r16 # Get second part of source
  137. subq r20, 24, r20 # At least 4 more quadwords?
  138. or r28, r16, r28 # Combine pieces of source
  139. mskqh r28, r21, r28 # Trim low junk off source
  140. andnot r21, 7, r21 # Adjust dst for partial move
  141. bge r20, unrol2 # Taken branch for long strings
  142. addq r20, 16, r16 # Add back: how many whole QW's?
  143. nop
  144. short2: and r20, 7, r20 # How many odd bytes?
  145. blt r16, last # Skip if no more whole QW's
  146. or r28, r27, r28 # Combine pieces
  147. stq r28, (r21)
  148. extql r19, r18, r27 # Get last part of prior src QW
  149. ldq_u r19, 16(r18) # Load another src QW
  150. addq r21, 8, r21 # Update dst
  151. subq r16, 8, r16 # More whole QW's?
  152. addq r18, 8, r18 # Update src
  153. blt r16, lastx # Skip if no more whole QWs
  154. extqh r19, r18, r28 # Get first part of this src QW
  155. addq r18, 8, r18 # Update src again
  156. or r28, r27, r28 # Combine pieces
  157. stq r28, (r21)
  158. extql r19, r18, r27 # Get last part of this src QW
  159. ldq_u r19, 8(r18) # Load another src QW
  160. addq r21, 8, r21 # Update dst
  161. lastx: extqh r19, r18, r28 # Get first part of this src QW
  162. last: addq r18, r20, r16 # Point to end-8 of src
  163. beq r20, done_u # Skip if no odd bytes
  164. or r28, r27, r28 # Combine parts of last whole QW
  165. ldq_u r27, 7(r16) # Load final (maybe same) src QW
  166. subq r20, 4, r16 # More than 4 bytes left?
  167. stq r28, (r21) # Store last whole QW
  168. extql r19, r18, r19 # Get last part of this src QW
  169. extqh r27, r18, r27 # Get what we need from final src QW
  170. joinx: ldq r28, 8(r21) # Load last QW of destination
  171. or r19, r27, r27 # Combine pieces of source
  172. mskql r27, r20, r27 # Trim to length
  173. mskqh r28, r20, r28 # Make room in destination
  174. bgt r16, done_u # Go store a whole QW
  175. addq r20, 8, r20 # Increment length for return
  176. or r28, r27, r28 # Insert src into dst
  177. stl r28, 8(r21) # Final LW
  178. addq r21, r20, r16 # Point to end of dst for return
  179. ret r31, (r26)
  180. # Come here to move 4 thru 7 bytes.
  181. #
  182. lss8: addq r18, r19, r27 # Recover src+len-8
  183. and r16, 3, r16 # Dst alignment within LW
  184. ldq_u r27, 7(r27) # Load last part of source
  185. extql r28, r18, r28 # Extract first part of source
  186. beq r16, lw # Handle LW-aligned dst
  187. extqh r27, r18, r27 # Extract last part of source
  188. ldl r18, (r21) # Load first LW of dst
  189. addq r16, r20, r20 # align+len-4 of dst
  190. or r28, r27, r28 # Complete source
  191. mskql r28, r19, r28 # Trim source to length
  192. mskql r18, r16, r18 # Make room in dst
  193. insql r28, r16, r27 # Position src like dst
  194. addq r16, r19, r19 # Align+len-8 of dst
  195. or r27, r18, r18 # Merge
  196. stl r18, (r21) # Store first LW of dst
  197. extql r27, 4, r27 # Position next LW of src
  198. blt r19, zz # Skip if not a whole LW
  199. stl r27, 4(r21) # Store the whole LW
  200. addq r21, 4, r21 # Adjust pointer
  201. subq r20, 4, r20 # Adjust ending alignment
  202. beq r19, donezz # Exit if done
  203. insqh r28, r16, r27 # Position remainder of src
  204. zz: ldl r28, 4(r21) # Load last dst LW
  205. mskqh r28, r20, r28 # Make room in dst
  206. or r28, r27, r27 # Merge
  207. stl r27, 4(r21) # Final store
  208. donezz: addq r21, r20, r16 # End address -4
  209. addq r16, 4, r16
  210. ret r31, (r26)
  211. lw: extqh r27, r18, r27 # Extract last part of source
  212. addq r21, 4, r16 # Adjust for return value
  213. beq r20, lwdone # Skip if exactly 4 bytes
  214. ldl r19, 4(r21) # Load next dst LW
  215. or r27, r28, r28 # Complete source
  216. stl r28, (r21) # Store first LW
  217. extql r28, 4, r28 # Position rest of source
  218. mskqh r19, r20, r27 # Make room in dst
  219. mskql r28, r20, r28 # Trim src
  220. or r27, r28, r28 # Merge
  221. stl r28, 4(r21)
  222. addq r16, r20, r16 # Update return value
  223. ret r31, (r26)
  224. lwdone: or r27, r28, r28 # Merge
  225. stl r28, (r21)
  226. ret r31, (r26)
  227. # Move 4 bytes to an aligned LW.
  228. #
  229. simple: ldq_u r27, 3(r18) # Load last QW of source
  230. extql r28, r18, r28 # Position first QW
  231. addq r21, 4, r16 # Point to end of dst for return
  232. extqh r27, r18, r27 # Position last QW
  233. or r28, r27, r28 # Merge
  234. stl r28, (r21) # Store
  235. ret r31, (r26)
  236. # Dst is not aligned. Check whether first write is to a LW or a QW,
  237. # and whether that finishes the move. Then see if src alignment
  238. # matches, and read/rewrite the first dst quadword.
  239. #
  240. # r16 = dst alignment in QW
  241. # r17 = remaining length for return
  242. # r18-> src
  243. # r19
  244. # r20 = len-4
  245. # r21-> LW-aligned dst
  246. # r27 = QW_alignment + length - 4
  247. # r28 = first src QW
  248. #.align quad
  249. part: subq r27, 4, r19 # Does dst end in first QW?
  250. ldq_u r27, (r21) # Load first dst QW
  251. blt r19, shortu # Go handle short store
  252. and r16, 4, r19 # Does it start in high LW?
  253. subq r18, r16, r18 # Adjust src for this partial move
  254. beq r19, quad # Whole QW to be touched
  255. extql r28, r18, r19 # Position first part of source
  256. ldq_u r28, 7(r18) # Get next (or same) src QW
  257. mskql r27, r16, r27 # Trim destination for merge
  258. addq r20, r16, r20 # Len + alignment...
  259. extqh r28, r18, r28 # Position second part of source
  260. subq r20, 4, r20 # Len+alignment-8 = remaining len
  261. or r28, r19, r28 # Pieces of source
  262. mskqh r28, r16, r19 # Trim junk preceding source
  263. ldq_u r28, 7(r18) # Get src QW again **
  264. or r27, r19, r19 # Combine other source piece
  265. extql r19, 4, r19 # Get the high LW
  266. stl r19, (r21) # Store just that
  267. # Now at a QW boundary. Is there a QW left to store?
  268. # Is the source QW aligned?
  269. andnot r21, 7, r21 # Adjust dst pointer to next-8
  270. subq r20, 8, r19 # Got a QW more?
  271. and r18, 7, r27 # Src aligned?
  272. blt r19, short3 # Too short
  273. addq r21, 8, r21
  274. subq r20, 8, r20
  275. ldq_u r28, 8(r18)
  276. addq r18, 8, r18
  277. subq r20, 32, r19 # Prepare for unrolled loop
  278. beq r27, align # Alignment matches
  279. or r31, r31, r27
  280. or r31, r31, r16
  281. br r31, misal
  282. shortu: addq r18, r20, r20 # Point to end-4 of src
  283. ldq_u r20, 3(r20) # Get last QW of source
  284. extql r28, r18, r28 # Fetch first QW of source
  285. extqh r20, r18, r20 # Fetch last QW of source
  286. mskql r27, r16, r18 # Clear from start thru end of dst
  287. mskqh r27, r19, r27 # Clear from 0 to end of dst
  288. or r28, r20, r28 # Combine src pieces
  289. insql r28, r16, r28 # Position src
  290. or r27, r18, r27 # Combine dst pieces
  291. mskql r28, r19, r28 # Trim src
  292. addq r21, r19, r20 # Final pointer for return
  293. or r28, r27, r28 # Merge src & dst
  294. stq_u r28, (r21) # Store it
  295. addq r20, 8, r16
  296. ret r31, (r26)
  297. quad: and r18, 7, r19 # Is src address now aligned?
  298. subq r20, 4, r20 # Get length-8
  299. bne r19, misal # Go handle mismatched alignment
  300. mskqh r28, r16, r28 # Keep desired part of source
  301. addq r20, r16, r20 # Adjust count for this partial move
  302. mskql r27, r16, r27 # Keep desired part of destination QW
  303. subq r20, 32, r19 # At least 4 quadwords left to move?
  304. or r27, r28, r28 # Merge source and destination
  305. # Src alignment matches.
  306. # r16
  307. # r17 = remaining length for return
  308. # r18 = next src pointer -8
  309. # r19 = remaining length -32
  310. # r20
  311. # r21 = dst pointer
  312. # r27
  313. # r28 = dst quadword
  314. align: and r19, 24, r20 # How many after a multiple of 4?
  315. bge r19, unrol1 # Taken branch for long strings
  316. nop
  317. short1: and r19, 7, r19 # How many odd bytes?
  318. beq r20, last28 # Skip if no more whole QWs after r28
  319. ldq r27, 8(r18) # Load next QW
  320. addq r18, 8, r18
  321. stq r28, (r21) # Store prior QW
  322. subq r20, 16, r20 # Map 8/16/24 to -8/0/8
  323. addq r21, 8, r21
  324. blt r20, last27 # Skip if no more after r27
  325. ldq r28, 8(r18) # Load next QW
  326. addq r18, 8, r18
  327. stq r27, (r21) # Store prior QW
  328. addq r21, 8, r21
  329. nop
  330. beq r20, last28
  331. ldq r27, 8(r18) # Load next QW
  332. addq r18, 8, r18
  333. stq r28, (r21) # Store prior QW
  334. addq r21, 8, r21
  335. last27: beq r19, done27 # Skip if no odd bytes
  336. ldq r28, 8(r18) # Load one more src QW
  337. ldq r18, 8(r21) # Load last destination QW
  338. subq r19, 4, r16 # More than 4 bytes to store?
  339. stq r27, (r21) # Store prior QW
  340. mskql r28, r19, r27 # Trim source
  341. mskqh r18, r19, r18 # Trim destination
  342. ble r16, lastl # Go store just a LW
  343. lastq: addq r21, r19, r21 # End-8 of dst for return
  344. or r27, r18, r27 # Merge src & dst
  345. done27: stq_u r27, 7(r21) # Store last destination QW
  346. addq r21, 8, r16 # End of dst for return
  347. ret r31, (r26)
  348. short3: addq r18, r20, r16 # Point to end-8 of src
  349. beq r20, donexx # Completely done?
  350. ldq_u r19, 7(r16) # Load final src QW
  351. subq r20, 4, r16 # Got more than a LW?
  352. beq r27, joinx # Don't include prior src if aligned
  353. extql r28, r18, r27 # Last part of prior src QW
  354. extqh r19, r18, r19 # First part of this src QW
  355. br joinx
  356. donexx: addq r21, r20, r16
  357. addq r16, 8, r16
  358. ret r31, (r26)
  359. last28: beq r19, done28 # Skip if no odd bytes
  360. ldq r27, 8(r18) # Load one more src QW
  361. ldq r18, 8(r21) # Load last destination QW
  362. subq r19, 4, r16 # More than 4 bytes to store?
  363. stq r28, (r21) # Store prior QW
  364. mskql r27, r19, r27 # Trim source
  365. mskqh r18, r19, r18 # Trim destination
  366. bgt r16, lastq # Go store a QW
  367. lastl: addq r19, 8, r19 # Increment length for return
  368. or r27, r18, r27 # Merge src & dst
  369. stl r27, 8(r21) # Store last destination LW
  370. addq r21, r19, r16 # End of dst for return
  371. ret r31, (r26)
  372. shortq: addq r18, r20, r16 # Point to end-8 of src
  373. ldq r27, (r21) # Get dst QW
  374. extql r28, r18, r28 # Position first src QW
  375. ldq_u r19, 7(r16) # Get last QW of src
  376. mskqh r27, r20, r27 # Mask dst QW
  377. extqh r19, r18, r19 # Position last src QW
  378. or r19, r28, r28 # Merge
  379. mskql r28, r20, r28 # Trim src QW
  380. done_u: addq r21, r20, r21 # End-8 of dst for return
  381. or r28, r27, r28 # Combine pieces
  382. done28: stq_u r28, 7(r21) # Store last destination QW
  383. addq r21, 8, r16 # End of dst for return
  384. ret r31, (r26)
  385. # Unrolled loop for long moves with matching alignment within QW.
  386. # Each iteration moves two cache blocks.
  387. # We try to schedule the cache misses to avoid a double miss
  388. # in EV4 pass 2.1 chips. If the source alignment within a cache
  389. # block is exactly 3, alter it, since that case runs slower.
  390. #
  391. # R16
  392. # R17 = remaining length for return
  393. # R18 = src pointer
  394. # R19 = remaining length (to load) - 32
  395. # R20 = length & 24 (needed at return)
  396. # R21 = dst pointer
  397. # R27
  398. # R28 = QW from 0(R18) to store at 0(R21), both on input and at return
  399. #
  400. #.align quad
  401. unrol1: ldq r27, 32(r18) # Cache miss here; later loads hit.
  402. subq r19, 48, r16 # Six more quadwords?
  403. and r18, 16, r20 # Starting in 2nd half of cache block?
  404. blt r16, uent1 # If not 6 more, don't adjust.
  405. ldq r16, 8(r18)
  406. beq r20, utop1 # If in 1st half, don't adjust.
  407. ldq r27, 48(r18) # Cache miss here
  408. addq r18, 16, r18
  409. stq r28, (r21) # Adjust by going ahead 1/2 block.
  410. addq r21, 16, r21
  411. ldq r28, (r18)
  412. subq r19, 16, r19
  413. stq r16, -8(r21)
  414. nop
  415. ldq r16, 8(r18)
  416. utop1: subq r19, 32, r19
  417. uloop1: ldq r20, 64(r18) # Cache miss here
  418. stq r28, (r21)
  419. ldq r28, 16(r18)
  420. stq r16, 8(r21)
  421. ldq r16, 24(r18)
  422. addq r18, 64, r18
  423. stq r28, 16(r21)
  424. mov r20, r28
  425. stq r16, 24(r21)
  426. addq r21, 64, r21
  427. ldq r20, -24(r18)
  428. subq r19, 32, r19
  429. blt r19, uexit1
  430. ldq r16, 32(r18) # Cache miss here
  431. stq r27, -32(r21)
  432. ldq r27, -16(r18)
  433. stq r20, -24(r21)
  434. ldq r20, -8(r18)
  435. stq r27, -16(r21)
  436. mov r16, r27
  437. stq r20, -8(r21)
  438. uent1: subq r19, 32, r19
  439. ldq r16, 8(r18)
  440. bge r19, uloop1
  441. # finish last block of 4 quadwords
  442. #
  443. ubot1: stq r28, (r21)
  444. mov r27, r28 # Position last QW for return
  445. ldq r27, 16(r18)
  446. addq r18, 32, r18
  447. stq r16, 8(r21)
  448. addq r21, 32, r21
  449. uex1a: ldq r16, -8(r18)
  450. and r19, 24, r20 # Recover count of remaining QW's
  451. stq r27, -16(r21)
  452. stq r16, -8(r21)
  453. br r31, short1
  454. nop
  455. uexit1: stq r27, -32(r21) # Here if exit from middle of loop
  456. ldq r27, -16(r18)
  457. stq r20, -24(r21)
  458. br r31, uex1a # Join common exit sequence
  459. #.align quad
  460. unrol2: ldq_u r16, 16(r18) # Load next src QW
  461. extql r19, r18, r19 # Get last part of prior one
  462. or r28, r27, r28 # Combine pieces
  463. stq r28, (r21) # Store prior dst QW
  464. subq r20, 24, r20 # Update loop counter
  465. extqh r16, r18, r28 # Get first part of a src QW
  466. ldq_u r27, 24(r18) # Load next src QW
  467. extql r16, r18, r16 # Get last part of prior one
  468. or r28, r19, r28 # Combine pieces
  469. stq r28, 8(r21) # Store prior dst QW
  470. addq r21, 24, r21 # Update dst pointer
  471. extqh r27, r18, r28 # Get first part of a src QW
  472. ldq_u r19, 32(r18) # Load next src QW
  473. extql r27, r18, r27 # Get last part of prior one
  474. or r28, r16, r28 # Combine pieces
  475. stq r28, -8(r21) # Store prior dst QW
  476. addq r18, 24, r18 # Update src pointer
  477. extqh r19, r18, r28 # Get first part of a src QW
  478. bge r20, unrol2 # Repeat as needed
  479. addq r20, 16, r16 # How many whole quadwords left?
  480. br r31, short2 # Go handle leftovers
  481. nop
  482. # Must move in reverse order because of overlap.
  483. # r16 = dst address
  484. # r17 = remaining length for return
  485. # r18 = src address
  486. # r19
  487. # r20 = len-4 (>= 0)
  488. # r21
  489. # r27
  490. # r28
  491. # Not yet LW-granularity...
  492. reverse:
  493. subq r20, 4, r20 # This code expects len-8
  494. addq r20, r18, r18 # Point to end-8 of source
  495. addq r20, r16, r19 # Point to end-8 of destination
  496. and r19, 7, r21 # Is destination aligned?
  497. ldq_u r28, 7(r18) # Get source QW
  498. addq r19, 8, r16 # Point to end of dst for return
  499. bne r21, rpart # Skip if partial write needed
  500. and r18, 7, r27 # Is source aligned too?
  501. beq r27, ralign # Skip if so
  502. ldq_u r21, (r18) # Handle aligned dst, unaligned src
  503. subq r20, 8, r20
  504. extqh r28, r18, r28
  505. extql r21, r18, r27
  506. br r31, rwhole
  507. rmis: ldq_u r21, (r18) # Load same or preceding src QW
  508. extqh r28, r18, r28 # Get last part of source to store
  509. mskqh r27, r16, r27 # Keep high-address part of dst
  510. extql r21, r18, r21
  511. subq r20, 8, r20 # How many more whole QW's?
  512. or r21, r28, r28
  513. ldq_u r21, (r18) # Reload source QW
  514. mskql r28, r16, r28 # Trim source to length
  515. rwhole: blt r20, rlast2 # Skip if no more whole QW's
  516. rloop2: or r28, r27, r28 # Combine pieces
  517. stq r28, (r19)
  518. rent2: extqh r21, r18, r27
  519. ldq_u r21, -8(r18)
  520. subq r20, 8, r20
  521. subq r19, 8, r19
  522. extql r21, r18, r28
  523. subq r18, 8, r18
  524. bge r20, rloop2
  525. rlast2: and r20, 7, r20
  526. beq r20, rdone2
  527. or r28, r27, r28
  528. subq r18, r20, r27
  529. stq r28, (r19)
  530. rl2ent: subq r31, r20, r20
  531. ldq_u r27, (r27)
  532. extqh r21, r18, r21
  533. ldq r28, -8(r19)
  534. subq r19, 8, r19
  535. extql r27, r18, r27
  536. mskql r28, r20, r28
  537. or r27, r21, r27
  538. mskqh r27, r20, r27
  539. and r20, 4, r21 # Ending in high LW?
  540. bne r21, rdone3 # Only longword store at the end
  541. rdone2: or r28, r27, r28
  542. stq r28, (r19)
  543. ret r31, (r26)
  544. rdone3: or r28, r27, r28
  545. extql r28, 4, r28
  546. stl r28, 4(r19)
  547. ret r31, (r26)
  548. rpart: ldq_u r27, 7(r19) # Get dst QW
  549. subq r21, 8, r21 # Get negative of bytes not moved
  550. subq r18, r21, r18 # From src-8, get src after partial
  551. subq r20, r21, r20 # Adjust length for partial move
  552. subq r19, r21, r19 # Adjust dst pointer
  553. addq r21, 4, r21 # End alignment - 4
  554. ble r21, r_lw # Only storing the low longword?
  555. and r18, 7, r21 # Src alignment now matching dst?
  556. bne r21, rmis # Go back if not
  557. mskql r28, r16, r28 # Keep low addresses of src QW
  558. mskqh r27, r16, r27 # Keep high address of dst QW
  559. ralign: subq r20, 8, r20 # How many more whole QW's?
  560. or r27, r28, r28 # Combine
  561. blt r20, rlast1 # Skip if this is the end
  562. rloop1: stq r28, (r19) # Store one QW
  563. rent1: subq r20, 8, r20 # Decrement length
  564. ldq r28, -8(r18) # Load preceding QW
  565. subq r19, 8, r19 # Decrement dst pointer
  566. subq r18, 8, r18 # Decrement src pointer
  567. bge r20, rloop1 # Repeat for each whole QW
  568. rlast1: and r20, 7, r20 # How many odd bytes?
  569. beq r20, rdone # Skip if none
  570. ldq r27, -8(r18) # Get another source QW
  571. subq r31, r20, r20 # Get byte # to end at
  572. stq r28, (r19)
  573. rl_ent: ldq r28, -8(r19)
  574. subq r19, 8, r19 # Adjust dst pointer again
  575. mskqh r27, r20, r27 # Keep top of src QW
  576. and r20, 4, r21 # Ending in high LW?
  577. mskql r28, r20, r28 # Keep bottom of dst QW
  578. bne r21, rdone4 # Only longword store at the end
  579. or r27, r28, r28 # Combine
  580. rdone: stq r28, (r19) # Store last QW
  581. ret r31, (r26)
  582. rdone4: or r27, r28, r28 # Combine
  583. extql r28, 4, r28 # Get high part
  584. stl r28, 4(r19) # Store last LW
  585. ret r31, (r26)
  586. r_lw: and r18, 7, r21 # Src alignment now matching dst?
  587. bne r21, rmislw # Go back if not
  588. mskql r28, r16, r28 # Keep low addresses of src QW
  589. mskqh r27, r16, r27 # Keep high address of dst QW
  590. subq r20, 8, r20 # How many more whole QW's?
  591. or r27, r28, r28 # Combine
  592. blt r20, rlast1_lw # Skip if this is the end
  593. stl r28, (r19) # Store one QW
  594. br r31, rent1
  595. rlast1_lw:
  596. and r20, 7, r20 # How many odd bytes?
  597. ldq r27, -8(r18) # Get another source QW
  598. subq r31, r20, r20 # Get byte # to end at
  599. stl r28, (r19)
  600. br rl_ent
  601. rmislw: ldq_u r21, (r18) # Load same or preceding src QW
  602. extqh r28, r18, r28 # Get last part of source to store
  603. mskqh r27, r16, r27 # Keep high-address part of dst
  604. extql r21, r18, r21
  605. subq r20, 8, r20 # How many more whole QW's?
  606. or r21, r28, r28
  607. ldq_u r21, (r18) # Reload source QW
  608. mskql r28, r16, r28 # Trim source to length
  609. blt r20, rlast2_lw # Skip if no more whole QW's
  610. or r28, r27, r28 # Combine pieces
  611. stl r28, (r19)
  612. br r31, rent2
  613. rlast2_lw:
  614. and r20, 7, r20
  615. or r28, r27, r28
  616. subq r18, r20, r27
  617. stl r28, (r19)
  618. br r31, rl2ent
  619. .set at
  620. .set reorder
  621. .end _OtsMove