# mirror of https://github.com/lianthony/NT4.0
#++
# Copyright 1991, 1994, Digital Equipment Corporation
#
# ots_move(char *dstptr INOUT, long dstlen, char *srcptr)
#
# Move dstlen characters from *srcptr to *dstptr, possibly overlapping
#
# Special conventions: No stack space, r16-r20 and r27-r28 ONLY,
# no linkage pointer required, r16 is INOUT and points to the address
# following the move. (Warning: The auto-loader potentially takes
# some regs across the call if this is being used in a shared lib
# environment.)
#
# This is a GEM support routine for moving (possibly overlapping) memory
# from one address to another. This is optimized for extremely high
# performance both for small blocks and large moves. In order to reduce
# overhead for small cases, they are retired as quickly as possible,
# more case analysis is reserved for cases which will do more. Note
# that while overlapping moves are supported (unlike Sys V memcpy
# routines), they are not quite as fast.
#
# Warning - This code is basically "expanded microcode". Since it is
# executed so frequently in many contexts, it has been extensively "hand-
# optimized"...
#
# Note that this routine and ots_movem are basically similar in many
# respects (same basic code), so maintenance should be done both
# places. This routine is primarily provided for lower overhead (for
# short strings).
#
# This version of OTS_MOVE provides longword granularity.
#
# 015  30 Aug 1994  WBN  Longword granularity version, based on
#                        OTS_MOVE_ALPHA.M64 version 014.
#--
#include "ots_defs.hs"

# Register usage (special GEM linkage: no stack frame, return via r26):
# r16 = dst  --> r16 = end (returns address just past the last byte stored)
# r17 = len
# r18 = src
# destroys r17-r20, r27-r28

	.globl	_OtsMove
	.ent	_OtsMove
_OtsMove:
	.set	noat
	.set	noreorder
	.frame	sp,0,r26
	.prologue 0
	beq	r17, done		# No memory accesses if length=0
	subq	r17, 4, r20		# Get length-4
	ldq_u	r28, (r18)		# Load first QW of source
	addq	r17, r18, r27		# Point to end of source
	andnot	r16, 3, r19		# LW-aligned dst pointer
	bge	r20, geq4		# Go handle lengths >= 4

	# Length 1..3: read-modify-write within one (or two) dst longwords.
	ldq_u	r27, -1(r27)		# Load last QW of source
	and	r16, 3, r16		# Get dst alignment within LW
	ldl	r17, (r19)		# Load first LW of destination
	addq	r20, r16, r20		# Get alignment+length-4
	extql	r28, r18, r28		# Extract first bytes of source
	bgt	r20, double		# Go handle LW crossing
	extqh	r27, r18, r27		# Extract last bytes of source
	addq	r20, 4, r20		# Get ending alignment in LW
	or	r27, r28, r28		# Combine halves of source
	insql	r28, r16, r28		# Position low part of source
	mskql	r17, r16, r18		# Keep low bytes of destination
	mskql	r28, r20, r28		# Trim off high bytes of source
	mskqh	r17, r20, r17		# Keep high bytes of destination
	or	r18, r28, r28		# Combine source with low dest
	or	r17, r28, r28		# Combine with high dest
	stl	r28, (r19)		# Store to destination
	addq	r19, r20, r16		# Point to end of dest for return
	ret	r31, (r26)		# Return

# Short move that straddles a LW boundary: merge into two dst longwords.
double:	extqh	r27, r18, r27		# Extract last bytes of source
	ldl	r18, 4(r19)		# Load second LW of destination
	mskql	r17, r16, r17		# Keep low bytes of 1st dest LW
	or	r27, r28, r28		# Combine parts of source
	insql	r28, r16, r27		# Position start of source
	addq	r16, 4, r16		# Compute virtual start in LW
	insqh	r28, r16, r28		# Position end of source
	addq	r19, 4, r19		# Prepare to compute end address
	mskqh	r18, r20, r18		# Keep high bytes of 2nd dest LW
	mskql	r28, r20, r28		# Trim end of source to length
	or	r27, r17, r17		# Combine low source with 1st LW
	stl	r17, -4(r19)		# Store first LW of dst
	or	r28, r18, r18		# Combine high source with 2nd LW
	stl	r18, (r19)		# Store second LW of dst
	addq	r19, r20, r16		# Point to end of dest for return
done:	ret	r31, (r26)		# Return

# Come here to move >= 4 bytes.
#
# r16-> dst
# r17 = length
# r18-> src
# r19-> LW-aligned dst
# r20 = len-4
# r27 = src+len
# r28 = first src QW

geq4:	subq	r20, 4, r17		# At least 8 bytes to move?
	subq	r16, r27, r27		# Check if dst >= src+len
	blt	r17, lss8		# Move 4..7 bytes
	subq	r18, r16, r17		# Check if src >= dst
	bge	r27, ok1		# Forward OK if whole src precedes dst
	blt	r17, reverse		# Go backwards if src < dst < src+len
ok1:	and	r16, 7, r16		# Dst alignment within QW
	addq	r16, r20, r27		# Alignment + length - 4
	bne	r16, part		# Part of first QW to be skipped
	subq	r20, 4, r20		# At least 8 bytes to be stored?
	beq	r27, simple		# Only low LW to be stored
	and	r18, 7, r27		# Is src address now aligned?
	blt	r20, shortq		# Dst ends in first QW
	subq	r20, 32, r17		# At least 4 quadwords left to move?
	beq	r27, align		# Go handle matching alignment

# Src alignment differs from dst alignment.
# r16 = dst alignment
# r17
# r18 = src-8 after 1st move
# r19 = initial dst
# r20 = initial length-8
# r27 = dst QW if dst wasn't aligned
# r28 = source QW

misal:	or	r16, r19, r19		# Put alignment back with dst ptr ***
	ldq_u	r17, 8(r18)		# Load same or next source QW
	extql	r28, r18, r28		# Get first part of source to store
	addq	r20, r16, r20		# Adjust length for partial move
	mskql	r27, r19, r27		# Trim destination for merge
	extqh	r17, r18, r16		# Get second part of source
	subq	r20, 24, r20		# At least 4 more quadwords?
	or	r28, r16, r28		# Combine pieces of source
	mskqh	r28, r19, r28		# Trim low junk off source
	andnot	r19, 7, r19		# Adjust dst for partial move
	bge	r20, unrol2		# Taken branch for long strings
	addq	r20, 16, r16		# Add back: how many whole QW's?
	nop				# Scheduling pad
short2:	and	r20, 7, r20		# How many odd bytes?
	blt	r16, last		# Skip if no more whole QW's
	or	r28, r27, r28		# Combine pieces
	stq	r28, (r19)		# Store whole dst QW
	extql	r17, r18, r27		# Get last part of prior src QW
	ldq_u	r17, 16(r18)		# Load another src QW
	addq	r19, 8, r19		# Update dst
	subq	r16, 8, r16		# More whole QW's?
	addq	r18, 8, r18		# Update src
	blt	r16, lastx		# Skip if no more whole QWs
	extqh	r17, r18, r28		# Get first part of this src QW
	addq	r18, 8, r18		# Update src again
	or	r28, r27, r28		# Combine pieces
	stq	r28, (r19)		# Store whole dst QW
	extql	r17, r18, r27		# Get last part of this src QW
	ldq_u	r17, 8(r18)		# Load another src QW
	addq	r19, 8, r19		# Update dst
lastx:	extqh	r17, r18, r28		# Get first part of this src QW
last:	addq	r18, r20, r16		# Point to end-8 of src
	beq	r20, done_u		# Skip if no odd bytes
	or	r28, r27, r28		# Combine parts of last whole QW
	ldq_u	r27, 7(r16)		# Load final (maybe same) src QW
	subq	r20, 4, r16		# More than 4 bytes left?
	stq	r28, (r19)		# Store last whole QW
	extql	r17, r18, r17		# Get last part of this src QW
	extqh	r27, r18, r27		# Get what we need from final src QW
joinx:	ldq	r28, 8(r19)		# Load last QW of destination
	or	r17, r27, r27		# Combine pieces of source
	mskql	r27, r20, r27		# Trim to length
	mskqh	r28, r20, r28		# Make room in destination
	bgt	r16, done_u		# Go store a whole QW
	addq	r20, 8, r20		# Increment length for return
	or	r28, r27, r28		# Insert src into dst
	stl	r28, 8(r19)		# Final LW
	addq	r19, r20, r16		# Point to end of dst for return
	ret	r31, (r26)		# Return

# Come here to move 4 thru 7 bytes.
#
lss8:	addq	r18, r17, r27		# Recover src+len-8
	and	r16, 3, r16		# Dst alignment within LW
	ldq_u	r27, 7(r27)		# Load last part of source
	extql	r28, r18, r28		# Extract first part of source
	beq	r16, lw			# Handle LW-aligned dst
	extqh	r27, r18, r27		# Extract last part of source
	ldl	r18, (r19)		# Load first LW of dst
	addq	r16, r20, r20		# align+len-4 of dst
	or	r28, r27, r28		# Complete source
	mskql	r28, r17, r28		# Trim source to length
	mskql	r18, r16, r18		# Make room in dst
	insql	r28, r16, r27		# Position src like dst
	addq	r16, r17, r17		# Align+len-8 of dst
	or	r27, r18, r18		# Merge
	stl	r18, (r19)		# Store first LW of dst
	extql	r27, 4, r27		# Position next LW of src
	blt	r17, zz			# Skip if not a whole LW
	stl	r27, 4(r19)		# Store the whole LW
	addq	r19, 4, r19		# Adjust pointer
	subq	r20, 4, r20		# Adjust ending alignment
	beq	r17, donezz		# Exit if done
	insqh	r28, r16, r27		# Position remainder of src
zz:	ldl	r28, 4(r19)		# Load last dst LW
	mskqh	r28, r20, r28		# Make room in dst
	or	r28, r27, r27		# Merge
	stl	r27, 4(r19)		# Final store
donezz:	addq	r19, r20, r16		# End address -4
	addq	r16, 4, r16		# End of dst for return
	ret	r31, (r26)		# Return

# 4..7 bytes to a LW-aligned destination.
lw:	extqh	r27, r18, r27		# Extract last part of source
	addq	r19, 4, r16		# Adjust for return value
	beq	r20, lwdone		# Skip if exactly 4 bytes
	ldl	r17, 4(r19)		# Load next dst LW
	or	r27, r28, r28		# Complete source
	stl	r28, (r19)		# Store first LW
	extql	r28, 4, r28		# Position rest of source
	mskqh	r17, r20, r27		# Make room in dst
	mskql	r28, r20, r28		# Trim src
	or	r27, r28, r28		# Merge
	stl	r28, 4(r19)		# Store second LW
	addq	r16, r20, r16		# Update return value
	ret	r31, (r26)		# Return

lwdone:	or	r27, r28, r28		# Merge
	stl	r28, (r19)		# Store the single LW
	ret	r31, (r26)		# Return

# Move 4 bytes to an aligned LW.
#
simple:	ldq_u	r27, 3(r18)		# Load last QW of source
	extql	r28, r18, r28		# Position first QW
	addq	r19, 4, r16		# Point to end of dst for return
	extqh	r27, r18, r27		# Position last QW
	or	r28, r27, r28		# Merge
	stl	r28, (r19)		# Store
	ret	r31, (r26)		# Return

# Dst is not aligned. Check whether first write is to a LW or a QW,
# and whether that finishes the move. Then see if src alignment
# matches, and read/rewrite the first dst quadword.
#
# r16 = dst alignment in QW
# r17
# r18-> src
# r19-> LW-aligned dst
# r20 = len-4
# r27 = QW_alignment + length - 4
# r28 = first src QW

#.align quad

part:	subq	r27, 4, r17		# Does dst end in first QW?
	ldq_u	r27, (r19)		# Load first dst QW
	blt	r17, shortu		# Go handle short store
	and	r16, 4, r17		# Does it start in high LW?
	subq	r18, r16, r18		# Adjust src for this partial move
	beq	r17, quad		# Whole QW to be touched
	extql	r28, r18, r17		# Position first part of source
	ldq_u	r28, 7(r18)		# Get next (or same) src QW
	mskql	r27, r16, r27		# Trim destination for merge
	addq	r20, r16, r20		# Len + alignment...
	extqh	r28, r18, r28		# Position second part of source
	subq	r20, 4, r20		# Len+alignment-8 = remaining len
	or	r28, r17, r28		# Pieces of source
	mskqh	r28, r16, r17		# Trim junk preceding source
	ldq_u	r28, 7(r18)		# Get src QW again **
	or	r27, r17, r17		# Combine other source piece
	extql	r17, 4, r17		# Get the high LW
	stl	r17, (r19)		# Store just that

# Now at a QW boundary. Is there a QW left to store?
# Is the source QW aligned?

	andnot	r19, 7, r19		# Adjust dst pointer to next-8
	subq	r20, 8, r17		# Got a QW more?
	and	r18, 7, r27		# Src aligned?
	blt	r17, short3		# Too short
	addq	r19, 8, r19		# Advance dst to next QW
	subq	r20, 8, r20		# Account for QW about to move
	ldq_u	r28, 8(r18)		# Load next src QW
	addq	r18, 8, r18		# Advance src
	subq	r20, 32, r17		# Prepare for unrolled loop
	beq	r27, align		# Alignment matches
	or	r31, r31, r27		# r27 = 0 (dst already QW-aligned)
	or	r31, r31, r16		# r16 = 0 (no alignment adjustment)
	br	r31, misal		# Join mismatched-alignment path

# Unaligned dst, whole move fits in one dst QW.
shortu:	addq	r18, r20, r20		# Point to end-4 of src
	ldq_u	r20, 3(r20)		# Get last QW of source
	extql	r28, r18, r28		# Fetch first QW of source
	extqh	r20, r18, r20		# Fetch last QW of source
	mskql	r27, r16, r18		# Clear from start thru end of dst
	mskqh	r27, r17, r27		# Clear from 0 to end of dst
	or	r28, r20, r28		# Combine src pieces
	insql	r28, r16, r28		# Position src
	or	r27, r18, r27		# Combine dst pieces
	mskql	r28, r17, r28		# Trim src
	addq	r19, r17, r20		# Final pointer for return
	or	r28, r27, r28		# Merge src & dst
	stq_u	r28, (r19)		# Store it
	addq	r20, 8, r16		# End of dst for return
	ret	r31, (r26)		# Return

quad:	and	r18, 7, r17		# Is src address now aligned?
	subq	r20, 4, r20		# Get length-8
	bne	r17, misal		# Go handle mismatched alignment
	mskqh	r28, r16, r28		# Keep desired part of source
	addq	r20, r16, r20		# Adjust count for this partial move
	mskql	r27, r16, r27		# Keep desired part of destination QW
	subq	r20, 32, r17		# At least 4 quadwords left to move?
	or	r27, r28, r28		# Merge source and destination

# Src alignment matches.
# r16
# r17 = remaining length -32
# r18 = next src pointer -8
# r19 = dst pointer
# r20
# r27
# r28 = dst quadword

align:	and	r17, 24, r20		# How many after a multiple of 4?
	bge	r17, unrol1		# Taken branch for long strings
	nop				# Scheduling pad
short1:	and	r17, 7, r17		# How many odd bytes?
	beq	r20, last28		# Skip if no more whole QWs after r28
	ldq	r27, 8(r18)		# Load next QW
	addq	r18, 8, r18		# Advance src
	stq	r28, (r19)		# Store prior QW
	subq	r20, 16, r20		# Map 8/16/24 to -8/0/8
	addq	r19, 8, r19		# Advance dst
	blt	r20, last27		# Skip if no more after r27
	ldq	r28, 8(r18)		# Load next QW
	addq	r18, 8, r18		# Advance src
	stq	r27, (r19)		# Store prior QW
	addq	r19, 8, r19		# Advance dst
	nop				# Scheduling pad
	beq	r20, last28		# Skip if nothing after r28
	ldq	r27, 8(r18)		# Load next QW
	addq	r18, 8, r18		# Advance src
	stq	r28, (r19)		# Store prior QW
	addq	r19, 8, r19		# Advance dst
last27:	beq	r17, done27		# Skip if no odd bytes
	ldq	r28, 8(r18)		# Load one more src QW
	ldq	r18, 8(r19)		# Load last destination QW
	subq	r17, 4, r16		# More than 4 bytes to store?
	stq	r27, (r19)		# Store prior QW
	mskql	r28, r17, r27		# Trim source
	mskqh	r18, r17, r18		# Trim destination
	ble	r16, lastl		# Go store just a LW
lastq:	addq	r19, r17, r19		# End-8 of dst for return
	or	r27, r18, r27		# Merge src & dst
done27:	stq_u	r27, 7(r19)		# Store last destination QW
	addq	r19, 8, r16		# End of dst for return
	ret	r31, (r26)		# Return

short3:	addq	r18, r20, r16		# Point to end-8 of src
	beq	r20, donexx		# Completely done?
	ldq_u	r17, 7(r16)		# Load final src QW
	subq	r20, 4, r16		# Got more than a LW?
	beq	r27, joinx		# Don't include prior src if aligned
	extql	r28, r18, r27		# Last part of prior src QW
	extqh	r17, r18, r17		# First part of this src QW
	br	joinx			# Join common tail

donexx:	addq	r19, r20, r16		# End-8 of dst
	addq	r16, 8, r16		# End of dst for return
	ret	r31, (r26)		# Return

last28:	beq	r17, done28		# Skip if no odd bytes
	ldq	r27, 8(r18)		# Load one more src QW
	ldq	r18, 8(r19)		# Load last destination QW
	subq	r17, 4, r16		# More than 4 bytes to store?
	stq	r28, (r19)		# Store prior QW
	mskql	r27, r17, r27		# Trim source
	mskqh	r18, r17, r18		# Trim destination
	bgt	r16, lastq		# Go store a QW
lastl:	addq	r17, 8, r17		# Increment length for return
	or	r27, r18, r27		# Merge src & dst
	stl	r27, 8(r19)		# Store last destination LW
	addq	r19, r17, r16		# End of dst for return
	ret	r31, (r26)		# Return

shortq:	addq	r18, r20, r16		# Point to end-8 of src
	ldq	r27, (r19)		# Get dst QW
	extql	r28, r18, r28		# Position first src QW
	ldq_u	r17, 7(r16)		# Get last QW of src
	mskqh	r27, r20, r27		# Mask dst QW
	extqh	r17, r18, r17		# Position last src QW
	or	r17, r28, r28		# Merge
	mskql	r28, r20, r28		# Trim src QW
done_u:	addq	r19, r20, r19		# End-8 of dst for return
	or	r28, r27, r28		# Combine pieces
done28:	stq_u	r28, 7(r19)		# Store last destination QW
	addq	r19, 8, r16		# End of dst for return
	ret	r31, (r26)		# Return

# Unrolled loop for long moves with matching alignment within QW.
# Each iteration moves two cache blocks.
# We try to schedule the cache misses to avoid a double miss
# in EV4 pass 2.1 chips. If the source alignment within a cache
# block is exactly 3, alter it, since that case runs slower.
#
# R19 = dst pointer
# R17 = remaining length (to load) - 32
# R18 = src pointer
# R16
# R20 = length & 24 (needed at return)
# R27
# R28 = QW from 0(R18) to store at 0(R19), both on input and at return
#

#.align quad

unrol1:	ldq	r27, 32(r18)		# Cache miss here; later loads hit.
	subq	r17, 48, r16		# Six more quadwords?
	and	r18, 16, r20		# Starting in 2nd half of cache block?
	blt	r16, uent1		# If not 6 more, don't adjust.
	ldq	r16, 8(r18)		# Load second src QW
	beq	r20, utop1		# If in 1st half, don't adjust.
	ldq	r27, 48(r18)		# Cache miss here
	addq	r18, 16, r18		# Advance src by half a block
	stq	r28, (r19)		# Adjust by going ahead 1/2 block.
	addq	r19, 16, r19		# Advance dst by half a block
	ldq	r28, (r18)		# Load next src QW
	subq	r17, 16, r17		# Account for the half block moved
	stq	r16, -8(r19)		# Store second QW
	nop				# Scheduling pad
	ldq	r16, 8(r18)		# Load next src QW
utop1:	subq	r17, 32, r17		# Decrement remaining length

uloop1:	ldq	r20, 64(r18)		# Cache miss here
	stq	r28, (r19)		# Store pending QW
	ldq	r28, 16(r18)		# Load ahead
	stq	r16, 8(r19)		# Store pending QW
	ldq	r16, 24(r18)		# Load ahead
	addq	r18, 64, r18		# Advance src one full iteration
	stq	r28, 16(r19)		# Store pending QW
	mov	r20, r28		# Rotate prefetched QW into place
	stq	r16, 24(r19)		# Store pending QW
	addq	r19, 64, r19		# Advance dst one full iteration
	ldq	r20, -24(r18)		# Load ahead
	subq	r17, 32, r17		# Half iteration consumed
	blt	r17, uexit1		# Exit mid-loop if short
	ldq	r16, 32(r18)		# Cache miss here
	stq	r27, -32(r19)		# Store pending QW
	ldq	r27, -16(r18)		# Load ahead
	stq	r20, -24(r19)		# Store pending QW
	ldq	r20, -8(r18)		# Load ahead
	stq	r27, -16(r19)		# Store pending QW
	mov	r16, r27		# Rotate prefetched QW into place
	stq	r20, -8(r19)		# Store pending QW
uent1:	subq	r17, 32, r17		# Decrement remaining length
	ldq	r16, 8(r18)		# Load next src QW
	bge	r17, uloop1		# Repeat while >= 4 QWs remain

# finish last block of 4 quadwords
#
ubot1:	stq	r28, (r19)		# Store pending QW
	mov	r27, r28		# Position last QW for return
	ldq	r27, 16(r18)		# Load remaining QW
	addq	r18, 32, r18		# Advance src past block
	stq	r16, 8(r19)		# Store pending QW
	addq	r19, 32, r19		# Advance dst past block
uex1a:	ldq	r16, -8(r18)		# Load final QW of block
	and	r17, 24, r20		# Recover count of remaining QW's
	stq	r27, -16(r19)		# Store pending QW
	stq	r16, -8(r19)		# Store final QW of block
	br	r31, short1		# Handle the leftovers

	nop				# Scheduling pad
uexit1:	stq	r27, -32(r19)		# Here if exit from middle of loop
	ldq	r27, -16(r18)		# Load remaining QW
	stq	r20, -24(r19)		# Store pending QW
	br	r31, uex1a		# Join common exit sequence

#.align quad

# Unrolled loop for long moves with mismatched alignment:
# three QWs stored per iteration, each merged from two src loads.
unrol2:	ldq_u	r16, 16(r18)		# Load next src QW
	extql	r17, r18, r17		# Get last part of prior one
	or	r28, r27, r28		# Combine pieces
	stq	r28, (r19)		# Store prior dst QW
	subq	r20, 24, r20		# Update loop counter
	extqh	r16, r18, r28		# Get first part of a src QW
	ldq_u	r27, 24(r18)		# Load next src QW
	extql	r16, r18, r16		# Get last part of prior one
	or	r28, r17, r28		# Combine pieces
	stq	r28, 8(r19)		# Store prior dst QW
	addq	r19, 24, r19		# Update dst pointer
	extqh	r27, r18, r28		# Get first part of a src QW
	ldq_u	r17, 32(r18)		# Load next src QW
	extql	r27, r18, r27		# Get last part of prior one
	or	r28, r16, r28		# Combine pieces
	stq	r28, -8(r19)		# Store prior dst QW
	addq	r18, 24, r18		# Update src pointer
	extqh	r17, r18, r28		# Get first part of a src QW
	bge	r20, unrol2		# Repeat as needed
	addq	r20, 16, r16		# How many whole quadwords left?
	br	r31, short2		# Go handle leftovers
	nop				# Scheduling pad

# Must move in reverse order because of overlap.
# r16 = dst address
# r17
# r18 = src address
# r19
# r20 = len-4 (>= 0)
# r27
# r28

# Not yet LW-granularity...

reverse:
	subq	r20, 4, r20		# This code expects len-8
	addq	r20, r18, r18		# Point to end-8 of source
	addq	r20, r16, r17		# Point to end-8 of destination
	and	r17, 7, r19		# Is destination aligned?
	ldq_u	r28, 7(r18)		# Get source QW
	addq	r17, 8, r16		# Point to end of dst for return
	bne	r19, rpart		# Skip if partial write needed
	and	r18, 7, r27		# Is source aligned too?
	beq	r27, ralign		# Skip if so
	ldq_u	r19, (r18)		# Handle aligned dst, unaligned src
	subq	r20, 8, r20		# Decrement remaining length
	extqh	r28, r18, r28		# Extract high piece of src
	extql	r19, r18, r27		# Extract low piece of src
	br	r31, rwhole		# Join whole-QW loop

rmis:	ldq_u	r19, (r18)		# Load same or preceding src QW
	extqh	r28, r18, r28		# Get last part of source to store
	mskqh	r27, r16, r27		# Keep high-address part of dst
	extql	r19, r18, r19		# Extract low piece of src
	subq	r20, 8, r20		# How many more whole QW's?
	or	r19, r28, r28		# Combine src pieces
	ldq_u	r19, (r18)		# Reload source QW
	mskql	r28, r16, r28		# Trim source to length
rwhole:	blt	r20, rlast2		# Skip if no more whole QW's
rloop2:	or	r28, r27, r28		# Combine pieces
	stq	r28, (r17)		# Store dst QW
rent2:	extqh	r19, r18, r27		# Extract high piece of src
	ldq_u	r19, -8(r18)		# Load preceding src QW
	subq	r20, 8, r20		# Decrement length
	subq	r17, 8, r17		# Decrement dst pointer
	extql	r19, r18, r28		# Extract low piece of src
	subq	r18, 8, r18		# Decrement src pointer
	bge	r20, rloop2		# Repeat for each whole QW
rlast2:	and	r20, 7, r20		# How many odd bytes?
	beq	r20, rdone2		# Skip if none
	or	r28, r27, r28		# Combine pieces
	subq	r18, r20, r27		# Back up src by odd-byte count
	stq	r28, (r17)		# Store whole QW
rl2ent:	subq	r31, r20, r20		# Negate count: byte # to end at
	ldq_u	r27, (r27)		# Load final src QW
	extqh	r19, r18, r19		# Extract high piece of src
	ldq	r28, -8(r17)		# Load final dst QW
	subq	r17, 8, r17		# Decrement dst pointer
	extql	r27, r18, r27		# Extract low piece of src
	mskql	r28, r20, r28		# Keep bottom of dst QW
	or	r27, r19, r27		# Combine src pieces
	mskqh	r27, r20, r27		# Keep top of src QW
	and	r20, 4, r19		# Ending in high LW?
	bne	r19, rdone3		# Only longword store at the end
rdone2:	or	r28, r27, r28		# Merge src & dst
	stq	r28, (r17)		# Store last QW
	ret	r31, (r26)		# Return

rdone3:	or	r28, r27, r28		# Merge src & dst
	extql	r28, 4, r28		# Get high LW
	stl	r28, 4(r17)		# Store last LW
	ret	r31, (r26)		# Return

rpart:	ldq_u	r27, 7(r17)		# Get dst QW
	subq	r19, 8, r19		# Get negative of bytes not moved
	subq	r18, r19, r18		# From src-8, get src after partial
	subq	r20, r19, r20		# Adjust length for partial move
	subq	r17, r19, r17		# Adjust dst pointer
	addq	r19, 4, r19		# End alignment - 4
	ble	r19, r_lw		# Only storing the low longword?
	and	r18, 7, r19		# Src alignment now matching dst?
	bne	r19, rmis		# Go back if not
	mskql	r28, r16, r28		# Keep low addresses of src QW
	mskqh	r27, r16, r27		# Keep high address of dst QW
ralign:	subq	r20, 8, r20		# How many more whole QW's?
	or	r27, r28, r28		# Combine
	blt	r20, rlast1		# Skip if this is the end
rloop1:	stq	r28, (r17)		# Store one QW
rent1:	subq	r20, 8, r20		# Decrement length
	ldq	r28, -8(r18)		# Load preceding QW
	subq	r17, 8, r17		# Decrement dst pointer
	subq	r18, 8, r18		# Decrement src pointer
	bge	r20, rloop1		# Repeat for each whole QW
rlast1:	and	r20, 7, r20		# How many odd bytes?
	beq	r20, rdone		# Skip if none
	ldq	r27, -8(r18)		# Get another source QW
	subq	r31, r20, r20		# Get byte # to end at
	stq	r28, (r17)		# Store pending QW
rl_ent:	ldq	r28, -8(r17)		# Load final dst QW
	subq	r17, 8, r17		# Adjust dst pointer again
	mskqh	r27, r20, r27		# Keep top of src QW
	and	r20, 4, r19		# Ending in high LW?
	mskql	r28, r20, r28		# Keep bottom of dst QW
	bne	r19, rdone4		# Only longword store at the end
	or	r27, r28, r28		# Combine
rdone:	stq	r28, (r17)		# Store last QW
	ret	r31, (r26)		# Return

rdone4:	or	r27, r28, r28		# Combine
	extql	r28, 4, r28		# Get high part
	stl	r28, 4(r17)		# Store last LW
	ret	r31, (r26)		# Return

# Reverse move ending in a single longword store.
r_lw:	and	r18, 7, r19		# Src alignment now matching dst?
	bne	r19, rmislw		# Go back if not
	mskql	r28, r16, r28		# Keep low addresses of src QW
	mskqh	r27, r16, r27		# Keep high address of dst QW
	subq	r20, 8, r20		# How many more whole QW's?
	or	r27, r28, r28		# Combine
	blt	r20, rlast1_lw		# Skip if this is the end
	stl	r28, (r17)		# Store the LW
	br	r31, rent1		# Join whole-QW loop

rlast1_lw:
	and	r20, 7, r20		# How many odd bytes?
	ldq	r27, -8(r18)		# Get another source QW
	subq	r31, r20, r20		# Get byte # to end at
	stl	r28, (r17)		# Store the LW
	br	rl_ent			# Join common tail

rmislw:	ldq_u	r19, (r18)		# Load same or preceding src QW
	extqh	r28, r18, r28		# Get last part of source to store
	mskqh	r27, r16, r27		# Keep high-address part of dst
	extql	r19, r18, r19		# Extract low piece of src
	subq	r20, 8, r20		# How many more whole QW's?
	or	r19, r28, r28		# Combine src pieces
	ldq_u	r19, (r18)		# Reload source QW
	mskql	r28, r16, r28		# Trim source to length
	blt	r20, rlast2_lw		# Skip if no more whole QW's
	or	r28, r27, r28		# Combine pieces
	stl	r28, (r17)		# Store the LW
	br	r31, rent2		# Join whole-QW loop

rlast2_lw:
	and	r20, 7, r20		# How many odd bytes?
	or	r28, r27, r28		# Combine pieces
	subq	r18, r20, r27		# Back up src by odd-byte count
	stl	r28, (r17)		# Store the LW
	br	r31, rl2ent		# Join common tail

	.set	at
	.set	reorder
	.end	_OtsMove