mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
7.3 KiB
161 lines
7.3 KiB
#++
|
|
# Copyright 1991, 1994, Digital Equipment Corporation
|
|
#
|
|
# ots_fill(char *dstptr, long dstlen, unsigned char fill)
|
|
#
|
|
# Fill dstlen bytes of memory at *dstptr with "fill"
|
|
#
|
|
# Special conventions: No stack space, r16-r19 and r27-r28 ONLY,
|
|
# no linkage pointer required.
|
|
# (Warning: The auto-loader potentially takes some regs across
|
|
# the call if this is being used in a shared lib. environment.)
|
|
#
|
|
# This is a GEM support routine for filling memory with a specified value,
|
|
# basically identical to the System V routine memset, with the 2nd two
|
|
# parameters reversed. This is optimized for extremely high performance
|
|
# both for small blocks (string padding) and large blocks (memory fill).
|
|
# To reduce overhead for small cases, they are retired as quickly as
|
|
# possible; more case analysis is reserved for cases which will do
|
|
# more work.
|
|
#
|
|
# This version of OTS_FILL provides longword granularity for Alpha.
|
|
#
|
|
# 011 30 Aug 1994 WBN Longword granularity version based on
|
|
# OTS_FILL_ALPHA.M64 edit 010.
|
|
#--
|
|
|
|
#include "ots_defs.hs"
|
|
|
|
# r16 = dst
|
|
# r17 = len
|
|
# r18 = fill byte
|
|
# destroys r16-r19, r27-r28
|
|
|
|
# _OtsFill - store one byte value into a run of memory.
#
# In:   r16 = destination address (any byte alignment)
#       r17 = byte count; zero returns at once without touching memory
#       r18 = fill value in the low byte
#             NOTE(review): the replication below ORs shifted copies of r18
#             together without masking, so bits above <7:0> are presumably
#             expected to be clear on entry - confirm against callers.
# Out:  nothing is returned in a register by this code.
# Uses: r16-r19, r27, r28 as scratch; no stack (frame size 0); return via r26.
#
# Structure:
#   len < 4   read-modify-write of one longword, or two ("double") when the
#             run crosses a longword boundary.
#   len >= 4  "simple": exactly one aligned longword;
#             "shortq": run ends inside the first quadword;
#             "quad"/"longs": merge the leading partial QW/LW, then "join"
#             stores whole quadwords (4 per pass in "unroll", 0-3 more in
#             "short") and "last" merges the trailing 1-7 bytes.
# All partial stores are LW or QW read-modify-writes; a trailing piece of
# 4 bytes or fewer is written with stl so the other longword is not stored.
# NOTE(review): only len == 0 is screened out; the later tests on the length
# are signed, so a negative count takes the len < 4 path.
.globl _OtsFill
|
|
.ent _OtsFill
|
|
_OtsFill:
|
|
# r28 is used explicitly as a scratch register below, so the assembler
# must not use it as its own temporary.
.set noat
|
|
# Instruction order is hand-scheduled (see the #stall notes); keep it.
.set noreorder
|
|
# No stack frame; return address is in r26.
.frame sp,0,r26
|
|
.prologue 0
|
|
|
|
# Entry: replicate the fill byte across a longword while computing the
# destination alignment; the two independent chains are interleaved.
sll r18, 8, r19 # Start propagating byte to quadword
|
|
beq r17, done # No memory refs if len=0
|
|
subq r17, 4, r28 # Length-4
|
|
or r19, r18, r18 # Fill in bytes 0-1
|
|
sll r18, 16, r19 # Bytes 0-1 shifted up to bytes 2-3
|
|
and r16, 3, r27 # Dst alignment (0-3)
|
|
or r19, r18, r18 # Fill in bytes 0-3
|
|
andnot r16, 3, r16 # LW aligned dst pointer
|
|
addq r27, r28, r17 # Alignment + length - 4
|
|
bge r28, geq4 # Lengths >= 4 may not need load
|
|
# Length is 1-3 here: always a read-modify-write of the first longword.
ldl r28, (r16) # Load first LW of dst
|
|
bgt r17, double # Skip if it crosses to next LW
|
|
# Run lies inside one longword.  Merge without building an explicit mask:
# with X = dest ^ fill,
#   r27 = (X below startpoint) ^ fill -> dest below start, fill from start up
#   r28 =  X from endpoint up         -> cancels the fill at/after endpoint
# so r27 ^ r28 is dest outside [start,end) and fill inside it.
addq r17, 4, r17 # Find endpoint within LW
|
|
xor r28, r18, r28 # Pre-flip all fill bits in dest
|
|
mskql r28, r27, r27 # Clear from startpoint thru 7
|
|
mskqh r28, r17, r28 # Clear from 0 to endpoint
|
|
xor r27, r18, r27 # Combine fill with masked dest
|
|
xor r28, r27, r27 # Result is fill in center part only
|
|
stl r27, (r16) # Write back the merged longword
|
|
ret r31, (r26)
|
|
|
|
# Length 1-3 crossing a longword boundary.
#   r17 = alignment + length - 4 = bytes that spill into the second LW
#   r27 = start offset in the first LW, r28 = first LW of dest
double: mskqh r18, r27, r19 # Discard fill preceding startpoint
|
|
mskql r28, r27, r28 # Clear from startpoint in first LW
|
|
ldl r27, 4(r16) # Load second LW of dst
|
|
mskql r18, r17, r18 # Discard fill following endpoint
|
|
or r28, r19, r28 # Insert fill in first LW
|
|
stl r28, (r16) # Write back first LW
|
|
mskqh r27, r17, r27 # Clear up to endpoint in second LW
|
|
or r27, r18, r27 # Insert fill in second LW
|
|
stl r27, 4(r16) # Write back second LW
|
|
ret r31, (r26)
|
|
|
|
# Come here if length to be filled is >= 4.
|
|
# r16-> dst aligned to LW
|
|
# r17 = alignment + length - 4
|
|
# r18 = fill in bytes 0-3
|
|
# r27 = dst alignment within LW
|
|
# r28 = length-4
|
|
|
|
#.align quad
|
|
|
|
# r17 == 0 can only mean alignment 0 and length 4: one aligned longword.
# Otherwise the first store is a QW (dst LW is the low half of its QW)
# or a LW (dst LW is the high half), chosen by bit 2 of the address.
geq4: and r16, 4, r28 # Which LW in QW to store first?
|
|
beq r17, simple # Go handle single aligned LW
|
|
sll r18, 32, r19 # Fill pattern for bytes 4-7
|
|
bne r28, longs # Dst in upper LW of QW: store a LW first
|
|
# Dst longword is the low half of a quadword, so r16 is QW aligned.
quad: subq r17, 4, r17 # Does dest end in first QW?
|
|
or r18, r19, r18 # Fill in bytes 0-7
|
|
blt r17, shortq # Ends within first QW
|
|
# Run reaches the end of the first QW; only the leading r27 bytes of
# dest (if any) have to be preserved.
mskqh r18, r27, r28 # Clear initial bytes of fill
|
|
beq r27, wh_qw # Store a whole QW
|
|
ldq r19, (r16) # Load first QW of dest
|
|
mskql r19, r27, r19 # Clear from startpoint
|
|
or r19, r28, r28 # Combine first QW with fill
|
|
wh_qw: stq r28, (r16) # Store first QW of dest
|
|
# r17 = alignment + length - 8 = bytes remaining after this QW.
br r31, join # Go fill rest of string
|
|
|
|
simple: stl r18, (r16) # Single aligned LW
|
|
ret r31, (r26)
|
|
|
|
# Run starts and ends inside the first quadword.  r17 is negative here
# (alignment + length - 8); the byte-mask instructions use only its low
# three bits, which equal the endpoint offset alignment + length.
# Same xor merge as the single-longword case, applied to a quadword.
shortq: ldq r28, (r16) # Load QW of dest
|
|
xor r28, r18, r28 # Pre-flip all fill bits in dest
|
|
mskql r28, r27, r27 # Clear from startpoint thru 7
|
|
mskqh r28, r17, r28 # Clear from 0 up to endpoint
|
|
xor r27, r18, r27 # Combine fill with masked dest
|
|
xor r28, r27, r27 # Result is fill in center part only
|
|
stq r27, (r16) # Store
|
|
ret r31, (r26)
|
|
|
|
# Dst longword is the high half of a quadword: finish that longword with
# a LW store, leaving r17 = alignment + length - 4 bytes for the
# quadwords that follow.  r18 still holds fill in bytes 0-3 only when
# the first mask is taken, then is widened to 8 bytes for the QW stores.
longs: mskqh r18, r27, r28 # Clear initial bytes of LW fill
|
|
or r18, r19, r18 # Fill in bytes 0-7
|
|
beq r27, wh_lw # Store a whole LW
|
|
ldl r19, (r16) # Load first LW of dest
|
|
mskql r19, r27, r19 # Clear from startpoint
|
|
or r19, r28, r28 # Combine first LW with fill
|
|
wh_lw: stl r28, (r16) # Store first LW of dest
|
|
# Common tail.  On entry:
#   r16 -> the LW/QW just stored; 8(r16) lies in the next quadword
#   r17 =  bytes still to fill, starting at that next quadword
#   r18 =  fill in bytes 0-7
# r17 is biased by -32 so its sign selects the unrolled loop; bits <4:3>
# (whole QWs mod 4) and <2:0> (odd bytes) are unaffected by the bias.
join: subq r17, 32, r17 # At least 4 more quadwords?
|
|
and r17, 24, r27 # How many after multiple of 4?
|
|
bge r17, unroll # Taken branch for long strings
|
|
# 0-3 whole quadwords left (r27 = 0/8/16/24), then 0-7 odd bytes.
# stq_u ignores the low three address bits, so 8(r16) hits the next
# quadword even when r16 is only longword aligned (the "longs" path).
short: and r17, 7, r17 # How many odd bytes?
|
|
beq r27, last # Skip if no more whole QWs
|
|
stq_u r18, 8(r16) # Fill one...
|
|
subq r27, 16, r27 # Map 8/16/24 to -8/0/8
|
|
addq r16, 8, r16 # Update dest pointer
|
|
blt r27, last # Skip if no more whole QWs
|
|
#stall
|
|
stq_u r18, 8(r16) # Fill two...
|
|
addq r16, 8, r16 # Update dest pointer
|
|
nop
|
|
beq r27, last # Skip if no more whole QWs
|
|
stq_u r18, 8(r16) # Fill three...
|
|
addq r16, 8, r16 # Update dest pointer
|
|
# Trailing 1-7 bytes (r17): merge fill into the low r17 bytes of the next
# quadword.  r16 is rounded down to a QW boundary first because stl/stq,
# unlike the _u forms, use the full address.
last: beq r17, done # Finished if no odd bytes
|
|
ldq_u r27, 8(r16) # Load last QW of dest
|
|
subq r17, 4, r28 # More than a LW left?
|
|
andnot r16, 7, r16 # Clean pointer for STL
|
|
mskql r18, r17, r18 # Discard unneeded fill bytes
|
|
#stall
|
|
mskqh r27, r17, r27 # Clear up to endpoint in last QW
|
|
#stall
|
|
or r27, r18, r27 # Combine fill with last QW
|
|
bgt r28, lastq # Go store a QW
|
|
# 4 bytes or fewer remain: store only the low longword.
stl r27, 8(r16) # LW store for last piece
|
|
done: ret r31, (r26)
|
|
|
|
lastq: stq r27, 8(r16) # QW store for last piece
|
|
ret r31, (r26)
|
|
|
|
|
|
# Bulk loop: 32 bytes per pass while the biased count in r17 stays >= 0.
# The count and pointer updates are interleaved with the stores.
unroll: stq_u r18, 8(r16) # Store 4 QWs per iteration
|
|
stq_u r18, 16(r16)
|
|
stq_u r18, 24(r16)
|
|
subq r17, 32, r17 # Decrement remaining count
|
|
stq_u r18, 32(r16)
|
|
addq r16, 32, r16 # Update dest pointer
|
|
bge r17, unroll # repeat until done
|
|
# r27 (whole QWs mod 4) was computed at "join" and is still valid.
br r31, short # Then handle leftovers
|
|
|
|
.set at
|
|
|
.set reorder
|
|
.end _OtsFill
|