// Source: mirror of https://github.com/lianthony/NT4.0 (mvmem.s — repository page header text removed)
// TITLE("Compare, Move, Zero, and Fill Memory Support")
|
|
//++
|
|
//
|
|
// Copyright (c) 1992 Digital Equipment Corporation
|
|
//
|
|
// Module Name:
|
|
//
|
|
// mvmem.s
|
|
//
|
|
// Abstract:
|
|
//
|
|
// This module implements functions to compare, move, zero, and fill
|
|
// blocks of memory. If the memory is aligned, then these functions
|
|
// are very efficient.
|
|
//
|
|
// N.B. These routines MUST preserve all floating state since they are
|
|
// frequently called from interrupt service routines that normally
|
|
// do not save or restore floating state.
|
|
//
|
|
// Author:
|
|
//
|
|
// Joe Notarangelo 21-May-1992
|
|
//
|
|
// Environment:
|
|
//
|
|
// User or Kernel mode.
|
|
//
|
|
// Revision History:
|
|
//
|
|
// Monty VanderBilt 14-Feb-1996 Avoid memory loads and branch takens between
|
|
// load lock and store conditional instructions
|
|
// to conform with all alpha architecture rules.
|
|
// Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support
|
|
// byte granularity access when necessary.
|
|
//--
|
|
|
|
#include "ksalpha.h"
|
|
|
|
SBTTL("Compare Memory")
|
|
//++
|
|
//
|
|
// ULONG
|
|
// RtlCompareMemory (
|
|
// IN PVOID Source1,
|
|
// IN PVOID Source2,
|
|
// IN ULONG Length
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function compares two blocks of memory and returns the number
|
|
// of bytes that compared equal.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Source1 (a0) - Supplies a pointer to the first block of memory to
|
|
// compare.
|
|
//
|
|
// Source2 (a1) - Supplies a pointer to the second block of memory to
|
|
// compare.
|
|
//
|
|
// Length (a2) - Supplies the length, in bytes, of the memory to be
|
|
// compared.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// The number of bytes that compared equal is returned as the function
|
|
// value. If all bytes compared equal, then the length of the original
|
|
// block of memory is returned.
|
|
//
|
|
//--
|
|
|
|
|
|
LEAF_ENTRY(RtlCompareMemory)

//
// Register usage throughout this routine:
//   a0 = Source1 pointer, a1 = Source2 pointer
//   a2 = bytes remaining to compare
//   v0 = original Length (returned as-is on full match; trimmed on miscompare)
//   t0-t5, a3-a5, $at = scratch (all integer-volatile; no floating state used,
//   per the module requirement that these routines preserve floating state)
//

        bis     a2, zero, v0            // save length of comparison (return value)
        beq     a2, 90f                 // (JAE) quit if nothing to compare
        xor     a0, a1, t0              // check for compatible alignment
        and     t0, 0x7, t0             // low bits only
        bne     t0, CompareUnaligned    // if ne, incompatible alignment

//
// Compare memory aligned - both pointers have the same offset within a
// quadword, so after a short byte prologue whole quadwords can be compared.
//

CompareAligned:                         //

//
// compare memory byte-by-byte until sources are quadword aligned
//

        and     a0, 0x7, t0             // get low bits
        bne     t0, 10f                 // if ne, sources not aligned yet
        br      zero, 30f               // already aligned, predicted taken

10:
        ldq_u   t1, 0(a0)               // get unaligned quad at source 1
        ldq_u   t2, 0(a1)               // get unaligned quad at source 2

20:
        extbl   t1, t0, t4              // byte at t0 in source 1 quad
        extbl   t2, t0, t5              // byte at t0 in source 2 quad
        xor     t4, t5, t3              // t4 == t5 ?
        bne     t3, 110f                // not equal, miscompare
        subq    a2, 1, a2               // decrement bytes to compare
        beq     a2, 90f                 // if eq, compare success
        addq    t0, 1, t0               // increment byte position within quad
        cmpeq   t0, 8, t3               // t0 = 8?, if so first quadword done
        beq     t3, 20b                 // continue while t0 < 8

        addq    a0, 8, a0               // increment to next quadword
        addq    a1, 8, a1               // increment source 2 to next also
        bic     a0, 7, a0               // align source 1 quadword
        bic     a1, 7, a1               // align source 2 quadword

//
// aligned block compare, compare blocks of 64 bytes (8 quadwords)
//

30:
        srl     a2, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 50f                 // if eq, no 64 byte blocks

//
// N.B. loads from each of the sources were separated in case these
// blocks are fighting for the cache
//
        .set    noat
40:
        ldq     t1, 0(a0)               // t1 = source 1, quad 0
        ldq     t2, 8(a0)               // t2 = source 1, quad 1
        ldq     t3, 16(a0)              // t3 = source 1, quad 2
        addq    a1, 64, a1              // increment source 2 pointer
        ldq     t4, 24(a0)              // t4 = source 1, quad 3

        ldq     t5, -64(a1)             // t5 = source 2, quad 0
        ldq     a4, -56(a1)             // a4 = source 2, quad 1
        ldq     a5, -48(a1)             // a5 = source 2, quad 2
        xor     t1, t5, $at             // quad 0 match?
        bne     $at, 200f               // if ne[false], miscompare
        ldq     t5, -40(a1)             // t5 = source 2, quad 3
        ldq     t1, 32(a0)              // t1 = source 1, quad 4
        xor     t2, a4, $at             // quad 1 match?
        bne     $at, 122f               // if ne[false], miscompare
        ldq     t2, 40(a0)              // t2 = source 1, quad 5
        xor     t3, a5, $at             // quad 2 match?
        bne     $at, 124f               // if ne[false], miscompare
        ldq     t3, 48(a0)              // t3 = source 1, quad 6
        xor     t4, t5, $at             // quad 3 match?
        bne     $at, 126f               // if ne[false], miscompare
        ldq     t4, 56(a0)              // t4 = source 1, quad 7

        ldq     t5, -32(a1)             // t5 = source 2, quad 4
        addq    a0, 64, a0              // increment source 1 pointer
        ldq     a4, -24(a1)             // a4 = source 2, quad 5
        subq    t0, 1, t0               // decrement blocks to compare
        ldq     a5, -16(a1)             // a5 = source 2, quad 6
        xor     t1, t5, $at             // quad 4 match?
        bne     $at, 130f               // if ne[false], miscompare
        ldq     t5, -8(a1)              // t5 = source 2, quad 7
        xor     t2, a4, $at             // quad 5 match?
        bne     $at, 132f               // if ne[false], miscompare
        xor     t3, a5, $at             // quad 6 match?
        bne     $at, 134f               // if ne[false], miscompare
        xor     t4, t5, $at             // quad 7 match?
        bne     $at, 136f               // if ne[false], miscompare
        subq    a2, 64, a2              // decrement bytes to compare
        bne     t0, 40b                 // if ne, more blocks to compare
        .set    at

//
// Compare quadwords
//

50:
        srl     a2, 3, t0               // t0 = number of quadwords to compare
        beq     t0, 70f                 // if eq, no quadwords to compare

        .set    noat
60:
        ldq     t1, 0(a0)               // t1 = quad from source 1
        lda     a0, 8(a0)               // increment source 1 pointer
        ldq     t2, 0(a1)               // t2 = quad from source 2
        lda     a1, 8(a1)               // increment source 2 pointer
        xor     t1, t2, $at             // are quadwords equal?
        bne     $at, 200f               // if ne, miscompare
        subq    t0, 1, t0               // decrement quads to compare
        subq    a2, 8, a2               // decrement bytes to compare
        bne     t0, 60b                 // if ne, more quads to compare

        .set    at

//
// Compare bytes in last quadword
//

// a2 = number of bytes to compare, less than 8, greater than zero
// a0, a1, quad-aligned to last quadword

        beq     a2, 80f                 // if eq, all bytes compared

        .set    noat
70:
        ldq     t1, 0(a0)               // t1 = quad at source 1
        ldq     t2, 0(a1)               // t2 = quad at source 2
        bis     zero, 0xff, t0          // zap mask: bytes >= a2 are ignored
        sll     t0, a2, t0              // shift mask past the bytes to compare
        zap     t1, t0, t1              // zero bytes not compared
        zap     t2, t0, t2              // same for source 2
        xor     t1, t2, $at             // compare quadwords
        bne     $at, 200f               // if ne, miscompare

        .set    at
//
// Successful compare
// v0 already contains full length
//

80:
        ret     zero, (ra)              // return

//
// Sources have incompatible alignment
//
CompareUnaligned:

//
// Compare byte-by-byte until source 1 (a0) is aligned
//

        and     a0, 0x7, t0             // get byte position of pointer
        beq     t0, 30f                 // if eq, already aligned

        ldq_u   t1, 0(a0)               // get unaligned quad at a0

10:
        ldq_u   t2, 0(a1)               // get unaligned quad at a1
        extbl   t1, t0, t4              // get byte to compare from source 1
        extbl   t2, a1, t2              // get byte at position (a1 & 7) from source 2
        xor     t4, t2, t3              // do bytes match?
        bne     t3, 110f                // if ne, miscompare
        subq    a2, 1, a2               // decrement bytes to compare
        beq     a2, 90f                 // (JAE) quit if nothing left to compare
        addq    t0, 1, t0               // increment byte within source 1
        addq    a1, 1, a1               // increment source 2 pointer
        cmpeq   t0, 8, t3               // finished with source 1 quad?
        beq     t3, 10b                 // if eq[false], more to compare

        addq    a0, 7, a0               // point to next source 1 quad
        bic     a0, 7, a0               // align to quadword

//
// Compare 64-byte blocks: source 1 is now quad-aligned; source 2 is read
// with ldq_u pairs and re-assembled with extql/extqh before each compare.
//

30:
        srl     a2, 6, t0               // t0 = number of blocks to compare
        beq     t0, 50f                 // if eq, no blocks to move

        ldq_u   t1, 0(a1)               // get source 2 unaligned quad 1

        .set    noat
40:
        ldq_u   t2, 7(a1)               // get source 2 unaligned quad 2
        addq    a0, 64, a0              // increment source 1 pointer
        ldq_u   t3, 15(a1)              // get source 2 unaligned quad 3
        extql   t1, a1, t1              // bytes from unaligned quad 1
        extqh   t2, a1, $at             // bytes from unaligned quad 2
        ldq_u   t4, 23(a1)              // get source 2 unaligned quad 4
        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
        ldq_u   t5, 31(a1)              // get source 2 unaligned quad 5
        extql   t2, a1, t2              // bytes from unaligned quad 2
        extqh   t3, a1, $at             // bytes from unaligned quad 3
        ldq     a3, -64(a0)             // a3 = quadword 1 (source 1)
        bis     t2, $at, t2             // t2 = quadword 2 (source 2)
        ldq     a4, -56(a0)             // a4 = quadword 2 (source 1)
        extql   t3, a1, t3              // bytes from unaligned quad 3
        extqh   t4, a1, $at             // bytes from unaligned quad 4
        ldq     a5, -48(a0)             // a5 = quadword 3 (source 1)
        bis     t3, $at, t3             // t3 = quadword 3 (source 2)
        extql   t4, a1, t4              // bytes from unaligned quad 4
        extqh   t5, a1, $at             // bytes from unaligned quad 5
        subq    t0, 1, t0               // decrement blocks to compare
        bis     t4, $at, t4             // t4 = quadword 4 (source 2)

        xor     t1, a3, $at             // match on quadword 1?
        ldq     a3, -40(a0)             // a3 = quadword 4 (source 1)
        bne     $at, 200f               // if ne, miscompare quad 1
        xor     t2, a4, $at             // match on quadword 2?
        ldq_u   t2, 39(a1)              // get source 2 unaligned quad 6
        bne     $at, 122f               // if ne, miscompare quad 2
        xor     t3, a5, $at             // match on quadword 3?
        ldq_u   t3, 47(a1)              // get source 2 unaligned quad 7
        bne     $at, 124f               // if ne, miscompare quad 3
        xor     t4, a3, $at             // match on quadword 4?
        ldq_u   t4, 55(a1)              // get source 2 unaligned quad 8
        bne     $at, 126f               // if ne, miscompare quad 4
        ldq_u   t1, 63(a1)              // get source 2 unaligned quad 9

        ldq     a3, -32(a0)             // a3 = quadword 5 (source 1)
        extql   t5, a1, t5              // bytes from unaligned quad 5
        extqh   t2, a1, $at             // bytes from unaligned quad 6
        ldq     a4, -24(a0)             // a4 = quadword 6 (source 1)
        ldq     a5, -16(a0)             // a5 = quadword 7 (source 1)
        bis     t5, $at, t5             // t5 = quadword 5 (source 2)

        xor     t5, a3, $at             // match on quadword 5?
        ldq     a3, -8(a0)              // a3 = quadword 8 (source 1)
        bne     $at, 130f               // if ne, miscompare quad 5
        extql   t2, a1, t2              // bytes from unaligned quad 6
        extqh   t3, a1, $at             // bytes from unaligned quad 7
        extql   t3, a1, t3              // bytes from unaligned quad 7
        bis     t2, $at, t2             // t2 = quadword 6 (source 2)
        xor     t2, a4, $at             // match on quadword 6?
        bne     $at, 132f               // if ne, miscompare quad 6
        extqh   t4, a1, $at             // bytes from unaligned quad 8
        extql   t4, a1, t4              // bytes from unaligned quad 8
        bis     t3, $at, t3             // t3 = quadword 7 (source 2)
        xor     t3, a5, $at             // match on quadword 7?
        bne     $at, 134f               // if ne, miscompare quad 7
        extqh   t1, a1, $at             // bytes from unaligned quad 9
        addq    a1, 64, a1              // increment source 2 pointer
        bis     t4, $at, t4             // t4 = quadword 8 (source 2)
        xor     t4, a3, $at             // match on quadword 8?
        bne     $at, 136f               // if ne, miscompare quad 8
        subq    a2, 64, a2              // decrement number of bytes to compare
        bne     t0, 40b                 // if ne, more blocks to compare

        .set    at

//
// Compare quadwords
//

50:
        srl     a2, 3, t0               // t0 = number of quads to compare
        beq     t0, 70f                 // if eq, no quads to compare
        ldq_u   t1, 0(a1)               // get unaligned quad 1 (source 2)

        .set    noat
60:
        ldq_u   t2, 7(a1)               // get unaligned quad 2 (source 2)
        ldq     t3, 0(a0)               // t3 = quadword 1 (source 1)
        extql   t1, a1, t1              // get bytes from unaligned quad 1
        extqh   t2, a1, $at             // get bytes from unaligned quad 2
        addq    a1, 8, a1               // increment source 2 pointer
        bis     t1, $at, t1             // t1 = quadword 1 (source 2)
        xor     t1, t3, $at             // match on quadword?
        bne     $at, 200f               // if ne, miscompare
        subq    t0, 1, t0               // decrement quadwords to compare
        addq    a0, 8, a0               // increment source 1 pointer
        subq    a2, 8, a2               // decrement bytes to compare
        bis     t2, zero, t1            // save low quadword for next loop
        bne     t0, 60b                 // if ne, more quads to compare

        .set    at

//
// Compare bytes for final quadword
//

70:
        beq     a2, 90f                 // if eq, comparison complete

        ldq     t1, 0(a0)               // get quadword from source 1
        bis     zero, zero, t0          // t0 = byte position to compare

        .set    noat
80:
        ldq_u   t2, 0(a1)               // get unaligned quad from source 2
        extbl   t1, t0, t3              // t3 = byte from source 1
        extbl   t2, a1, t2              // t2 = byte from source 2
        xor     t3, t2, $at             // match on byte?
        bne     $at, 100f               // if ne, miscompare on byte
        addq    t0, 1, t0               // increment byte position
        addq    a1, 1, a1               // increment source 2 pointer
        subq    a2, 1, a2               // decrement bytes to compare
        bne     a2, 80b                 // if ne, more bytes to compare

        .set    at
//
// Successful full comparison
//

90:
        ret     zero, (ra)              // return, v0 already set

//
// Miscompare on last quadword
//

100:
        subq    v0, a2, v0              // subtract bytes not compared
        ret     zero, (ra)              // return

//
// Miscompare on byte-at-a-time compare (prologue or unaligned path)
//
// v0 = total bytes to compare
// a2 = bytes remaining to compare (includes the mismatching byte)
//

110:
        subq    v0, a2, v0              // bytes compared successfully
        ret     zero, (ra)              // return

//
// Miscompare on 64-byte block compare: a2 still reflects the block start,
// so bias it by the offset of the mismatching quad before the common code.
//

122:
        subq    a2, 8, a2               // miscompare on quad 2
        br      zero, 200f              // finish in common code

124:
        subq    a2, 16, a2              // miscompare on quad 3
        br      zero, 200f              // finish in common code

126:
        subq    a2, 24, a2              // miscompare on quad 4
        br      zero, 200f              // finish in common code

130:
        subq    a2, 32, a2              // miscompare on quad 5
        br      zero, 200f              // finish in common code

132:
        subq    a2, 40, a2              // miscompare on quad 6
        br      zero, 200f              // finish in common code

134:
        subq    a2, 48, a2              // miscompare on quad 7
        br      zero, 200f              // finish in common code

136:
        subq    a2, 56, a2              // miscompare on quad 8
        br      zero, 200f              // finish in common code

//
// Miscompare, determine number of bytes that successfully compared
// $at = xor of relevant quads from sources, must be non-zero
// a2 = number of bytes left to compare
//
        .set    noat
200:
        cmpbge  zero, $at, $at          // $at = mask of bytes where 0 >= byte, i.e. set where bytes matched (xor byte == 0)

//
// look for the first bit cleared in $at, this is the
// number of the first byte which differed
//
        bis     zero, zero, t0          // bit position to look for clear

210:
        blbc    $at, 220f               // if low bit clear, found difference
        srl     $at, 1, $at             // check next bit
        addq    t0, 1, t0               // count bit position checked
        br      zero, 210b

220:
        subq    v0, a2, v0              // subtract bytes yet to compare
        addq    v0, t0, v0              // add bytes that matched on last quad

        ret     zero, (ra)

        .set    at

        .end    RtlCompareMemory
|
|
|
|
|
|
|
|
SBTTL("Move Memory")
|
|
//++
|
|
//
|
|
// VOID
|
|
// RtlMoveMemory (
|
|
// IN PVOID Destination,
|
|
// IN PVOID Source,
|
|
// IN ULONG Length
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function moves memory either forward or backward, aligned or
|
|
// unaligned, in 64-byte blocks, followed by 8-byte blocks, followed
|
|
// by any remaining bytes.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Destination (a0) - Supplies a pointer to the destination address of
|
|
// the move operation.
|
|
//
|
|
// Source (a1) - Supplies a pointer to the source address of the move
|
|
// operation.
|
|
//
|
|
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// None.
|
|
//
|
|
//--
|
|
|
|
LEAF_ENTRY(RtlMoveMemory)

//
// Register usage throughout this routine:
//   a0 = Destination pointer, a1 = Source pointer
//   a2 = bytes remaining to move
//   t0-t5, a3-a5, v0 = scratch (integer-volatile only; no floating state,
//   per the module requirement that these routines preserve floating state)
//

        beq     a2, 80f                 // if eq, no bytes to move
//
// If the source address is less than the destination address and source
// address plus the length of the move is greater than the destination
// address, then the source and destination overlap such that the move
// must be performed backwards.
//

        cmpult  a0, a1, t0              // is destination less than source
        bne     t0, MoveForward         // if ne [true], no harmful overlap possible
        addq    a1, a2, t0              // compute source ending address
        cmpult  t0, a0, t1              // is source end less than dest.
        beq     t1, MoveBackward        // if eq [false], overlap - copy backwards

//
// Move memory forward aligned and unaligned.
//

MoveForward:                            //
        xor     a0, a1, t0              // compare alignment bits
        and     t0, 0x7, t0             // isolate alignment comparison
        bne     t0, MoveForwardUnaligned // if ne, incompatible alignment

//
// Move memory forward aligned.
//

MoveForwardAligned:                     //

//
// Move bytes until source and destination are quadword aligned
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        bne     t0, 5f                  // if ne, not quad aligned
        br      zero, 20f               // predicted taken

5:
        ldq_u   t2, 0(a0)               // get unaligned quad from dest.
        ldq_u   t1, 0(a1)               // get unaligned quadword from source
10:
        beq     a2, 15f                 // if eq, all bytes moved
        extbl   t1, t0, t3              // t3 = byte from source
        insbl   t3, t0, t3              // t3 = byte from source, in position
        mskbl   t2, t0, t2              // clear position in dest. quad
        bis     t2, t3, t2              // merge in byte from source
        subq    a2, 1, a2               // decrement bytes to move
        addq    t0, 1, t0               // increment byte within quad
        cmpeq   t0, 8, t3               // finished the quadword?
        beq     t3, 10b                 // if eq [false], do next byte
15:
        stq_u   t2, 0(a0)               // store merged destination bytes

        addq    a0, 7, a0               // move to next quadword
        bic     a0, 7, a0               // aligned quadword

        addq    a1, 7, a1               // move to next quadword
        bic     a1, 7, a1               // aligned quadword

//
// Check for 64-byte block moves
//

20:
        srl     a2, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 40f                 // if eq no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes

30:
        ldq     t1, 0(a1)               // load 64 bytes from source
        addq    a0, 64, a0              // increment destination pointer
        ldq     v0, 56(a1)              //
        ldq     a3, 32(a1)              //
        stq     t1, -64(a0)             // write to destination
        ldq     t2, 8(a1)               // into volatile registers
        ldq     t3, 16(a1)              //
        ldq     t4, 24(a1)              //
        subq    t0, 1, t0               // decrement number of blocks
        stq     t2, -56(a0)             //
        ldq     a4, 40(a1)              //
        stq     t3, -48(a0)             //
        ldq     a5, 48(a1)              //
        stq     t4, -40(a0)             //
        addq    a1, 64, a1              // increment source pointer
        stq     a3, -32(a0)             //
        stq     a4, -24(a0)             //
        stq     a5, -16(a0)             //
        stq     v0, -8(a0)              //
        bne     t0, 30b                 // if ne, more blocks to copy

//
// Copy quadwords
//

40:
        srl     a2, 3, t0               // t0 = number of quadwords to move
        beq     t0, 60f                 // if eq no quadwords to move
        and     a2, 8-1, a2             // a2 = residual bytes

50:
        ldq     t1, 0(a1)               // load quadword from source
        addq    a1, 8, a1               // increment source pointer
        stq     t1, 0(a0)               // store quadword to destination
        addq    a0, 8, a0               // increment destination pointer
        subq    t0, 1, t0               // decrement number of quadwords
        bne     t0, 50b                 // if ne, more quadwords to move

//
// Move final residual bytes
//

60:
        beq     a2, 80f                 // if eq, no more bytes to move
        ldq     t1, 0(a1)               // get last source quadword
        ldq     t2, 0(a0)               // get last dest. quadword
        bis     zero, zero, t0          // t0 = next byte number to move

70:
        extbl   t1, t0, t3              // extract byte from source
        insbl   t3, t0, t3              // t3 = source byte, in position
        mskbl   t2, t0, t2              // clear byte position for dest.
        bis     t2, t3, t2              // merge in source byte
        addq    t0, 1, t0               // increment byte position
        subq    a2, 1, a2               // decrement bytes to move
        bne     a2, 70b                 // if ne => more bytes to move

        stq     t2, 0(a0)               // store merged data

//
// Finish aligned MoveForward
//

80:
        ret     zero, (ra)              // return

//
// Move memory forward unaligned.
//

MoveForwardUnaligned:                   //

//
// Move bytes until the destination is aligned
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        beq     t0, 100f                // if eq, destination quad aligned

        ldq_u   t2, 0(a0)               // get unaligned quad from dest

90:
        beq     a2, 95f                 // if eq no more bytes to move
        ldq_u   t1, 0(a1)               // get unaligned quad from source
        extbl   t1, a1, t1              // extract source byte
        insbl   t1, t0, t1              // t1 = source byte, in position
        mskbl   t2, t0, t2              // clear byte position in dest.
        bis     t2, t1, t2              // merge in source byte
        addq    t0, 1, t0               // increment byte position
        addq    a1, 1, a1               // increment source pointer
        subq    a2, 1, a2               // decrement bytes to move
        cmpeq   t0, 8, t3               // t0 = 8? => quad finished
        beq     t3, 90b                 // if eq [false], more bytes to move
95:
        stq_u   t2, 0(a0)               // store merged quadword
        addq    a0, 7, a0               // increment to next quad
        bic     a0, 7, a0               // align next quadword

//
// Check for 64-byte blocks to move
//

100:
        srl     a2, 6, t0               // t0 = number of blocks to move
        beq     t0, 120f                // if eq no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes to move

        ldq_u   t1, 0(a1)               // t1 = first unaligned quad

110:
                                        // get source data and merge it
                                        // as we go
        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
        extql   t1, a1, t1              // extract applicable bytes from t1
        extqh   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quad #1
        ldq_u   t3, 15(a1)              // t3 = third unaligned quad
        extql   t2, a1, t2              // extract applicable bytes from t2
        extqh   t3, a1, v0              // extract applicable bytes from t3
        stq     t1, 0(a0)               // store quad #1
        bis     t2, v0, t2              // t2 = quad #2
        ldq_u   t4, 23(a1)              // t4 = fourth unaligned quad
        extql   t3, a1, t3              // extract applicable bytes from t3
        extqh   t4, a1, v0              // extract applicable bytes from t4
        stq     t2, 8(a0)               // store quad #2
        bis     t3, v0, t3              // t3 = quad #3
        ldq_u   t5, 31(a1)              // t5 = fifth unaligned quad
        extql   t4, a1, t4              // extract applicable bytes from t4
        extqh   t5, a1, v0              // extract applicable bytes from t5
        stq     t3, 16(a0)              // store quad #3
        bis     t4, v0, t4              // t4 = quad #4
        ldq_u   a3, 39(a1)              // a3 = sixth unaligned quad
        extql   t5, a1, t5              // extract applicable bytes from t5
        extqh   a3, a1, v0              // extract applicable bytes from a3
        stq     t4, 24(a0)              // store quad #4
        bis     t5, v0, t5              // t5 = quad #5
        ldq_u   a4, 47(a1)              // a4 = seventh unaligned quad
        extql   a3, a1, a3              // extract applicable bytes from a3
        extqh   a4, a1, v0              // extract applicable bytes from a4
        stq     t5, 32(a0)              // store quad #5
        bis     a3, v0, a3              // a3 = quad #6
        ldq_u   a5, 55(a1)              // a5 = eighth unaligned quad
        extql   a4, a1, a4              // extract applicable bytes from a4
        extqh   a5, a1, v0              // extract applicable bytes from a5
        stq     a3, 40(a0)              // store quad #6
        bis     a4, v0, a4              // a4 = quad #7
        ldq_u   t1, 63(a1)              // t1 = ninth unaligned = 1st of next
        extql   a5, a1, a5              // extract applicable bytes from a5
        extqh   t1, a1, v0              // extract applicable bytes from t1
        stq     a4, 48(a0)              // store quad #7
        bis     a5, v0, a5              // a5 = quad #8
        addq    a1, 64, a1              // increment source pointer
        stq     a5, 56(a0)              // store quad #8
        addq    a0, 64, a0              // increment destination pointer
        subq    t0, 1, t0               // decrement number of blocks
        bne     t0, 110b                // if ne, more blocks to move

//
// Move unaligned source quads to aligned destination quads
//

120:
        srl     a2, 3, t0               // t0 = number of quads to move
        beq     t0, 140f                // if eq no quads to move
        and     a2, 8-1, a2             // a2 = residual bytes

        ldq_u   t1, 0(a1)               // t1 = first unaligned quad
130:
        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
        addq    a0, 8, a0               // increment destination pointer
        extql   t1, a1, t1              // extract applicable bytes from t1
        extqh   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quadword of data
        stq     t1, -8(a0)              // store data to destination
        addq    a1, 8, a1               // increment source pointer
        subq    t0, 1, t0               // decrement quads to move
        bis     t2, zero, t1            // t1 = first of next unaligned pair
        bne     t0, 130b                // if ne, more quads to move

//
// Move remaining bytes to final quadword
//

140:
        beq     a2, 160f                // if eq no more bytes to move
        ldq     t2, 0(a0)               // t2 = destination quadword
        bis     zero, zero, t3          // t3 = position for next insertion

150:
        ldq_u   t1, 0(a1)               // get unaligned source quad
        extbl   t1, a1, t1              // t1 = source byte
        insbl   t1, t3, t1              // t1 = source byte, in position
        mskbl   t2, t3, t2              // clear byte in destination
        bis     t2, t1, t2              // merge in source byte
        addq    a1, 1, a1               // increment source pointer
        subq    a2, 1, a2               // decrement bytes to move
        addq    t3, 1, t3               // increment destination position
        bne     a2, 150b                // more bytes to move

        stq     t2, 0(a0)               // store merged data

//
// Finish unaligned MoveForward
//

160:
        ret     zero, (ra)              // return

//
// Move memory backward (overlapping buffers): copy from the highest
// address down so source bytes are consumed before they are overwritten.
//

MoveBackward:                           //

        addq    a0, a2, a0              // compute ending destination address
        addq    a1, a2, a1              // compute ending source address
        subq    a0, 1, a0               // point to last destination byte
        subq    a1, 1, a1               // point to last source byte
        xor     a0, a1, t0              // compare alignment bits
        and     t0, 0x7, t0             // isolate alignment comparison
        bne     t0, MoveBackwardUnaligned // if ne, incompatible alignment

//
// Move memory backward aligned.
//

MoveBackwardAligned:                    //

//
// Move bytes until source and destination are quadword aligned
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        cmpeq   t0, 7, t1               // last byte position 7?
        beq     t1, 5f                  // if eq [false], not quad aligned
        subq    a0, 7, a0               // point to beginning of last quad
        subq    a1, 7, a1               // point to beginning of last quad
        br      zero, 30f               // predicted taken

5:
        ldq_u   t1, 0(a0)               // get unaligned quad from dest.
        ldq_u   t2, 0(a1)               // get unaligned quad from source

10:
        beq     a2, 20f                 // if eq, all bytes moved
        extbl   t2, t0, t3              // t3 = byte from source
        insbl   t3, t0, t3              // t3 = byte from source, in position
        mskbl   t1, t0, t1              // clear position in destination
        bis     t1, t3, t1              // merge in byte from source
        subq    a2, 1, a2               // decrement bytes to move
        subq    t0, 1, t0               // decrement byte within quadword
        cmplt   t0, zero, t3            // finished the quadword?
        beq     t3, 10b                 // if eq [false], do next byte

20:
        stq_u   t1, 0(a0)               // store merged destination bytes

        subq    a0, 8, a0               // move to previous quadword
        bic     a0, 7, a0               // aligned quadword

        subq    a1, 8, a1               // move to previous quadword
        bic     a1, 7, a1               // aligned quadword

//
// Check for 64-byte block moves
//

30:

        srl     a2, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 50f                 // if eq, no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes

40:
        ldq     t1, 0(a1)               // load 64 bytes from source into
        subq    a0, 64, a0              // decrement destination pointer
        ldq     v0, -56(a1)             //
        ldq     a3, -32(a1)             //
        stq     t1, 64(a0)              // write to destination
        ldq     t2, -8(a1)              // into volatile registers
        ldq     a5, -48(a1)             //
        ldq     a4, -40(a1)             //
        stq     t2, 56(a0)              //
        ldq     t3, -16(a1)             //
        ldq     t4, -24(a1)             //
        subq    a1, 64, a1              // decrement source pointer
        stq     t3, 48(a0)              //
        stq     t4, 40(a0)              //
        stq     a3, 32(a0)              //
        subq    t0, 1, t0               // decrement number of blocks
        stq     a4, 24(a0)              //
        stq     a5, 16(a0)              //
        stq     v0, 8(a0)               //
        bne     t0, 40b                 // if ne, more blocks to copy

//
// Copy quadwords
//

50:
        srl     a2, 3, t0               // t0 = number of quadwords to move
        beq     t0, 70f                 // if eq no quadwords to move
        and     a2, 8-1, a2             // a2 = residual bytes

60:
        ldq     t1, 0(a1)               // load quadword from source
        subq    a1, 8, a1               // decrement source pointer
        stq     t1, 0(a0)               // store quadword to destination
        subq    a0, 8, a0               // decrement destination pointer
        subq    t0, 1, t0               // decrement quadwords to move
        bne     t0, 60b                 // if ne, more quadwords to move

//
// Move final residual bytes
//

70:
        beq     a2, 90f                 // if eq, no more bytes to move
        ldq     t1, 0(a1)               // get last source quadword
        ldq     t2, 0(a0)               // get last destination quadword
        bis     zero, 7, t0             // t0 = next byte number to move (top byte, moving down)

80:
        extbl   t1, t0, t3              // extract byte from source
        insbl   t3, t0, t3              // t3 = source byte, in position
        mskbl   t2, t0, t2              // clear byte position for dest.
        bis     t2, t3, t2              // merge in source byte
        subq    t0, 1, t0               // decrement byte position
        subq    a2, 1, a2               // decrement bytes to move
        bne     a2, 80b                 // if ne, more bytes to move

        stq     t2, 0(a0)               // write destination data
//
// Finish aligned MoveBackward
//

90:

        ret     zero, (ra)              // return

//
// Move memory backward unaligned.
//

MoveBackwardUnaligned:                  //

//
// Move bytes until the destination is aligned
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        cmpeq   t0, 7, t1               // last byte of a quadword
        beq     t1, 95f                 // if eq[false], not aligned
        subq    a0, 7, a0               // align pointer to beginning of quad
        br      zero, 120f              //

95:
        ldq_u   t2, 0(a0)               // get unaligned quad from dest.

100:
        beq     a2, 110f                // if eq, no more bytes to move
        ldq_u   t1, 0(a1)               // get unaligned quad from source
        extbl   t1, a1, t1              // extract source byte
        insbl   t1, t0, t1              // t1 = source byte in position
        mskbl   t2, t0, t2              // clear byte position in dest.
        bis     t2, t1, t2              // merge source byte
        subq    t0, 1, t0               // decrement byte position
        subq    a1, 1, a1               // decrement source pointer
        subq    a2, 1, a2               // decrement number of bytes to move
        cmplt   t0, zero, t3            // t0 < 0? => quad finished
        beq     t3, 100b                // if eq [false], more bytes to move

110:
        stq_u   t2, 0(a0)               // store merged quadword

        subq    a0, 8, a0               // decrement dest. to previous quad
        bic     a0, 7, a0               // align previous quadword

//
// Check for 64-byte blocks to move
//

120:

        srl     a2, 6, t0               // t0 = number of blocks to move
        subq    a1, 7, a1               // point to beginning of last quad
        beq     t0, 140f                // if eq no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes to move

        ldq_u   t1, 7(a1)               // t1 = first unaligned quad

130:
                                        // get source data and merge it
                                        // as we go
        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
        extqh   t1, a1, t1              // extract applicable bytes from t1
        extql   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quad #1
        ldq_u   t3, -8(a1)              // t3 = third unaligned quad
        extqh   t2, a1, t2              // extract applicable bytes from t2
        extql   t3, a1, v0              // extract applicable bytes from t3
        stq     t1, 0(a0)               // store quad #1
        bis     t2, v0, t2              // t2 = quad #2
        ldq_u   t4, -16(a1)             // t4 = fourth unaligned quad
        extqh   t3, a1, t3              // extract applicable bytes from t3
        extql   t4, a1, v0              // extract applicable bytes from t4
        stq     t2, -8(a0)              // store quad #2
        bis     t3, v0, t3              // t3 = quad #3
        ldq_u   t5, -24(a1)             // t5 = fifth unaligned quad
        extqh   t4, a1, t4              // extract applicable bytes from t4
        extql   t5, a1, v0              // extract applicable bytes from t5
        stq     t3, -16(a0)             // store quad #3
        bis     t4, v0, t4              // t4 = quad #4
        ldq_u   a3, -32(a1)             // a3 = sixth unaligned quad
        extqh   t5, a1, t5              // extract applicable bytes from t5
        extql   a3, a1, v0              // extract applicable bytes from a3
        stq     t4, -24(a0)             // store quad #4
        bis     t5, v0, t5              // t5 = quad #5
        ldq_u   a4, -40(a1)             // a4 = seventh unaligned quad
        extqh   a3, a1, a3              // extract applicable bytes from a3
        extql   a4, a1, v0              // extract applicable bytes from a4
        stq     t5, -32(a0)             // store quad #5
        bis     a3, v0, a3              // a3 = quad #6
        ldq_u   a5, -48(a1)             // a5 = eighth unaligned quad
        extqh   a4, a1, a4              // extract applicable bytes from a4
        extql   a5, a1, v0              // extract applicable bytes from a5
        stq     a3, -40(a0)             // store quad #6
        bis     a4, v0, a4              // a4 = quad #7
        ldq_u   t1, -56(a1)             // t1 = ninth unaligned = 1st of next
        extqh   a5, a1, a5              // extract applicable bytes from a5
        extql   t1, a1, v0              // extract applicable bytes from t1
        stq     a4, -48(a0)             // store quad #7
        bis     a5, v0, a5              // a5 = quad #8
        subq    a1, 64, a1              // decrement source pointer
        stq     a5, -56(a0)             // store quad #8
        subq    a0, 64, a0              // decrement destination pointer
        subq    t0, 1, t0               // decrement number of blocks
        bne     t0, 130b                // if ne, more blocks to move

//
// Move unaligned source quads to aligned destination quads
//

140:
        srl     a2, 3, t0               // t0 = number of quads to move
        beq     t0, 160f                // if eq no quads to move
        and     a2, 8-1, a2             // a2 = residual bytes

        ldq_u   t1, 7(a1)               // t1 = first unaligned quad

150:
        ldq_u   t2, 0(a1)               // t2 = second unaligned quad
        subq    a0, 8, a0               // decrement destination pointer
        extqh   t1, a1, t1              // extract applicable bytes from t1
        extql   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quadword of data
        stq     t1, 8(a0)               // store data to destination
        subq    a1, 8, a1               // decrement source pointer
        subq    t0, 1, t0               // decrement quads to move
        bis     t2, zero, t1            // t1 = first of next unaligned pair
        bne     t0, 150b                // if ne, more quads to move

//
// Move remaining bytes to final quadword
//

160:
        beq     a2, 180f                // if eq, no more bytes to move
        ldq     t2, 0(a0)               // t2 = destination quadword
        bis     zero, 7, t0             // t0 = position for next insertion

170:
        subq    a1, 1, a1               // decrement source pointer
        ldq_u   t1, 8(a1)               // get unaligned source quad
        extbl   t1, a1, t1              // t1 = source byte
        insbl   t1, t0, t1              // t1 = source byte, in position
        mskbl   t2, t0, t2              // clear byte position
        bis     t2, t1, t2              // merge in source byte
        subq    t0, 1, t0               // decrement byte position for dest.
        subq    a2, 1, a2               // decrement bytes to move
        bne     a2, 170b                // if ne, more bytes to move

        stq     t2, 0(a0)               // store merged data

//
// Finish unaligned MoveBackward
//

180:
        ret     zero, (ra)              // return

        .end    RtlMoveMemory
|
|
|
|
SBTTL("Zero Memory")
|
|
//++
|
|
//
|
|
// VOID
|
|
// RtlZeroMemory (
|
|
// IN PVOID Destination,
|
|
// IN ULONG Length
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function zeros memory by first aligning the destination address to
|
|
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
|
|
// blocks, followed by any remaining bytes.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Destination (a0) - Supplies a pointer to the memory to zero.
|
|
//
|
|
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// None.
|
|
//
|
|
//--
|
|
|
|
        LEAF_ENTRY(RtlZeroMemory)

//
// Zeroing is a fill with a zero pattern; set the pattern and share the
// common fill code below (RtlpFillMemory).
//

        bis     zero, zero, a2          // set fill pattern (zero)
        br      zero, RtlpFillMemory    //

        SBTTL("Fill Memory")
//++
//
// VOID
// RtlFillMemory (
//    IN PVOID Destination,
//    IN ULONG Length,
//    IN UCHAR Fill
//    )
//
// Routine Description:
//
//    This function fills memory by first aligning the destination address to
//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
//    blocks, followed by any remaining bytes.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the memory to fill.
//
//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
//    Fill (a2) - Supplies the fill byte.
//
//    N.B. The alternate entry memset expects the length and fill arguments
//         to be reversed.  It also returns the Destination pointer.
//
// Return Value:
//
//    None.
//
//--

        ALTERNATE_ENTRY(memset)

//
// memset(dst, c, n): return value is dst, and the second/third arguments
// are swapped relative to RtlFillMemory, so shuffle them into place and
// fall through.
//

        bis     a0, zero, v0            // set return value = Destination
        bis     a1, zero, a3            // swap length and fill arguments
        bis     a2, zero, a1            // a1 = length
        bis     a3, zero, a2            // a2 = fill byte

        ALTERNATE_ENTRY(RtlFillMemory)

//
// Replicate the fill byte through all eight bytes of a2 so whole
// quadwords can be stored in the block loops below.
//

        and     a2, 0xff, a2            // clear excess bits
        sll     a2, 8, t0               // duplicate fill byte
        bis     a2, t0, a2              // generate fill word
        sll     a2, 16, t0              // duplicate fill word
        bis     a2, t0, a2              // generate fill longword
        sll     a2, 32, t0              // duplicate fill longword
        bis     a2, t0, a2              // generate fill quadword

        .align  3                       // ensure quadword aligned target

//
// Fill memory with the pattern specified in register a2.
//
//    a0 - current destination pointer
//    a1 - bytes remaining to fill
//    a2 - fill pattern, replicated through a quadword
//

RtlpFillMemory:                         //

//
// Align destination to quadword
//

        beq     a1, 80f                 // anything to fill? (paranoia)
        and     a0, 8-1, t0             // t0 = unaligned bits
        bne     t0, 5f                  // if ne, then not quad aligned
        br      zero, 20f               // if eq, then quad aligned

5:
        ldq_u   t1, 0(a0)               // get unaligned quadword
                                        // for first group of bytes
10:
        beq     a1, 15f                 // if eq no more bytes to fill
        insbl   a2, t0, t2              // get fill byte into position
        mskbl   t1, t0, t1              // clear byte for fill
        bis     t1, t2, t1              // put in fill byte
        addq    t0, 1, t0               // increment to next byte position
        subq    a1, 1, a1               // decrement bytes to fill
        cmpeq   t0, 8, t2               // reached end of quadword?
        beq     t2, 10b                 // if eq [false] more bytes to do

15:
        stq_u   t1, 0(a0)               // store modified bytes
        addq    a0, 7, a0               // move a0 to next quadword
        bic     a0, 7, a0               // align a0 to quadword
                                        // (safe even if a1 hit 0 first:
                                        // loops below then fill nothing)

//
// Check for 64-byte blocks
//

20:
        srl     a1, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 40f                 // if eq then no 64 byte blocks
        and     a1, 64-1, a1            // a1 = residual bytes to fill

30:
        stq     a2, 0(a0)               // store 64 bytes
        stq     a2, 8(a0)               //
        stq     a2, 16(a0)              //
        stq     a2, 24(a0)              //
        stq     a2, 32(a0)              //
        stq     a2, 40(a0)              //
        stq     a2, 48(a0)              //
        stq     a2, 56(a0)              //

        subq    t0, 1, t0               // decrement blocks remaining
        addq    a0, 64, a0              // increment destination pointer
        bne     t0, 30b                 // more blocks to write

//
// Fill aligned quadwords
//

40:
        srl     a1, 3, t0               // t0 = number of quadwords
        bne     t0, 55f                 // if ne quadwords left to fill
        br      zero, 60f               // if eq no quadwords left

55:
        and     a1, 8-1, a1             // a1 = residual bytes to fill

50:
        stq     a2, 0(a0)               // store quadword
        subq    t0, 1, t0               // decrement quadwords remaining
        addq    a0, 8, a0               // next quadword
        bne     t0, 50b                 // more quadwords to write

//
// Fill bytes for last quadword (read-modify-write the final quad so
// bytes beyond the fill are preserved)
//

60:
        bne     a1, 65f                 // if ne bytes remain to be filled
        br      zero, 80f               // if eq no more bytes to fill

65:
        ldq     t1, 0(a0)               // get last quadword
        bis     zero, zero, t0          // t0 = byte position to start fill

70:
        beq     a1, 75f                 // if eq, no more bytes to fill
        insbl   a2, t0, t2              // get fill byte into position
        mskbl   t1, t0, t1              // clear fill byte position
        bis     t1, t2, t1              // insert fill byte
        addq    t0, 1, t0               // increment byte within quad
        subq    a1, 1, a1               // decrement bytes to fill
        cmpeq   t0, 8, t3               // t0 = 8? => finished quad
        beq     t3, 70b                 // if eq [false] more bytes to fill

75:
        stq     t1, 0(a0)               // write merged quadword

//
// Finish up
//

80:
        ret     zero, (ra)              // return

        .end    RtlZeroMemory
|
|
|
|
SBTTL("Fill Memory Ulong")
|
|
//++
|
|
//
|
|
// VOID
|
|
// RtlFillMemoryUlong (
|
|
// IN PVOID Destination,
|
|
// IN ULONG Length,
|
|
// IN ULONG Pattern
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function fills memory with the specified longword pattern by
|
|
// filling 64-byte blocks followed by 8-byte blocks and finally
|
|
// 4-byte blocks.
|
|
//
|
|
// N.B. This routine assumes that the destination address is aligned
|
|
// on a longword boundary and that the length is an even multiple
|
|
// of longwords.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Destination (a0) - Supplies a pointer to the memory to fill.
|
|
//
|
|
// Length (a1) - Supplies the length, in bytes, of the memory to be filled.
|
|
//
|
|
// Pattern (a2) - Supplies the fill pattern.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// None.
|
|
//
|
|
//--
|
|
|
|
LEAF_ENTRY(RtlFillMemoryUlong)
|
|
|
|
bic a1, 3, a1 // make sure length is an even number
|
|
// of longwords
|
|
sll a2, 32, a3 // a3 = long pattern in upper 32 bits
|
|
srl a3, 32, t0 // clear upper bits, pattern in lower 32
|
|
bis a3, t0, a3 // a3 = quad version of fill pattern
|
|
|
|
//
|
|
// Make destination address quad-aligned
|
|
//
|
|
|
|
and a0, 4, t0 // is a0 quad aligned?
|
|
beq t0, 10f // if eq, then a0 quad aligned
|
|
stl a2, 0(a0) // fill first longword
|
|
addq a0, 4, a0 // quad align a0
|
|
subq a1, 4, a1 // bytes remaining to store
|
|
|
|
//
|
|
// Check for 64-byte blocks to fill
|
|
//
|
|
|
|
10:
|
|
srl a1, 6, t0 // t0 = # 64-byte blocks to fill
|
|
beq t0, 30f // if eq no 64 byte blocks
|
|
and a1, 64-1, a1 // a1 = residual bytes
|
|
|
|
20:
|
|
stq a3, 0(a0) // store 64 bytes
|
|
stq a3, 8(a0) //
|
|
stq a3, 16(a0) //
|
|
stq a3, 24(a0) //
|
|
stq a3, 32(a0) //
|
|
stq a3, 40(a0) //
|
|
stq a3, 48(a0) //
|
|
stq a3, 56(a0) //
|
|
subq t0, 1, t0 // t0 = blocks remaining
|
|
addq a0, 64, a0 // increment address pointer
|
|
bne t0, 20b // if ne more blocks to fill
|
|
|
|
//
|
|
// Fill 8 bytes at a time while we can, a1 = bytes remaining
|
|
//
|
|
|
|
30:
|
|
srl a1, 3, t0 // t0 = # quadwords to fill
|
|
beq t0, 50f // if eq no quadwords left
|
|
and a1, 8-1, a1 // a1 = residual bytes
|
|
40:
|
|
stq a3, 0(a0) // store quadword
|
|
subq t0, 1, t0 // t0 = quadwords remaining
|
|
addq a0, 8, a0 // increment address pointer
|
|
bne t0, 40b // if ne more quadwords to fill
|
|
|
|
//
|
|
// Fill last 4 bytes
|
|
//
|
|
|
|
50:
|
|
beq a1, 60f // if eq no longwords remain
|
|
stl a2, 0(a0) // fill last longword
|
|
|
|
//
|
|
// Finish up
|
|
//
|
|
|
|
60:
|
|
ret zero, (ra) // return to caller
|
|
|
|
|
|
.end RtlFillMemoryUlong
|
|
|
|
SBTTL("Copy Memory With Byte Granularity")
|
|
//++
|
|
//
|
|
// VOID
|
|
// RtlCopyBytes (
|
|
// IN PVOID Destination,
|
|
// IN PVOID Source,
|
|
// IN ULONG Length
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function copies non-overlapping memory, aligned or unaligned, in
|
|
// 64-byte blocks, followed by 8-byte blocks, followed by any remaining
|
|
// bytes. Unlike RtlCopyMemory or RtlMoveMemory the copy is done such
|
|
// that byte granularity is assured for all platforms.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Destination (a0) - Supplies a pointer to the destination address of
|
|
// the move operation.
|
|
//
|
|
// Source (a1) - Supplies a pointer to the source address of the move
|
|
// operation.
|
|
//
|
|
// Length (a2) - Supplies the length, in bytes, of the memory to be moved.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// None.
|
|
//
|
|
//--
|
|
|
|
        LEAF_ENTRY(RtlCopyBytes)

//
// Copy forward, choosing an aligned or an unaligned path.  Byte granularity
// is preserved at the partial quadwords on each end by using load-locked /
// store-conditional (ldq_l/stq_c) merge sequences, so bytes adjacent to the
// copy are never rewritten non-atomically.
//
// N.B. On a failed store conditional the code branches forward to an out of
//      line retry stub (retry1f..retry4f) so the success path falls through;
//      this conforms with the architecture rules for ldq_l/stq_c sequences
//      (no taken branches between the load lock and the conditional store).
//
// Register use:
//    a0 - current destination pointer
//    a1 - current source pointer
//    a2 - bytes remaining to move
//

        xor     a0, a1, t0              // compare alignment bits
        and     t0, 0x7, t0             // isolate alignment comparison
        bne     t0, CopyForwardUnaligned // if ne, incompatible alignment

//
// Source and Destination buffers have the same alignment.  Move
// bytes until done or source and destination are quadword aligned.
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        bne     t0, 5f                  // if ne, not quad aligned
        br      zero, 20f               // predicted taken

5:
        bis     zero, zero, t1          // t1 = destination byte zap mask
        bis     zero, 1, t2             //
        sll     t2, t0, t2              // t2 = next bit to set in zap mask

//
// Build a byte mask (one bit per byte position) covering the bytes from the
// starting alignment up to the quadword boundary, limited by the byte count.
//

10:
        beq     a2, 15f                 // if eq, no bytes remain; mask done
        bis     t1, t2, t1              // set bit in zap mask
        sll     t2, 1, t2               // set next higher bit for zap mask
        subq    a2, 1, a2               // decrement bytes to move
        addq    t0, 1, t0               // increment byte within quad
        cmpeq   t0, 8, t3               // finished the quadword?
        beq     t3, 10b                 // if eq [false], do next byte

15:
        ldq_u   t2, 0(a1)               // get unaligned quadword from source
        zapnot  t2, t1, t2              // keep only the source bytes to copy
        bic     a0, 7, a3               // a3 = quadword base of destination
retry1:
        ldq_l   t0, 0(a3)               // load destination quadword locked
        zap     t0, t1, t0              // clear destination bytes
        or      t0, t2, t0              // merge in bytes from source
        stq_c   t0, 0(a3)               // store merged quadword conditional
        beq     t0, retry1f             // if eq, retry failed interlock

        addq    a0, 7, a0               // move to next quadword
        bic     a0, 7, a0               // aligned quadword
                                        // (safe if a2 hit 0 first: the loops
                                        // below then move nothing)

        addq    a1, 7, a1               // move to next quadword
        bic     a1, 7, a1               // aligned quadword

//
// Check for 64-byte block moves
//

20:
        srl     a2, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 40f                 // if eq no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes

30:
        ldq     t1, 0(a1)               // load 64 bytes from source
        addq    a0, 64, a0              // increment destination pointer
        ldq     v0, 56(a1)              //
        ldq     a3, 32(a1)              //
        stq     t1, -64(a0)             // write to destination
        ldq     t2, 8(a1)               // into volatile registers
        ldq     t3, 16(a1)              //
        ldq     t4, 24(a1)              //
        subq    t0, 1, t0               // decrement number of blocks
        stq     t2, -56(a0)             //
        ldq     a4, 40(a1)              //
        stq     t3, -48(a0)             //
        ldq     a5, 48(a1)              //
        stq     t4, -40(a0)             //
        addq    a1, 64, a1              // increment source pointer
        stq     a3, -32(a0)             //
        stq     a4, -24(a0)             //
        stq     a5, -16(a0)             //
        stq     v0, -8(a0)              //
        bne     t0, 30b                 // if ne, more blocks to copy

//
// Copy quadwords
//

40:
        srl     a2, 3, t0               // t0 = number of quadwords to move
        beq     t0, 60f                 // if eq no quadwords to move
        and     a2, 8-1, a2             // a2 = residual bytes

50:
        ldq     t1, 0(a1)               // load quadword from source
        addq    a1, 8, a1               // increment source pointer
        stq     t1, 0(a0)               // store quadword to destination
        addq    a0, 8, a0               // increment destination pointer
        subq    t0, 1, t0               // decrement number of quadwords
        bne     t0, 50b                 // if ne, more quadwords to move

//
// Move final residual bytes (interlocked merge into the last quadword so
// destination bytes past the copy are preserved with byte granularity)
//

60:
        beq     a2, 80f                 // if eq, no more bytes to move
        mov     a2, t0                  // t0 = number of bytes to move
        mov     -1, t1                  // t1 = bit mask
        sll     t0, 3, t0               // # of bytes to # of bits
        srl     t1, t0, t1              // clear t0 bits
        sll     t1, t0, t0              // t0 = mask of bytes NOT copied
        ldq     t1, 0(a1)               // get last source quadword
        bic     t1, t0, t1              // clear bytes not copied
        not     t0, t0                  // complement to clear destination
retry2:
        ldq_l   t2, 0(a0)               // get last destination quadword locked
        bic     t2, t0, t2              // clear bytes to be copied
        bis     t2, t1, t2              // move bytes from source
        stq_c   t2, 0(a0)               // store merged quadword conditional
        beq     t2, retry2f             // if eq, retry failed interlock

//
// Finish aligned MoveForward
//

80:
        ret     zero, (ra)              // return

//
// Move memory forward unaligned.
//

CopyForwardUnaligned:                   //

//
// Move bytes until the destination is aligned
//

        and     a0, 0x7, t0             // t0 = unaligned bits
        beq     t0, 100f                // if eq, destination quad aligned
        bis     zero, zero, t1          // t1 = destination byte zap mask
        bis     zero, 1, t2             //
        sll     t2, t0, t2              // t2 = next bit to set in zap mask
        mov     zero, t4                // assemble destination bytes here

90:
        beq     a2, 95f                 // if eq no more bytes to move
        bis     t1, t2, t1              // set bit in zap mask
        sll     t2, 1, t2               // set next higher bit for zap mask
        ldq_u   t5, 0(a1)               // get unaligned quad from source
        extbl   t5, a1, t5              // extract source byte
        insbl   t5, t0, t5              // t5 = source byte, in position
        or      t4, t5, t4              // merge in source byte
        addq    t0, 1, t0               // increment byte position
        addq    a1, 1, a1               // increment source pointer
        subq    a2, 1, a2               // decrement bytes to move
        cmpeq   t0, 8, t3               // t0 = 8? => quad finished
        beq     t3, 90b                 // if eq [false], more bytes to move

95:
        bic     a0, 0x7, a3             // a3 = quadword base of destination
retry3:
        ldq_l   t0, 0(a3)               // load destination quadword locked
        zap     t0, t1, t0              // clear destination bytes
        or      t0, t4, t0              // merge in bytes from source
        stq_c   t0, 0(a3)               // store merged quadword conditional
        beq     t0, retry3f             // if eq, retry failed interlock

        addq    a0, 7, a0               // increment to next quad
        bic     a0, 7, a0               // align next quadword

//
// Check for 64-byte blocks to move
//

100:
        srl     a2, 6, t0               // t0 = number of blocks to move
        beq     t0, 120f                // if eq no blocks to move
        and     a2, 64-1, a2            // a2 = residual bytes to move

        ldq_u   t1, 0(a1)               // t1 = first unaligned quad

110:
                                        // get source data and merge it
                                        // as we go
        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
        extql   t1, a1, t1              // extract applicable bytes from t1
        extqh   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quad #1
        ldq_u   t3, 15(a1)              // t3 = third unaligned quad
        extql   t2, a1, t2              // extract applicable bytes from t2
        extqh   t3, a1, v0              // extract applicable bytes from t3
        stq     t1, 0(a0)               // store quad #1
        bis     t2, v0, t2              // t2 = quad #2
        ldq_u   t4, 23(a1)              // t4 = fourth unaligned quad
        extql   t3, a1, t3              // extract applicable bytes from t3
        extqh   t4, a1, v0              // extract applicable bytes from t4
        stq     t2, 8(a0)               // store quad #2
        bis     t3, v0, t3              // t3 = quad #3
        ldq_u   t5, 31(a1)              // t5 = fifth unaligned quad
        extql   t4, a1, t4              // extract applicable bytes from t4
        extqh   t5, a1, v0              // extract applicable bytes from t5
        stq     t3, 16(a0)              // store quad #3
        bis     t4, v0, t4              // t4 = quad #4
        ldq_u   a3, 39(a1)              // a3 = sixth unaligned quad
        extql   t5, a1, t5              // extract applicable bytes from t5
        extqh   a3, a1, v0              // extract applicable bytes from a3
        stq     t4, 24(a0)              // store quad #4
        bis     t5, v0, t5              // t5 = quad #5
        ldq_u   a4, 47(a1)              // a4 = seventh unaligned quad
        extql   a3, a1, a3              // extract applicable bytes from a3
        extqh   a4, a1, v0              // extract applicable bytes from a4
        stq     t5, 32(a0)              // store quad #5
        bis     a3, v0, a3              // a3 = quad #6
        ldq_u   a5, 55(a1)              // a5 = eighth unaligned quad
        extql   a4, a1, a4              // extract applicable bytes from a4
        extqh   a5, a1, v0              // extract applicable bytes from a5
        stq     a3, 40(a0)              // store quad #6
        bis     a4, v0, a4              // a4 = quad #7
        ldq_u   t1, 63(a1)              // t1 = ninth unaligned = 1st of next
        extql   a5, a1, a5              // extract applicable bytes from a5
        extqh   t1, a1, v0              // extract applicable bytes from t1
        stq     a4, 48(a0)              // store quad #7
        bis     a5, v0, a5              // a5 = quad #8
        addq    a1, 64, a1              // increment source pointer
        stq     a5, 56(a0)              // store quad #8
        addq    a0, 64, a0              // increment destination pointer
        subq    t0, 1, t0               // decrement number of blocks
        bne     t0, 110b                // if ne, more blocks to move

//
// Move unaligned source quads to aligned destination quads
//

120:
        srl     a2, 3, t0               // t0 = number of quads to move
        beq     t0, 140f                // if eq no quads to move
        and     a2, 8-1, a2             // a2 = residual bytes

        ldq_u   t1, 0(a1)               // t1 = first unaligned quad

130:
        ldq_u   t2, 7(a1)               // t2 = second unaligned quad
        addq    a0, 8, a0               // increment destination pointer
        extql   t1, a1, t1              // extract applicable bytes from t1
        extqh   t2, a1, v0              // extract applicable bytes from t2
        bis     t1, v0, t1              // t1 = quadword of data
        stq     t1, -8(a0)              // store data to destination
        addq    a1, 8, a1               // increment source pointer
        subq    t0, 1, t0               // decrement quads to move
        bis     t2, zero, t1            // t1 = first of next unaligned pair
        bne     t0, 130b                // if ne, more quads to move

//
// Move remaining bytes to final quadword (assembled in t4, then merged
// with an interlocked sequence to preserve byte granularity)
//

140:
        beq     a2, 160f                // if eq no more bytes to move

        mov     zero, t3                // t3 = position for next insertion
        mov     zero, t4                // assemble destination bytes here
        mov     a2, t0                  // t0 = number of bytes to move
        mov     -1, t1                  // t1 = bit mask
        sll     t0, 3, t0               // # of bytes to # of bits
        srl     t1, t0, t1              // clear t0 bits
        sll     t1, t0, t0              // t0 = mask of bytes NOT copied
        not     t0, t0                  // complement for destination clear mask

150:
        ldq_u   t1, 0(a1)               // get unaligned source quad
        extbl   t1, a1, t1              // t1 = source byte
        insbl   t1, t3, t1              // t1 = source byte, in position
        bis     t4, t1, t4              // merge in source byte
        addq    a1, 1, a1               // increment source pointer
        subq    a2, 1, a2               // decrement bytes to move
        addq    t3, 1, t3               // increment destination position
        bne     a2, 150b                // more bytes to move
retry4:
        ldq_l   t2, 0(a0)               // get last destination quadword locked
        bic     t2, t0, t2              // clear bytes to be copied
        bis     t2, t4, t2              // move bytes from source
        stq_c   t2, 0(a0)               // store merged quadword conditional
        beq     t2, retry4f             // if eq, retry failed interlock

//
// Finish unaligned MoveForward
//

160:
        ret     zero, (ra)              // return

//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//

retry1f:
        br      retry1
retry2f:
        br      retry2
retry3f:
        br      retry3
retry4f:
        br      retry4

        .end    RtlCopyBytes
|
|
|
|
SBTTL("Zero Bytes")
|
|
//++
|
|
//
|
|
// VOID
|
|
// RtlZeroBytes (
|
|
// IN PVOID Destination,
|
|
// IN ULONG Length
|
|
// )
|
|
//
|
|
// Routine Description:
|
|
//
|
|
// This function zeros memory by first aligning the destination address to
|
|
// a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte
|
|
// blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is
|
|
// done such that byte granularity is assured for all platforms.
|
|
//
|
|
// Arguments:
|
|
//
|
|
// Destination (a0) - Supplies a pointer to the memory to zero.
|
|
//
|
|
// Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
|
|
//
|
|
// Return Value:
|
|
//
|
|
// None.
|
|
//
|
|
//--
|
|
|
|
        LEAF_ENTRY(RtlZeroBytes)

//
// Zeroing is a fill with a zero pattern; set the pattern and share the
// common byte-granular fill code below (RtlpFillBytes).
//

        bis     zero, zero, a2          // set fill pattern (zero)
        br      zero, RtlpFillBytes     //

        SBTTL("Fill Bytes")
//++
//
// VOID
// RtlFillBytes (
//    IN PVOID Destination,
//    IN ULONG Length,
//    IN UCHAR Fill
//    )
//
// Routine Description:
//
//    This function fills memory by first aligning the destination address to
//    a quadword boundary, and then filling 64-byte blocks, followed by 8-byte
//    blocks, followed by any remaining bytes.  Unlike RtlFillMemory the fill
//    is done such that byte granularity is assured for all platforms: the
//    partial quadwords at each end are merged with load-locked /
//    store-conditional sequences so neighboring bytes are never rewritten.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the memory to fill.
//
//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
//    Fill (a2) - Supplies the fill byte.
//
// Return Value:
//
//    None.
//
//--

        ALTERNATE_ENTRY(RtlFillBytes)

//
// Replicate the fill byte through all eight bytes of a2 so whole
// quadwords can be stored in the block loops below.
//

        and     a2, 0xff, a2            // clear excess bits
        sll     a2, 8, t0               // duplicate fill byte
        bis     a2, t0, a2              // generate fill word
        sll     a2, 16, t0              // duplicate fill word
        bis     a2, t0, a2              // generate fill longword
        sll     a2, 32, t0              // duplicate fill longword
        bis     a2, t0, a2              // generate fill quadword

        .align  3                       // ensure quadword aligned target

//
// Fill memory with the pattern specified in register a2.
//
//    a0 - current destination pointer
//    a1 - bytes remaining to fill
//    a2 - fill pattern, replicated through a quadword
//
// N.B. On a failed store conditional the code branches forward to an out of
//      line retry stub (retry5f/retry6f) so the success path falls through;
//      this conforms with the architecture rules for ldq_l/stq_c sequences
//      (no taken branches between the load lock and the conditional store).
//

RtlpFillBytes:                          //

//
// Align destination to quadword
//

        beq     a1, 80f                 // anything to fill? (paranoia)
        and     a0, 8-1, t0             // t0 = unaligned bits
        bne     t0, 5f                  // if ne, then not quad aligned
        br      zero, 20f               // if eq, then quad aligned

5:
        bis     zero, zero, t1          // t1 = destination byte zap mask
        bis     zero, 1, t2             //
        sll     t2, t0, t2              // t2 = next bit to set in zap mask

//
// Build a byte mask (one bit per byte position) covering the bytes from the
// starting alignment up to the quadword boundary, limited by the byte count.
//

10:
        beq     a1, 15f                 // if eq, no bytes remain; mask done
        bis     t1, t2, t1              // set bit in zap mask
        sll     t2, 1, t2               // set next higher bit for zap mask
        subq    a1, 1, a1               // decrement bytes to fill
        addq    t0, 1, t0               // increment byte within quad
        cmpeq   t0, 8, t3               // finished the quadword?
        beq     t3, 10b                 // if eq [false], do next byte

15:
        zapnot  a2, t1, t2              // keep only the fill bytes needed
        bic     a0, 7, a3               // a3 = quadword base of destination
retry5:
        ldq_l   t0, 0(a3)               // load destination quadword locked
        zap     t0, t1, t0              // clear destination bytes
        or      t0, t2, t0              // merge in fill bytes
        stq_c   t0, 0(a3)               // store merged quadword conditional
        beq     t0, retry5f             // if eq, retry failed interlock

        addq    a0, 7, a0               // move a0 to next quadword
        bic     a0, 7, a0               // align a0 to quadword

//
// Check for 64-byte blocks
//

20:
        srl     a1, 6, t0               // t0 = number of 64 byte blocks
        beq     t0, 40f                 // if eq then no 64 byte blocks
        and     a1, 64-1, a1            // a1 = residual bytes to fill

30:
        stq     a2, 0(a0)               // store 64 bytes
        stq     a2, 8(a0)               //
        stq     a2, 16(a0)              //
        stq     a2, 24(a0)              //
        stq     a2, 32(a0)              //
        stq     a2, 40(a0)              //
        stq     a2, 48(a0)              //
        stq     a2, 56(a0)              //

        subq    t0, 1, t0               // decrement blocks remaining
        addq    a0, 64, a0              // increment destination pointer
        bne     t0, 30b                 // more blocks to write

//
// Fill aligned quadwords
//

40:
        srl     a1, 3, t0               // t0 = number of quadwords
        bne     t0, 55f                 // if ne quadwords left to fill
        br      zero, 60f               // if eq no quadwords left

55:
        and     a1, 8-1, a1             // a1 = residual bytes to fill

50:
        stq     a2, 0(a0)               // store quadword
        subq    t0, 1, t0               // decrement quadwords remaining
        addq    a0, 8, a0               // next quadword
        bne     t0, 50b                 // more quadwords to write

//
// Fill bytes for last quadword (interlocked merge so destination bytes
// past the fill are preserved with byte granularity)
//

60:
        beq     a1, 80f                 // if eq no more bytes to fill

        mov     a1, t0                  // t0 = number of bytes to fill
        mov     -1, t1                  // t1 = bit mask
        sll     t0, 3, t0               // # of bytes to # of bits
        srl     t1, t0, t1              // clear t0 bits
        sll     t1, t0, t0              // t0 = mask of bytes NOT filled
        bic     a2, t0, t1              // clear fill bytes not copied
        not     t0, t0                  // complement to clear destination
retry6:
        ldq_l   t2, 0(a0)               // get last destination quadword locked
        bic     t2, t0, t2              // clear bytes to be filled
        bis     t2, t1, t2              // merge in fill bytes
        stq_c   t2, 0(a0)               // store merged quadword conditional
        beq     t2, retry6f             // if eq, retry failed interlock

//
// Finish up
//

80:
        ret     zero, (ra)              // return

//
// Out of line branches for failed store conditional.
// Don't need to restore anything, just try again.
//

retry5f:
        br      retry5
retry6f:
        br      retry6

        .end    RtlZeroBytes
|