// TITLE("Compare, Move, Zero, and Fill Memory Support") //++ // // Copyright (c) 1992 Digital Equipment Corporation // // Module Name: // // mvmem.s // // Abstract: // // This module implements functions to compare, move, zero, and fill // blocks of memory. If the memory is aligned, then these functions // are very efficient. // // N.B. These routines MUST preserve all floating state since they are // frequently called from interrupt service routines that normally // do not save or restore floating state. // // Author: // // Joe Notarangelo 21-May-1992 // // Environment: // // User or Kernel mode. // // Revision History: // // Monty VanderBilt 14-Feb-1996 Avoid memory loads and branch takens between // load lock and store conditional instructions // to conform with all alpha architecture rules. // Monty VanderBilt 27-Feb-1996 Added RtlZeroBytes and RtlFillBytes to support // byte granularity access when necessary. //-- #include "ksalpha.h" SBTTL("Compare Memory") //++ // // ULONG // RtlCompareMemory ( // IN PVOID Source1, // IN PVOID Source2, // IN ULONG Length // ) // // Routine Description: // // This function compares two blocks of memory and returns the number // of bytes that compared equal. // // Arguments: // // Source1 (a0) - Supplies a pointer to the first block of memory to // compare. // // Source2 (a1) - Supplies a pointer to the second block of memory to // compare. // // Length (a2) - Supplies the length, in bytes, of the memory to be // compared. // // Return Value: // // The number of bytes that compared equal is returned as the function // value. If all bytes compared equal, then the length of the orginal // block of memory is returned. // //-- LEAF_ENTRY(RtlCompareMemory) bis a2, zero, v0 // save length of comparison beq a2, 90f // (JAE) quit if nothing to compare xor a0, a1, t0 // check for compatible alignment and t0, 0x7, t0 // low bits only bne t0, CompareUnaligned // if ne, incompatible alignment // // Compare memory aligned // CompareAligned: // // // compare memory until sources are aligned // and a0, 0x7, t0 // get low bits bne t0, 10f // if ne, sources not aligned yet br zero, 30f // already aligned, predicted 10: ldq_u t1, 0(a0) // get unaligned quad at source 1 ldq_u t2, 0(a1) // get unaligned quad at source 2 20: extbl t1, t0, t4 // byte at t0 in source 1 quad extbl t2, t0, t5 // byte at t0 in source 2 quad xor t4, t5, t3 // t1 = t2 ? bne t3, 110f // not equal, miscompare subq a2, 1, a2 // decrement bytes to compare beq a2, 90f // if eq, compare success addq t0, 1, t0 // increment pointer within quad cmpeq t0, 8, t3 // t0 = 8?, if so first quadword done beq t3, 20b // continue while t0 < 8 addq a0, 8, a0 // increment to next quadword addq a1, 8, a1 // increment source 2 to next also bic a0, 7, a0 // align source 1 quadword bic a1, 7, a1 // align source 2 quadword // // aligned block compare, compare blocks of 64 bytes // 30: srl a2, 6, t0 // t0 = number of 64 byte blocks beq t0, 50f // if eq, no 64 byte blocks // // N.B. loads from each of the sources were separated in case these // blocks are fighting for the cache // .set noat 40: ldq t1, 0(a0) // t1 = source 1, quad 0 ldq t2, 8(a0) // t2 = source 1, quad 1 ldq t3, 16(a0) // t3 = source 1, quad 2 addq a1, 64, a1 // increment source 2 pointer ldq t4, 24(a0) // t4 = source 1, quad 3 ldq t5, -64(a1) // t5 = source 2, quad 0 ldq a4, -56(a1) // a4 = source 2, quad 1 ldq a5, -48(a1) // a5 = source 2, quad 2 xor t1, t5, $at // quad 0 match? 
bne $at, 200f // if ne[false], miscompare ldq t5, -40(a1) // t5 = source 2, quad 3 ldq t1, 32(a0) // t1 = source 1, quad 4 xor t2, a4, $at // quad 1 match? bne $at, 122f // if ne[false], miscompare ldq t2, 40(a0) // t2 = source 1, quad 5 xor t3, a5, $at // quad 2 match? bne $at, 124f // if ne[false], miscompare ldq t3, 48(a0) // t3 = source 1, quad 6 xor t4, t5, $at // quad 3 match? bne $at, 126f // if ne[false], miscompare ldq t4, 56(a0) // t4 = source 1, quad 7 ldq t5, -32(a1) // t5 = source 2, quad 4 addq a0, 64, a0 // increment source 1 pointer ldq a4, -24(a1) // a4 = source 2, quad 5 subq t0, 1, t0 // decrement blocks to compare ldq a5, -16(a1) // a5 = source 2, quad 6 xor t1, t5, $at // quad 4 match? bne $at, 130f // if ne[false], miscompare ldq t5, -8(a1) // t5 = source 2, quad 7 xor t2, a4, $at // quad 5 match? bne $at, 132f // if ne[false], miscompare xor t3, a5, $at // quad 6 match? bne $at, 134f // if ne[false], miscompare xor t4, t5, $at // quad 7 match? bne $at, 136f // if ne[false], miscompare subq a2, 64, a2 // decrement bytes to compare bne t0, 40b // if ne, more blocks to compare .set at // // Compare quadwords // 50: srl a2, 3, t0 // t0 = number of quadwords to compare beq t0, 70f // if eq, no quadwords to compare .set noat 60: ldq t1, 0(a0) // t1 = quad from source 1 lda a0, 8(a0) // increment source 1 pointer ldq t2, 0(a1) // t2 = quad from source 2 lda a1, 8(a1) // increment source 2 pointer xor t1, t2, $at // are quadwords equal? bne $at, 200f // if ne, miscompare subq t0, 1, t0 // decrement quads to compare subq a2, 8, a2 // decrement bytes to compare bne t0, 60b // if ne, more quads to compare .set at // // Compare bytes in last quadword // // a2 = number of bytes to compare, less than 8, greater than zero // a0, a1, quad-aligned to last quadword beq a2, 80f // if eq, all bytes compared .set noat 70: ldq t1, 0(a0) // t1 = quad at source 1 ldq t2, 0(a1) // t2 = quad at source 2 bis zero, 0xff, t0 // zap mask sll t0, a2, t0 // zap t1, t0, t1 // zero bytes not compared zap t2, t0, t2 // same for source 2 xor t1, t2, $at // compare quadwords bne $at, 200f // if ne, miscompare .set at // // Successful compare // v0 already contains full length // 80: ret zero, (ra) // return // // Sources have incompatible alignment // CompareUnaligned: // // Compare until source 1 (a0) is aligned // and a0, 0x7, t0 // get byte position of pointer beq t0, 30f // if eq, already aligned ldq_u t1, 0(a0) // get unaligned quad at a0 10: ldq_u t2, 0(a1) // get unaligned quad at a1 extbl t1, t0, t4 // get byte to compare from source 1 extbl t2, a1, t2 // get byte to compare from source 2 xor t4, t2, t3 // do bytes match? bne t3, 110f // if ne, miscompare subq a2, 1, a2 // decrement bytes to compare beq a2, 90f // (JAE) quit if nothing left to compare addq t0, 1, t0 // increment byte within source 1 addq a1, 1, a1 // increment source 2 pointer cmpeq t0, 8, t3 // finished with source 1 quad? 
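//
// N.B. Illustrative C sketch, not assembled: the aligned path's last-quadword
//      compare above masks off the bytes beyond the remaining Length before
//      testing for equality. With a little-endian load, byte i of memory sits
//      in bits [8i+7:8i], so keeping only the low n bytes of each quadword
//      compares exactly the n remaining bytes. The helper name and the use of
//      uint64_t are assumptions.
//
//      #include <stdint.h>
//
//      static int
//      LowBytesEqual (uint64_t q1, uint64_t q2, unsigned n)    // 0 < n < 8
//      {
//          uint64_t mask = (1ULL << (n * 8)) - 1;              // keep low n bytes
//          return ((q1 ^ q2) & mask) == 0;
//      }
//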
beq t3, 10b // if eq[false], more to compare addq a0, 7, a0 // point to next source 1 quad bic a0, 7, a0 // align to quadword // // Compare 64-byte blocks // 30: srl a2, 6, t0 // t0 = number of blocks to compare beq t0, 50f // if eq, no blocks to move ldq_u t1, 0(a1) // get source 2 unaligned quad 1 .set noat 40: ldq_u t2, 7(a1) // get source 2 unaligned quad 2 addq a0, 64, a0 // increment source 1 pointer ldq_u t3, 15(a1) // get source 2 unaligned quad 3 extql t1, a1, t1 // bytes from unaligned quad 1 extqh t2, a1, $at // bytes from unaligned quad 2 ldq_u t4, 23(a1) // get source 2 unaligned quad 4 bis t1, $at, t1 // t1 = quadword 1 (source 2) ldq_u t5, 31(a1) // get source 2 unaligned quad 5 extql t2, a1, t2 // bytes from unaligned quad 2 extqh t3, a1, $at // bytes from unaligned quad 3 ldq a3, -64(a0) // a3 = quadword 1 (source 1) bis t2, $at, t2 // t2 = quadword 2 (source 2) ldq a4, -56(a0) // a4 = quadword 2 (source 1) extql t3, a1, t3 // bytes from unaligned quad 3 extqh t4, a1, $at // bytes from unaligned quad 4 ldq a5, -48(a0) // a5 = quadword 3 (source 1) bis t3, $at, t3 // t3 = quadword 3 (source 2) extql t4, a1, t4 // bytes from unaligned quad 4 extqh t5, a1, $at // bytes from unaligned quad 5 subq t0, 1, t0 // decrement blocks to compare bis t4, $at, t4 // t4 = quadword 4 (source 2) xor t1, a3, $at // match on quadword 1? ldq a3, -40(a0) // a3 = quadword 4 (source 1) bne $at, 200f // if ne, miscompare quad 1 xor t2, a4, $at // match on quadword 2? ldq_u t2, 39(a1) // get source 2 unaligned quad 6 bne $at, 122f // if ne, miscompare quad 2 xor t3, a5, $at // match on quadword 3? ldq_u t3, 47(a1) // get source 2 unaligned quad 7 bne $at, 124f // if ne, miscompare quad 3 xor t4, a3, $at // match on quadword 4? ldq_u t4, 55(a1) // get source 2 unaligned quad 8 bne $at, 126f // if ne, miscompare quad 4 ldq_u t1, 63(a1) // get source 2 unaligned quad 9 ldq a3, -32(a0) // a3 = quadword 5 (source 1) extql t5, a1, t5 // bytes from unaligned quad 5 extqh t2, a1, $at // bytes from unaligned quad 6 ldq a4, -24(a0) // a4 = quadword 6 (source 1) ldq a5, -16(a0) // a5 = quadword 7 (source 1) bis t5, $at, t5 // t5 = quadword 5 (source 2) xor t5, a3, $at // match on quadword 5? ldq a3, -8(a0) // a3 = quadword 8 (source 1) bne $at, 130f // if ne, miscompare quad 5 extql t2, a1, t2 // bytes from unaligned quad 6 extqh t3, a1, $at // bytes from unaligned quad 7 extql t3, a1, t3 // bytes from unaligned quad 7 bis t2, $at, t2 // t2 = quadword 6 (source 2) xor t2, a4, $at // match on quadword 6? bne $at, 132f // if ne, miscompare quad 6 extqh t4, a1, $at // bytes from unaligned quad 8 extql t4, a1, t4 // bytes from unaligned quad 8 bis t3, $at, t3 // t3 = quadword 7 (source 2) xor t3, a5, $at // match on quadword 7? bne $at, 134f // if ne, miscompare quad 7 extqh t1, a1, $at // bytes from unaligned quad 9 addq a1, 64, a1 // increment source 2 pointer bis t4, $at, t4 // t4 = quadword 8 (source 2) xor t4, a3, $at // match on quadword 8? 
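//
// N.B. Illustrative C sketch, not assembled: the extql/extqh pairs above
//      assemble one aligned quadword from the two quadwords that straddle an
//      unaligned address, roughly the shift-and-merge below. Function and
//      variable names are assumptions; the already-aligned case is handled
//      separately because a 64-bit shift by 64 is undefined in C.
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static uint64_t
//      LoadUnalignedQuad (const unsigned char *p)
//      {
//          uint64_t lo, hi;
//          unsigned s = (uintptr_t)p & 7;                  // byte offset within quad
//          const unsigned char *base = p - s;              // aligned base address
//
//          memcpy(&lo, base, 8);                           // quad containing p
//          if (s == 0) {
//              return lo;                                  // already aligned
//          }
//          memcpy(&hi, base + 8, 8);                       // next quad
//          return (lo >> (8 * s)) | (hi << (8 * (8 - s))); // little-endian merge
//      }
//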
bne $at, 136f // if ne, miscompare quad 8 subq a2, 64, a2 // decrement number of bytes to compare bne t0, 40b // if ne, more blocks to compare .set at // // Compare quadwords // 50: srl a2, 3, t0 // t0 = number of quads to compare beq t0, 70f // if eq, no quads to compare ldq_u t1, 0(a1) // get unaligned quad 1 (source 2) .set noat 60: ldq_u t2, 7(a1) // get unaligned quad 2 (source 2) ldq t3, 0(a0) // t3 = quadword 1 (source 1) extql t1, a1, t1 // get bytes from unaligned quad 1 extqh t2, a1, $at // get bytes from unaligned quad 2 addq a1, 8, a1 // increment source 2 pointer bis t1, $at, t1 // t1 = quadword 1 (source 2) xor t1, t3, $at // match on quadword? bne $at, 200f // if ne, miscompare subq t0, 1, t0 // decrement quadwords to compare addq a0, 8, a0 // increment source 1 pointer subq a2, 8, a2 // decrement bytes to compare bis t2, zero, t1 // save low quadword for next loop bne t0, 60b // if ne, more quads to compare .set at // // Compare bytes for final quadword // 70: beq a2, 90f // if eq, comparison complete ldq t1, 0(a0) // get quadword from source 1 bis zero, zero, t0 // t0 = byte position to compare .set noat 80: ldq_u t2, 0(a1) // get unaligned quad from source 2 extbl t1, t0, t3 // t3 = byte from source 1 extbl t2, a1, t2 // t2 = byte from source 2 xor t3, t2, $at // match on byte? bne $at, 100f // if ne, miscompare on byte addq t0, 1, t0 // increment byte position addq a1, 1, a1 // increment source 2 pointer subq a2, 1, a2 // decrement bytes to compare bne a2, 80b // if ne, more bytes to compare .set at // // Successful full comparison // 90: ret zero, (ra) // return, v0 already set // // Miscompare on last quadword // 100: subq v0, a2, v0 // subtract bytes not compared ret zero, (ra) // return // // Miscompare on first quadword, unaligned case // // v0 = total bytes to compare // a2 = bytes remaining to compare // 110: subq v0, a2, v0 // bytes compared successfully ret zero, (ra) // return // // Miscompare on 64-byte block compare // 122: subq a2, 8, a2 // miscompare on quad 2 br zero, 200f // finish in common code 124: subq a2, 16, a2 // miscompare on quad 3 br zero, 200f // finish in common code 126: subq a2, 24, a2 // miscompare on quad 4 br zero, 200f // finish in common code 130: subq a2, 32, a2 // miscompare on quad 5 br zero, 200f // finish in common code 132: subq a2, 40, a2 // miscompare on quad 6 br zero, 200f // finish in common code 134: subq a2, 48, a2 // miscompare on quad 7 br zero, 200f // finish in common code 136: subq a2, 56, a2 // miscompare on quad 8 br zero, 200f // finish in common code // // Miscompare, determine number of bytes that successfully compared // $at = xor of relevant quads from sources, must be non-zero // a2 = number of bytes left to compare // .set noat 200: cmpbge zero, $at, $at // $at = mask of non-zero bytes // // look for the first bit cleared in $at, this is the // number of the first byte which differed // bis zero, zero, t0 // bit position to look for clear 210: blbc $at, 220f // if low clear, found difference srl $at, 1, $at // check next bit addq t0, 1, t0 // count bit position checked br zero, 210b 220: subq v0, a2, v0 // subtract bytes yet to compare addq v0, t0, v0 // add bytes that matched on last quad ret zero, (ra) .set at .end RtlCompareMemory SBTTL("Move Memory") //++ // // VOID // RtlMoveMemory ( // IN PVOID Destination, // IN PVOID Source, // IN ULONG Length // ) // // Routine Description: // // This function moves memory either forward or backward, aligned or // unaligned, in 64-byte blocks, followed by 8-byte 
blocks, followed // by any remaining bytes. // // Arguments: // // Destination (a0) - Supplies a pointer to the destination address of // the move operation. // // Source (a1) - Supplies a pointer to the source address of the move // operation. // // Length (a2) - Supplies the length, in bytes, of the memory to be moved. // // Return Value: // // None. // //-- LEAF_ENTRY(RtlMoveMemory) beq a2, 80f // if eq, no bytes to move // // If the source address is less than the destination address and source // address plus the length of the move is greater than the destination // address, then the source and destination overlap such that the move // must be performed backwards. // cmpult a0, a1, t0 // is destination less than source bne t0, MoveForward // if eq [true] no overlap possible addq a1, a2, t0 // compute source ending address cmpult t0, a0, t1 // is source end less than dest. beq t1, MoveBackward // if eq [false], overlap // // Move memory forward aligned and unaligned. // MoveForward: // xor a0, a1, t0 // compare alignment bits and t0, 0x7, t0 // isloate alignment comparison bne t0, MoveForwardUnaligned // if ne, incompatible alignment // // Move memory forward aligned. // MoveForwardAligned: // // // Move bytes until source and destination are quadword aligned // and a0, 0x7, t0 // t0 = unaligned bits bne t0, 5f // if ne, not quad aligned br zero, 20f // predicted taken 5: ldq_u t2, 0(a0) // get unaligned quad from dest. ldq_u t1, 0(a1) // get unaligned quadword from source 10: beq a2, 15f // if eq, all bytes moved extbl t1, t0, t3 // t3 = byte from source insbl t3, t0, t3 // t3 = byte from source, in position mskbl t2, t0, t2 // clear position in dest. quad bis t2, t3, t2 // merge in byte from source subq a2, 1, a2 // decrement bytes to move addq t0, 1, t0 // increment byte within quad cmpeq t0, 8, t3 // finished the quadword? beq t3, 10b // if eq [false], do next byte 15: stq_u t2, 0(a0) // store merged destination bytes addq a0, 7, a0 // move to next quadword bic a0, 7, a0 // aligned quadword addq a1, 7, a1 // move to next quadword bic a1, 7, a1 // aligned quadword // // Check for 64-byte block moves // 20: srl a2, 6, t0 // t0 = number of 64 byte blocks beq t0, 40f // if eq no blocks to move and a2, 64-1, a2 // a2 = residual bytes 30: ldq t1, 0(a1) // load 64 bytes from source addq a0, 64, a0 // increment destination pointer ldq v0, 56(a1) // ldq a3, 32(a1) // stq t1, -64(a0) // write to destination ldq t2, 8(a1) // into volatile registers ldq t3, 16(a1) // ldq t4, 24(a1) // subq t0, 1, t0 // decrement number of blocks stq t2, -56(a0) // ldq a4, 40(a1) // stq t3, -48(a0) // ldq a5, 48(a1) // stq t4, -40(a0) // addq a1, 64, a1 // increment source pointer stq a3, -32(a0) // stq a4, -24(a0) // stq a5, -16(a0) // stq v0, -8(a0) // bne t0, 30b // if ne, more blocks to copy // // Copy quadwords // 40: srl a2, 3, t0 // t0 = number of quadwords to move beq t0, 60f // if eq no quadwords to move and a2, 8-1, a2 // a2 = residual bytes 50: ldq t1, 0(a1) // load quadword from source addq a1, 8, a1 // increment source pointer stq t1, 0(a0) // store quadword to destination addq a0, 8, a0 // increment destination pointer subq t0, 1, t0 // decrement number of quadwords bne t0, 50b // if ne, more quadwords to move // // Move final residual bytes // 60: beq a2, 80f // if eq, no more bytes to move ldq t1, 0(a1) // get last source quadword ldq t2, 0(a0) // get last dest. 
quadword bis zero, zero, t0 // t0 = next byte number to move 70: extbl t1, t0, t3 // extract byte from source insbl t3, t0, t3 // t3 = source byte, in position mskbl t2, t0, t2 // clear byte position for dest. bis t2, t3, t2 // merge in source byte addq t0, 1, t0 // increment byte position subq a2, 1, a2 // decrement bytes to move bne a2, 70b // if ne => more bytes to move stq t2, 0(a0) // store merged data // // Finish aligned MoveForward // 80: ret zero, (ra) // return // // Move memory forward unaligned. // MoveForwardUnaligned: // // // Move bytes until the destination is aligned // and a0, 0x7, t0 // t0 = unaligned bits beq t0, 100f // if eq, destination quad aligned ldq_u t2, 0(a0) // get unaligned quad from dest 90: beq a2, 95f // if eq no more bytes to move ldq_u t1, 0(a1) // get unaligned quad from source extbl t1, a1, t1 // extract source byte insbl t1, t0, t1 // t1 = source byte, in position mskbl t2, t0, t2 // clear byte position in dest. bis t2, t1, t2 // merge in source byte addq t0, 1, t0 // increment byte position addq a1, 1, a1 // increment source pointer subq a2, 1, a2 // decrement bytes to move cmpeq t0, 8, t3 // t0 = 8? => quad finished beq t3, 90b // if eq [false], more bytes to move 95: stq_u t2, 0(a0) // store merged quadword addq a0, 7, a0 // increment to next quad bic a0, 7, a0 // align next quadword // // Check for 64-byte blocks to move // 100: srl a2, 6, t0 // t0 = number of blocks to move beq t0, 120f // if eq no blocks to move and a2, 64-1, a2 // a2 = residual bytes to move ldq_u t1, 0(a1) // t1 = first unaligned quad 110: // get source data and merge it // as we go ldq_u t2, 7(a1) // t2 = second unaligned quad extql t1, a1, t1 // extract applicable bytes from t1 extqh t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quad #1 ldq_u t3, 15(a1) // t3 = third unaligned quad extql t2, a1, t2 // extract applicable bytes from t2 extqh t3, a1, v0 // extract applicable bytes from t3 stq t1, 0(a0) // store quad #1 bis t2, v0, t2 // t2 = quad #2 ldq_u t4, 23(a1) // t4 = fourth unaligned quad extql t3, a1, t3 // extract applicable bytes from t3 extqh t4, a1, v0 // extract applicable bytes from t4 stq t2, 8(a0) // store quad #2 bis t3, v0, t3 // t3 = quad #3 ldq_u t5, 31(a1) // t5 = fifth unaligned quad extql t4, a1, t4 // extract applicable bytes from t4 extqh t5, a1, v0 // extract applicable bytes from t5 stq t3, 16(a0) // store quad #3 bis t4, v0, t4 // t4 = quad #4 ldq_u a3, 39(a1) // a3 = sixth unaligned quad extql t5, a1, t5 // extract applicable bytes from t5 extqh a3, a1, v0 // extract applicable bytes from a3 stq t4, 24(a0) // store quad #4 bis t5, v0, t5 // t5 = quad #5 ldq_u a4, 47(a1) // a4 = seventh unaligned quad extql a3, a1, a3 // extract applicable bytes from a3 extqh a4, a1, v0 // extract applicable bytes from a4 stq t5, 32(a0) // store quad #5 bis a3, v0, a3 // a3 = quad #6 ldq_u a5, 55(a1) // a5 = eighth unaligned quad extql a4, a1, a4 // extract applicable bytes from a4 extqh a5, a1, v0 // extract applicable bytes from a5 stq a3, 40(a0) // store quad #6 bis a4, v0, a4 // a4 = quad #7 ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next extql a5, a1, a5 // extract applicable bytes from a5 extqh t1, a1, v0 // extract applicable bytes from t1 stq a4, 48(a0) // store quad #7 bis a5, v0, a5 // a5 = quad #8 addq a1, 64, a1 // increment source pointer stq a5, 56(a0) // store quad #8 addq a0, 64, a0 // increment destination pointer subq t0, 1, t0 // decrement number of blocks bne t0, 110b // if ne, more blocks to move // // Move 
unaligned source quads to aligned destination quads // 120: srl a2, 3, t0 // t0 = number of quads to move beq t0, 140f // if eq no quads to move and a2, 8-1, a2 // a2 = residual bytes ldq_u t1, 0(a1) // t1 = first unaligned quad 130: ldq_u t2, 7(a1) // t2 = second unaligned quad addq a0, 8, a0 // increment destination pointer extql t1, a1, t1 // extract applicable bytes from t1 extqh t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quadword of data stq t1, -8(a0) // store data to destination addq a1, 8, a1 // increment source pointer subq t0, 1, t0 // decrement quads to move bis t2, zero, t1 // t1 = first of next unaligned pair bne t0, 130b // if ne, more quads to move // // Move remaining bytes to final quadword // 140: beq a2, 160f // if eq no more bytes to move ldq t2, 0(a0) // t2 = destination quadword bis zero, zero, t3 // t3 = position for next insertion 150: ldq_u t1, 0(a1) // get unaligned source quad extbl t1, a1, t1 // t1 = source byte insbl t1, t3, t1 // t1 = source byte, in position mskbl t2, t3, t2 // clear byte in destination bis t2, t1, t2 // merge in source byte addq a1, 1, a1 // increment source pointer subq a2, 1, a2 // decrement bytes to move addq t3, 1, t3 // increment destination position bne a2, 150b // more bytes to move stq t2, 0(a0) // store merged data // // Finish unaligned MoveForward // 160: ret zero, (ra) // return // // Move memory backward. // MoveBackward: // addq a0, a2, a0 // compute ending destination address addq a1, a2, a1 // compute ending source address subq a0, 1, a0 // point to last destination byte subq a1, 1, a1 // point to last source byte xor a0, a1, t0 // compare alignment bits and t0, 0x7, t0 // isolate alignment comparison bne t0, MoveBackwardUnaligned // if ne, incompatible alignment // // Move memory backward aligned. // MoveBackwardAligned: // // // Move bytes until source and destination are quadword aligned // and a0, 0x7, t0 // t0 = unaligned bits cmpeq t0, 7, t1 // last byte position 7? beq t1, 5f // if eq [false], not quad aligned subq a0, 7, a0 // point to beginning of last quad subq a1, 7, a1 // point to beginning of last quad br zero, 30f // predicted taken 5: ldq_u t1, 0(a0) // get unaligned quad from dest. ldq_u t2, 0(a1) // get unaligned quad from source 10: beq a2, 20f // if eq, all bytes moved extbl t2, t0, t3 // t3 = byte from source insbl t3, t0, t3 // t3 = byte from source, in position mskbl t1, t0, t1 // clear position in destination bis t1, t3, t1 // merge in byte from source subq a2, 1, a2 // decrement bytes to move subq t0, 1, t0 // decrement byte within quadword cmplt t0, zero, t3 // finished the quadword? 
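//
// N.B. Illustrative C sketch, not assembled: the forward/backward decision
//      made at the top of RtlMoveMemory follows the usual memmove rule - copy
//      backward only when the source starts below the destination and the two
//      ranges overlap; otherwise a forward copy is safe. The helper name is an
//      assumption; addresses are treated as integers to keep the sketch simple.
//
//      #include <stdint.h>
//
//      static int
//      MustCopyBackward (uintptr_t Destination, uintptr_t Source, uintptr_t Length)
//      {
//          return (Source < Destination) && (Source + Length > Destination);
//      }
//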
beq t3, 10b // if eq [false], do next byte 20: stq_u t1, 0(a0) // store merged destination bytes subq a0, 8, a0 // move to previous quadword bic a0, 7, a0 // aligned quadword subq a1, 8, a1 // move to previous quadword bic a1, 7, a1 // aligned quadword // // Check for 64-byte block moves // 30: srl a2, 6, t0 // t0 = number of 64 byte blocks beq t0, 50f // if eq, no blocks to move and a2, 64-1, a2 // a2 = residual bytes 40: ldq t1, 0(a1) // load 64 bytes from source into subq a0, 64, a0 // decrement destination pointer ldq v0, -56(a1) // ldq a3, -32(a1) // stq t1, 64(a0) // write to destination ldq t2, -8(a1) // into volatile registers ldq a5, -48(a1) // ldq a4, -40(a1) // stq t2, 56(a0) // ldq t3, -16(a1) // ldq t4, -24(a1) // subq a1, 64, a1 // decrement source pointer stq t3, 48(a0) // stq t4, 40(a0) // stq a3, 32(a0) // subq t0, 1, t0 // decrement number of blocks stq a4, 24(a0) // stq a5, 16(a0) // stq v0, 8(a0) // bne t0, 40b // if ne, more blocks to copy // // Copy quadwords // 50: srl a2, 3, t0 // t0 = number of quadwords to move beq t0, 70f // if eq no quadwords to move and a2, 8-1, a2 // a2 = residual bytes 60: ldq t1, 0(a1) // load quadword from source subq a1, 8, a1 // decrement source pointer stq t1, 0(a0) // store quadword to destination subq a0, 8, a0 // decrement destination pointer subq t0, 1, t0 // decrement quadwords to move bne t0, 60b // if ne, more quadwords to move // // Move final residual bytes // 70: beq a2, 90f // if eq, no more bytes to move ldq t1, 0(a1) // get last source quadword ldq t2, 0(a0) // get last destination quadword bis zero, 7, t0 // t0 = next byte number to move 80: extbl t1, t0, t3 // extract byte from source insbl t3, t0, t3 // t3 = source byte, in position mskbl t2, t0, t2 // clear byte position for dest. bis t2, t3, t2 // merge in source byte subq t0, 1, t0 // decrement byte position subq a2, 1, a2 // decrement bytes to move bne a2, 80b // if ne, more bytes to move stq t2, 0(a0) // write destination data // // Finish aligned MoveBackward // 90: ret zero, (ra) // return // // Move memory backward unaligned. // MoveBackwardUnaligned: // // // Move bytes until the destination is aligned // and a0, 0x7, t0 // t0 = unaligned bits cmpeq t0, 7, t1 // last byte of a quadword beq t1, 95f // if eq[false], not aligned subq a0, 7, a0 // align pointer to beginning of quad br zero, 120f // 95: ldq_u t2, 0(a0) // get unaligned quad from dest. 100: beq a2, 110f // if eq, no more bytes to move ldq_u t1, 0(a1) // get unaligned quad from source extbl t1, a1, t1 // extract source byte insbl t1, t0, t1 // t1 = source byte in position mskbl t2, t0, t2 // clear byte position in dest. bis t2, t1, t2 // merge source byte subq t0, 1, t0 // decrement byte position subq a1, 1, a1 // decrement source pointer subq a2, 1, a2 // decrement number of bytes to move cmplt t0, zero, t3 // t0 < 0? => quad finished beq t3, 100b // if eq [false], more bytes to move 110: stq_u t2, 0(a0) // store merged quadword subq a0, 8, a0 // decrement dest. 
to previous quad bic a0, 7, a0 // align previous quadword // // Check for 64-byte blocks to move // 120: srl a2, 6, t0 // t0 = number of blocks to move subq a1, 7, a1 // point to beginning of last quad beq t0, 140f // if eq no blocks to move and a2, 64-1, a2 // a2 = residual bytes to move ldq_u t1, 7(a1) // t1 = first unaligned quad 130: // get source data and merge it // as we go ldq_u t2, 0(a1) // t2 = second unaligned quad extqh t1, a1, t1 // extract applicable bytes from t1 extql t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quad #1 ldq_u t3, -8(a1) // t3 = third unaligned quad extqh t2, a1, t2 // extract applicable bytes from t2 extql t3, a1, v0 // extract applicable bytes from t3 stq t1, 0(a0) // store quad #1 bis t2, v0, t2 // t2 = quad #2 ldq_u t4, -16(a1) // t4 = fourth unaligned quad extqh t3, a1, t3 // extract applicable bytes from t3 extql t4, a1, v0 // extract applicable bytes from t4 stq t2, -8(a0) // store quad #2 bis t3, v0, t3 // t3 = quad #3 ldq_u t5, -24(a1) // t5 = fifth unaligned quad extqh t4, a1, t4 // extract applicable bytes from t4 extql t5, a1, v0 // extract applicable bytes from t5 stq t3, -16(a0) // store quad #3 bis t4, v0, t4 // t4 = quad #4 ldq_u a3, -32(a1) // a3 = sixth unaligned quad extqh t5, a1, t5 // extract applicable bytes from t5 extql a3, a1, v0 // extract applicable bytes from a3 stq t4, -24(a0) // store quad #4 bis t5, v0, t5 // t5 = quad #5 ldq_u a4, -40(a1) // a4 = seventh unaligned quad extqh a3, a1, a3 // extract applicable bytes from a3 extql a4, a1, v0 // extract applicable bytes from a4 stq t5, -32(a0) // store quad #5 bis a3, v0, a3 // a3 = quad #6 ldq_u a5, -48(a1) // a5 = eighth unaligned quad extqh a4, a1, a4 // extract applicable bytes from a4 extql a5, a1, v0 // extract applicable bytes from a5 stq a3, -40(a0) // store quad #6 bis a4, v0, a4 // a4 = quad #7 ldq_u t1, -56(a1) // t1 = ninth unaligned = 1st of next extqh a5, a1, a5 // extract applicable bytes from a5 extql t1, a1, v0 // extract applicable bytes from t1 stq a4, -48(a0) // store quad #7 bis a5, v0, a5 // a5 = quad #8 subq a1, 64, a1 // increment source pointer stq a5, -56(a0) // store quad #8 subq a0, 64, a0 // increment destination pointer subq t0, 1, t0 // decrement number of blocks bne t0, 130b // if ne, more blocks to move // // Move unaligned source quads to aligned destination quads // 140: srl a2, 3, t0 // t0 = number of quads to move beq t0, 160f // if eq no quads to move and a2, 8-1, a2 // a2 = residual bytes ldq_u t1, 7(a1) // t1 = first unaligned quad 150: ldq_u t2, 0(a1) // t2 = second unaligned quad subq a0, 8, a0 // decrement destination pointer extqh t1, a1, t1 // extract applicable bytes from t1 extql t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quadword of data stq t1, 8(a0) // store data to destination subq a1, 8, a1 // decrement source pointer subq t0, 1, t0 // decrement quads to move bis t2, zero, t1 // t1 = first of next unaligned pair bne t0, 150b // if ne, more quads to move // // Move remaining bytes to final quadword // 160: beq a2, 180f // if eq, no more bytes to move ldq t2, 0(a0) // t2 = destination quadword bis zero, 7, t0 // t0 = position for next insertion 170: subq a1, 1, a1 // decrement source pointer ldq_u t1, 8(a1) // get unaligned source quad extbl t1, a1, t1 // t1 = source byte insbl t1, t0, t1 // t1 = source byte, in position mskbl t2, t0, t2 // clear byte position bis t2, t1, t2 // merge in source byte subq t0, 1, t0 // decrement byte position for dest. 
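//
// N.B. Illustrative C sketch, not assembled: the extbl/insbl/mskbl sequences
//      in the residual-byte loops above implement "replace byte i of a
//      quadword", roughly as below. The helper name is an assumption.
//
//      #include <stdint.h>
//
//      static uint64_t
//      ReplaceByte (uint64_t quad, unsigned i, uint8_t value)      // i in 0..7
//      {
//          uint64_t mask = 0xffULL << (8 * i);                     // byte mskbl clears
//          return (quad & ~mask) | ((uint64_t)value << (8 * i));   // insbl + bis merge
//      }
//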
subq a2, 1, a2 // decrement bytes to move bne a2, 170b // if ne, more bytes to move stq t2, 0(a0) // // // Finish unaligned MoveBackward // 180: ret zero, (ra) // return .end RtlMoveMemory SBTTL("Zero Memory") //++ // // VOID // RtlZeroMemory ( // IN PVOID Destination, // IN ULONG Length // ) // // Routine Description: // // This function zeros memory by first aligning the destination address to // a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte // blocks, followed by any remaining bytes. // // Arguments: // // Destination (a0) - Supplies a pointer to the memory to zero. // // Length (a1) - Supplies the length, in bytes, of the memory to be zeroed. // // Return Value: // // None. // //-- LEAF_ENTRY(RtlZeroMemory) bis zero, zero, a2 // set fill pattern br zero, RtlpFillMemory // SBTTL("Fill Memory") //++ // // VOID // RtlFillMemory ( // IN PVOID Destination, // IN ULONG Length, // IN UCHAR Fill // ) // // Routine Description: // // This function fills memory by first aligning the destination address to // a longword boundary, and then filling 32-byte blocks, followed by 4-byte // blocks, followed by any remaining bytes. // // Arguments: // // Destination (a0) - Supplies a pointer to the memory to fill. // // Length (a1) - Supplies the length, in bytes, of the memory to be filled. // // Fill (a2) - Supplies the fill byte. // // N.B. The alternate entry memset expects the length and fill arguments // to be reversed. It also returns the Destination pointer // // Return Value: // // None. // //-- ALTERNATE_ENTRY(memset) bis a0, zero, v0 // set return value bis a1, zero, a3 // swap length and fill arguments bis a2, zero, a1 // bis a3, zero, a2 // ALTERNATE_ENTRY(RtlFillMemory) and a2, 0xff, a2 // clear excess bits sll a2, 8, t0 // duplicate fill byte bis a2, t0, a2 // generate fill word sll a2, 16, t0 // duplicate fill word bis a2, t0, a2 // generate fill longword sll a2, 32, t0 // duplicate fill longword bis a2, t0, a2 // generate fill quadword .align 3 // ensure quadword aligned target // // Fill memory with the pattern specified in register a2. // RtlpFillMemory: // // // Align destination to quadword // beq a1, 80f // anything to fill? (paranoia) and a0, 8-1, t0 // t0 = unaligned bits bne t0, 5f // if ne, then not quad aligned br zero, 20f // if eq, then quad aligned 5: ldq_u t1, 0(a0) // get unaligned quadword // for first group of bytes 10: beq a1, 15f // if eq no more bytes to fill insbl a2, t0, t2 // get fill byte into position mskbl t1, t0, t1 // clear byte for fill bis t1, t2, t1 // put in fill byte addq t0, 1, t0 // increment to next byte position subq a1, 1, a1 // decrement bytes to fill cmpeq t0, 8, t2 // t0 = 8? 
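//
// N.B. Illustrative C sketch, not assembled: the shift/or sequence at the
//      RtlFillMemory entry above widens the 8-bit fill value into the 64-bit
//      pattern used by the quadword stores, equivalent to the helper below
//      (its name is an assumption).
//
//      #include <stdint.h>
//
//      static uint64_t
//      ReplicateFillByte (uint8_t Fill)
//      {
//          uint64_t pattern = Fill;
//          pattern |= pattern << 8;            // generate fill word
//          pattern |= pattern << 16;           // generate fill longword
//          pattern |= pattern << 32;           // generate fill quadword
//          return pattern;
//      }
//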
beq t2, 10b // if eq [false] more bytes to do 15: stq_u t1, 0(a0) // store modified bytes addq a0, 7, a0 // move a0 to next quadword bic a0, 7, a0 // align a0 to quadword // // Check for 64-byte blocks // 20: srl a1, 6, t0 // t0 = number of 64 byte blocks beq t0, 40f // if eq then no 64 byte blocks and a1, 64-1, a1 // a1 = residual bytes to fill 30: stq a2, 0(a0) // store 64 bytes stq a2, 8(a0) // stq a2, 16(a0) // stq a2, 24(a0) // stq a2, 32(a0) // stq a2, 40(a0) // stq a2, 48(a0) // stq a2, 56(a0) // subq t0, 1, t0 // decrement blocks remaining addq a0, 64, a0 // increment destination pointer bne t0, 30b // more blocks to write // // Fill aligned quadwords // 40: srl a1, 3, t0 // t0 = number of quadwords bne t0, 55f // if ne quadwords left to fill br zero, 60f // if eq no quadwords left 55: and a1, 8-1, a1 // a1 = residual bytes to fill 50: stq a2, 0(a0) // store quadword subq t0, 1, t0 // decrement quadwords remaining addq a0, 8, a0 // next quadword bne t0, 50b // more quadwords to write // // Fill bytes for last quadword // 60: bne a1, 65f // if ne bytes remain to be filled br zero, 80f // if eq no more bytes to fill 65: ldq t1, 0(a0) // get last quadword bis zero, zero, t0 // t0 = byte position to start fill 70: beq a1, 75f // if eq, no more bytes to fill insbl a2, t0, t2 // get fill byte into position mskbl t1, t0, t1 // clear fill byte position bis t1, t2, t1 // insert fill byte addq t0, 1, t0 // increment byte within quad subq a1, 1, a1 // decrement bytes to fill cmpeq t0, 8, t3 // t0 = 8? => finished quad beq t3, 70b // if eq [false] more bytes to fill 75: stq t1, 0(a0) // write merged quadword // // Finish up // 80: ret zero, (ra) // return .end RtlZeroMemory SBTTL("Fill Memory Ulong") //++ // // VOID // RtlFillMemoryUlong ( // IN PVOID Destination, // IN ULONG Length, // IN ULONG Pattern // ) // // Routine Description: // // This function fills memory with the specified longowrd pattern by // filling 64-byte blocks followed by 8-byte blocks and finally // 4-byte blocks. // // N.B. This routine assumes that the destination address is aligned // on a longword boundary and that the length is an even multiple // of longwords. // // Arguments: // // Destination (a0) - Supplies a pointer to the memory to fill. // // Length (a1) - Supplies the length, in bytes, of the memory to be filled. // // Pattern (a2) - Supplies the fill pattern. // // Return Value: // // None. // //-- LEAF_ENTRY(RtlFillMemoryUlong) bic a1, 3, a1 // make sure length is an even number // of longwords sll a2, 32, a3 // a3 = long pattern in upper 32 bits srl a3, 32, t0 // clear upper bits, pattern in lower 32 bis a3, t0, a3 // a3 = quad version of fill pattern // // Make destination address quad-aligned // and a0, 4, t0 // is a0 quad aligned? 
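//
// N.B. Illustrative C sketch, not assembled: RtlFillMemoryUlong widens the
//      32-bit pattern into a quadword (the sll/srl/bis just above) and then
//      fills with quadword stores once the destination is 8-byte aligned.
//      The helper name is an assumption.
//
//      #include <stdint.h>
//
//      static uint64_t
//      ReplicateFillUlong (uint32_t Pattern)
//      {
//          return ((uint64_t)Pattern << 32) | Pattern;     // pattern in both halves
//      }
//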
beq t0, 10f // if eq, then a0 quad aligned stl a2, 0(a0) // fill first longword addq a0, 4, a0 // quad align a0 subq a1, 4, a1 // bytes remaining to store // // Check for 64-byte blocks to fill // 10: srl a1, 6, t0 // t0 = # 64-byte blocks to fill beq t0, 30f // if eq no 64 byte blocks and a1, 64-1, a1 // a1 = residual bytes 20: stq a3, 0(a0) // store 64 bytes stq a3, 8(a0) // stq a3, 16(a0) // stq a3, 24(a0) // stq a3, 32(a0) // stq a3, 40(a0) // stq a3, 48(a0) // stq a3, 56(a0) // subq t0, 1, t0 // t0 = blocks remaining addq a0, 64, a0 // increment address pointer bne t0, 20b // if ne more blocks to fill // // Fill 8 bytes at a time while we can, a1 = bytes remaining // 30: srl a1, 3, t0 // t0 = # quadwords to fill beq t0, 50f // if eq no quadwords left and a1, 8-1, a1 // a1 = residual bytes 40: stq a3, 0(a0) // store quadword subq t0, 1, t0 // t0 = quadwords remaining addq a0, 8, a0 // increment address pointer bne t0, 40b // if ne more quadwords to fill // // Fill last 4 bytes // 50: beq a1, 60f // if eq no longwords remain stl a2, 0(a0) // fill last longword // // Finish up // 60: ret zero, (ra) // return to caller .end RtlFillMemoryUlong SBTTL("Copy Memory With Byte Granularity") //++ // // VOID // RtlCopyBytes ( // IN PVOID Destination, // IN PVOID Source, // IN ULONG Length // ) // // Routine Description: // // This function copies non-overlapping memory, aligned or unaligned, in // 64-byte blocks, followed by 8-byte blocks, followed by any remaining // bytes. Unlike RtlCopyMemory or RtlMoveMemory the copy is done such // that byte granularity is assured for all platforms. // // Arguments: // // Destination (a0) - Supplies a pointer to the destination address of // the move operation. // // Source (a1) - Supplies a pointer to the source address of the move // operation. // // Length (a2) - Supplies the length, in bytes, of the memory to be moved. // // Return Value: // // None. // //-- LEAF_ENTRY(RtlCopyBytes) // // Move memory forward aligned and unaligned. // xor a0, a1, t0 // compare alignment bits and t0, 0x7, t0 // isolate alignment comparison bne t0, CopyForwardUnaligned // if ne, incompatible alignment // // Source and Destination buffers have the same alignment. Move // bytes until done or source and destination are quadword aligned // and a0, 0x7, t0 // t0 = unaligned bits bne t0, 5f // if ne, not quad aligned br zero, 20f // predicted taken 5: bis zero, zero, t1 // t4 = destination byte zap mask bis zero, 1, t2 sll t2, t0, t2 // t2 = next bit to set in zap mask 10: beq a2, 15f // if eq, all bits set bis t1, t2, t1 // set bit in zap mask sll t2, 1, t2 // set next higher bit for zap mask subq a2, 1, a2 // decrement bytes to move addq t0, 1, t0 // increment byte within quad cmpeq t0, 8, t3 // finished the quadword? 
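//
// N.B. Illustrative C sketch, not assembled: the retry1..retry6 sequences
//      below merge bytes into a shared quadword with ldq_l/stq_c so that
//      neighboring bytes owned by other processors are never rewritten with
//      stale values. A rough C11 analogue using a compare-exchange loop is
//      shown; the names and the use of C11 atomics are assumptions, not the
//      mechanism the assembly uses.
//
//      #include <stdatomic.h>
//      #include <stdint.h>
//
//      static void
//      MergeBytes (_Atomic uint64_t *Quad, uint64_t KeepMask, uint64_t NewBytes)
//      {
//          uint64_t old = atomic_load(Quad);
//          uint64_t merged;
//          do {
//              merged = (old & KeepMask) | NewBytes;       // replace only our bytes
//          } while (!atomic_compare_exchange_weak(Quad, &old, merged));
//      }
//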
beq t3, 10b // if eq [false], do next byte 15: ldq_u t2, 0(a1) // get unaligned quadword from source zapnot t2, t1, t2 // clear source bytes bic a0, 7, a3 // a3 = quadword base of destination retry1: ldq_l t0, 0(a3) // load destination quadword zap t0, t1, t0 // clear destination bytes or t0, t2, t0 // merge in bytes from source stq_c t0, 0(a3) // store merged quadword conditional beq t0, retry1f // if eq, retry failed interlock addq a0, 7, a0 // move to next quadword bic a0, 7, a0 // aligned quadword addq a1, 7, a1 // move to next quadword bic a1, 7, a1 // aligned quadword // // Check for 64-byte block moves // 20: srl a2, 6, t0 // t0 = number of 64 byte blocks beq t0, 40f // if eq no blocks to move and a2, 64-1, a2 // a2 = residual bytes 30: ldq t1, 0(a1) // load 64 bytes from source addq a0, 64, a0 // increment destination pointer ldq v0, 56(a1) // ldq a3, 32(a1) // stq t1, -64(a0) // write to destination ldq t2, 8(a1) // into volatile registers ldq t3, 16(a1) // ldq t4, 24(a1) // subq t0, 1, t0 // decrement number of blocks stq t2, -56(a0) // ldq a4, 40(a1) // stq t3, -48(a0) // ldq a5, 48(a1) // stq t4, -40(a0) // addq a1, 64, a1 // increment source pointer stq a3, -32(a0) // stq a4, -24(a0) // stq a5, -16(a0) // stq v0, -8(a0) // bne t0, 30b // if ne, more blocks to copy // // Copy quadwords // 40: srl a2, 3, t0 // t0 = number of quadwords to move beq t0, 60f // if eq no quadwords to move and a2, 8-1, a2 // a2 = residual bytes 50: ldq t1, 0(a1) // load quadword from source addq a1, 8, a1 // increment source pointer stq t1, 0(a0) // store quadword to destination addq a0, 8, a0 // increment destination pointer subq t0, 1, t0 // decrement number of quadwords bne t0, 50b // if ne, more quadwords to move // // Move final residual bytes // 60: beq a2, 80f // if eq, no more bytes to move mov a2, t0 // t0 = number of bytes to move mov -1, t1 // t1 = bit mask sll t0, 3, t0 // # of bytes to # of bits srl t1, t0, t1 // clear t0 bits sll t1, t0, t0 // move it back ldq t1, 0(a1) // get last source quadword bic t1, t0, t1 // clear bytes not copied not t0, t0 // complement to clear destination retry2: ldq_l t2, 0(a0) // get last destination quadword locked bic t2, t0, t2 // clear bytes to be copied bis t2, t1, t2 // move bytes from source stq_c t2, 0(a0) // store merged quadword conditional beq t2, retry2f // if eq, retry failed interlock // // Finish aligned MoveForward // 80: ret zero, (ra) // return // // Move memory forward unaligned. // CopyForwardUnaligned: // // // Move bytes until the destination is aligned // and a0, 0x7, t0 // t0 = unaligned bits beq t0, 100f // if eq, destination quad aligned bis zero, zero, t1 // t4 = destination byte zap mask bis zero, 1, t2 sll t2, t0, t2 // t2 = next bit to set in zap mask mov zero, t4 // assemble destination bytes here 90: beq a2, 95f // if eq no more bytes to move bis t1, t2, t1 // set bit in zap mask sll t2, 1, t2 // set next higher bit for zap mask ldq_u t5, 0(a1) // get unaligned quad from source extbl t5, a1, t5 // extract source byte insbl t5, t0, t5 // t5 = source byte, in position or t4, t5, t4 // merge in source byte addq t0, 1, t0 // increment byte position addq a1, 1, a1 // increment source pointer subq a2, 1, a2 // decrement bytes to move cmpeq t0, 8, t3 // t0 = 8? 
=> quad finished beq t3, 90b // if eq [false], more bytes to move 95: bic a0, 0x7, a3 // a3 = quadword base of destination retry3: ldq_l t0, 0(a3) // load destination quadword zap t0, t1, t0 // clear destination bytes or t0, t4, t0 // merge in bytes from source stq_c t0, 0(a3) // store merged quadword conditional beq t0, retry3f // if eq, retry failed interlock addq a0, 7, a0 // increment to next quad bic a0, 7, a0 // align next quadword // // Check for 64-byte blocks to move // 100: srl a2, 6, t0 // t0 = number of blocks to move beq t0, 120f // if eq no blocks to move and a2, 64-1, a2 // a2 = residual bytes to move ldq_u t1, 0(a1) // t1 = first unaligned quad 110: // get source data and merge it // as we go ldq_u t2, 7(a1) // t2 = second unaligned quad extql t1, a1, t1 // extract applicable bytes from t1 extqh t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quad #1 ldq_u t3, 15(a1) // t3 = third unaligned quad extql t2, a1, t2 // extract applicable bytes from t2 extqh t3, a1, v0 // extract applicable bytes from t3 stq t1, 0(a0) // store quad #1 bis t2, v0, t2 // t2 = quad #2 ldq_u t4, 23(a1) // t4 = fourth unaligned quad extql t3, a1, t3 // extract applicable bytes from t3 extqh t4, a1, v0 // extract applicable bytes from t4 stq t2, 8(a0) // store quad #2 bis t3, v0, t3 // t3 = quad #3 ldq_u t5, 31(a1) // t5 = fifth unaligned quad extql t4, a1, t4 // extract applicable bytes from t4 extqh t5, a1, v0 // extract applicable bytes from t5 stq t3, 16(a0) // store quad #3 bis t4, v0, t4 // t4 = quad #4 ldq_u a3, 39(a1) // a3 = sixth unaligned quad extql t5, a1, t5 // extract applicable bytes from t5 extqh a3, a1, v0 // extract applicable bytes from a3 stq t4, 24(a0) // store quad #4 bis t5, v0, t5 // t5 = quad #5 ldq_u a4, 47(a1) // a4 = seventh unaligned quad extql a3, a1, a3 // extract applicable bytes from a3 extqh a4, a1, v0 // extract applicable bytes from a4 stq t5, 32(a0) // store quad #5 bis a3, v0, a3 // a3 = quad #6 ldq_u a5, 55(a1) // a5 = eighth unaligned quad extql a4, a1, a4 // extract applicable bytes from a4 extqh a5, a1, v0 // extract applicable bytes from a5 stq a3, 40(a0) // store quad #6 bis a4, v0, a4 // a4 = quad #7 ldq_u t1, 63(a1) // t1 = ninth unaligned = 1st of next extql a5, a1, a5 // extract applicable bytes from a5 extqh t1, a1, v0 // extract applicable bytes from t1 stq a4, 48(a0) // store quad #7 bis a5, v0, a5 // a5 = quad #8 addq a1, 64, a1 // increment source pointer stq a5, 56(a0) // store quad #8 addq a0, 64, a0 // increment destination pointer subq t0, 1, t0 // decrement number of blocks bne t0, 110b // if ne, more blocks to move // // Move unaligned source quads to aligned destination quads // 120: srl a2, 3, t0 // t0 = number of quads to move beq t0, 140f // if eq no quads to move and a2, 8-1, a2 // a2 = residual bytes ldq_u t1, 0(a1) // t1 = first unaligned quad 130: ldq_u t2, 7(a1) // t2 = second unaligned quad addq a0, 8, a0 // increment destination pointer extql t1, a1, t1 // extract applicable bytes from t1 extqh t2, a1, v0 // extract applicable bytes from t2 bis t1, v0, t1 // t1 = quadword of data stq t1, -8(a0) // store data to destination addq a1, 8, a1 // increment source pointer subq t0, 1, t0 // decrement quads to move bis t2, zero, t1 // t1 = first of next unaligned pair bne t0, 130b // if ne, more quads to move // // Move remaining bytes to final quadword // 140: beq a2, 160f // if eq no more bytes to move mov zero, t3 // t3 = position for next insertion mov zero, t4 // assemble destination bytes here mov a2, t0 // t0 = 
number of bytes to move mov -1, t1 // t1 = bit mask sll t0, 3, t0 // # of bytes to # of bits srl t1, t0, t1 // clear t0 bits sll t1, t0, t0 // move it back not t0, t0 // complement for destination clear mask 150: ldq_u t1, 0(a1) // get unaligned source quad extbl t1, a1, t1 // t1 = source byte insbl t1, t3, t1 // t1 = source byte, in position bis t4, t1, t4 // merge in source byte addq a1, 1, a1 // increment source pointer subq a2, 1, a2 // decrement bytes to move addq t3, 1, t3 // increment destination position bne a2, 150b // more bytes to move retry4: ldq_l t2, 0(a0) // get last destination quadword locked bic t2, t0, t2 // clear bytes to be copied bis t2, t4, t2 // move bytes from source stq_c t2, 0(a0) // store merged quadword conditional beq t2, retry4f // if eq, retry failed interlock // // Finish unaligned MoveForward // 160: ret zero, (ra) // return // // Out of line branches for failed store conditional. // Don't need to restore anything, just try again. // retry1f: br retry1 retry2f: br retry2 retry3f: br retry3 retry4f: br retry4 .end RtlCopyBytes SBTTL("Zero Bytes") //++ // // VOID // RtlZeroBytes ( // IN PVOID Destination, // IN ULONG Length // ) // // Routine Description: // // This function zeros memory by first aligning the destination address to // a quadword boundary, and then zeroing 64-byte blocks, followed by 8-byte // blocks, followed by any remaining bytes. Unlike RtlZeroMemory the copy is // done such that byte granularity is assured for all platforms. // // Arguments: // // Destination (a0) - Supplies a pointer to the memory to zero. // // Length (a1) - Supplies the length, in bytes, of the memory to be zeroed. // // Return Value: // // None. // //-- LEAF_ENTRY(RtlZeroBytes) bis zero, zero, a2 // set fill pattern br zero, RtlpFillBytes // SBTTL("Fill Bytes") //++ // // VOID // RtlFillBytes ( // IN PVOID Destination, // IN ULONG Length, // IN UCHAR Fill // ) // // Routine Description: // // This function fills memory by first aligning the destination address to // a longword boundary, and then filling 32-byte blocks, followed by 4-byte // blocks, followed by any remaining bytes. Unlike RtlFillMemory the copy is // done such that byte granularity is assured for all platforms. // // Arguments: // // Destination (a0) - Supplies a pointer to the memory to fill. // // Length (a1) - Supplies the length, in bytes, of the memory to be filled. // // Fill (a2) - Supplies the fill byte. // // N.B. The alternate entry memset expects the length and fill arguments // to be reversed. It also returns the Destination pointer // // Return Value: // // None. // //-- ALTERNATE_ENTRY(RtlFillBytes) and a2, 0xff, a2 // clear excess bits sll a2, 8, t0 // duplicate fill byte bis a2, t0, a2 // generate fill word sll a2, 16, t0 // duplicate fill word bis a2, t0, a2 // generate fill longword sll a2, 32, t0 // duplicate fill longword bis a2, t0, a2 // generate fill quadword .align 3 // ensure quadword aligned target // // Fill memory with the pattern specified in register a2. // RtlpFillBytes: // // // Align destination to quadword // beq a1, 80f // anything to fill? 
(paranoia) and a0, 8-1, t0 // t0 = unaligned bits bne t0, 5f // if ne, then not quad aligned br zero, 20f // if eq, then quad aligned 5: bis zero, zero, t1 // t4 = destination byte zap mask bis zero, 1, t2 sll t2, t0, t2 // t2 = next bit to set in zap mask 10: beq a1, 15f // if eq, all bits set bis t1, t2, t1 // set bit in zap mask sll t2, 1, t2 // set next higher bit for zap mask subq a1, 1, a1 // decrement bytes to fill addq t0, 1, t0 // increment byte within quad cmpeq t0, 8, t3 // finished the quadword? beq t3, 10b // if eq [false], do next byte 15: zapnot a2, t1, t2 // clear fill bytes bic a0, 7, a3 // a3 = quadword base of destination retry5: ldq_l t0, 0(a3) // load destination quadword zap t0, t1, t0 // clear destination bytes or t0, t2, t0 // merge in fill bytes stq_c t0, 0(a3) // store merged quadword conditional beq t0, retry5f // if eq, retry failed interlock addq a0, 7, a0 // move a0 to next quadword bic a0, 7, a0 // align a0 to quadword // // Check for 64-byte blocks // 20: srl a1, 6, t0 // t0 = number of 64 byte blocks beq t0, 40f // if eq then no 64 byte blocks and a1, 64-1, a1 // a1 = residual bytes to fill 30: stq a2, 0(a0) // store 64 bytes stq a2, 8(a0) // stq a2, 16(a0) // stq a2, 24(a0) // stq a2, 32(a0) // stq a2, 40(a0) // stq a2, 48(a0) // stq a2, 56(a0) // subq t0, 1, t0 // decrement blocks remaining addq a0, 64, a0 // increment destination pointer bne t0, 30b // more blocks to write // // Fill aligned quadwords // 40: srl a1, 3, t0 // t0 = number of quadwords bne t0, 55f // if ne quadwords left to fill br zero, 60f // if eq no quadwords left 55: and a1, 8-1, a1 // a1 = residual bytes to fill 50: stq a2, 0(a0) // store quadword subq t0, 1, t0 // decrement quadwords remaining addq a0, 8, a0 // next quadword bne t0, 50b // more quadwords to write // // Fill bytes for last quadword // 60: beq a1, 80f // if eq no more bytes to fill mov a1, t0 // t0 = number of bytes to move mov -1, t1 // t1 = bit mask sll t0, 3, t0 // # of bytes to # of bits srl t1, t0, t1 // clear t0 bits sll t1, t0, t0 // move it back bic a2, t0, t1 // clear fill bytes not copied not t0, t0 // complement to clear destination retry6: ldq_l t2, 0(a0) // get last destination quadword locked bic t2, t0, t2 // clear bytes to be copied bis t2, t1, t2 // move bytes from source stq_c t2, 0(a0) // store merged quadword conditional beq t2, retry6f // if eq, retry failed interlock // // Finish up // 80: ret zero, (ra) // return // // Out of line branches for failed store conditional. // Don't need to restore anything, just try again. // retry5f: br retry5 retry6f: br retry6 .end RtlZeroBytes
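//
// N.B. Illustrative C sketch, not assembled: callers reach the fill code
//      either through the Rtl argument order or through the CRT-style memset
//      alternate entry above, which takes the fill byte before the length and
//      returns the destination pointer, e.g.
//
//      RtlFillMemory(Buffer, sizeof(Buffer), 0xAA);        // (dest, length, fill)
//      p = memset(Buffer, 0xAA, sizeof(Buffer));           // (dest, fill, length), returns dest
//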