title  "Memory functions"
;++
;
; Copyright (c) 2000  Microsoft Corporation
;
; Module Name:
;
;   movemem.asm
;
; Abstract:
;
;   This module implements functions to fill, copy, and compare blocks of
;   memory.
;
; Author:
;
;   David N. Cutler (davec) 6-Jul-2000
;
; Environment:
;
;   Any mode.
;
;--

include ksamd64.inc

        altentry RtlCopyMemoryAlternate

        subttl "Compare Memory"
;++
;
; SIZE_T
; RtlCompareMemory (
;     IN PVOID Source1,
;     IN PVOID Source2,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function compares two unaligned blocks of memory and returns the
;   number of bytes that compared equal.
;
; Arguments:
;
;   Source1 (rcx) - Supplies a pointer to the first block of memory to
;       compare.
;
;   Source2 (rdx) - Supplies a pointer to the second block of memory to
;       compare.
;
;   Length (r8) - Supplies the Length, in bytes, of the memory to be
;       compared.
;
; Return Value:
;
;   The number of bytes that compared equal is returned as the function
;   value. If all bytes compared equal, then the length of the original
;   block of memory is returned.
;
;--

;
; Stack frame for RtlCompareMemory. The prologue pushes rdi and rsi and then
; allocates the remaining (sizeof CmFrame - 2 * 8) bytes, i.e. the Fill slot.
;

CmFrame struct
        Fill    dq ?                    ; fill to 8 mod 16
        SavedRsi dq ?                   ; saved nonvolatile registers
        SavedRdi dq ?                   ;
CmFrame ends

        NESTED_ENTRY RtlCompareMemory, _TEXT$00

        push_reg rdi                    ; save nonvolatile registers
        push_reg rsi                    ;
        alloc_stack (sizeof CmFrame - (2 * 8)) ; allocate stack frame

        END_PROLOGUE

        mov     rsi, rcx                ; rsi = cursor into Source1
        mov     rdi, rdx                ; rdi = cursor into Source2
        xor     edx, ecx                ; low address bits differ iff alignment differs
        and     edx, 07h                ; keep only the offset within a quadword
        jnz     short RlCM50            ; if nz, incompatible alignment
        cmp     r8, 8                   ; check if length to align
        jb      short RlCM50            ; if b, insufficient alignment length

;
; Buffer alignment is compatible and there are enough bytes for alignment.
;
; On this path the return value is derived from how far rdi has advanced
; past r9; r8 only tracks the number of bytes still to be compared.
;

        mov     r9, rdi                 ; r9 = start of Source2 (used for result)
        neg     ecx                     ; compute alignment length
        and     ecx, 07h                ; bytes up to next 8-byte boundary
        jz      short RlCM10            ; if z, buffers already aligned
        sub     r8, rcx                 ; reduce count by align length
   repe cmpsb                           ; compare bytes to alignment
        jnz     short RlCM30            ; if nz, not all bytes matched
RlCM10: mov     rcx, r8                 ; rcx = bytes remaining
;
; NOTE(review): the mask below is intended to clear only the low three bits.
; That presumably relies on the assembler encoding 0fffffff8h as a
; sign-extended immediate (i.e. -8) for the 64-bit operand - confirm.
;
        and     rcx, 0fffffff8h         ; check if any quadwords to compare
        jz      short RlCM20            ; if z, no quadwords to compare
        sub     r8, rcx                 ; reduce length by compare count
        shr     rcx, 3                  ; compute number of quadwords
   repe cmpsq                           ; compare quadwords
        jz      short RlCM20            ; if z, all quadwords compared
;
; A quadword differed. Step both cursors back onto it and fall into the
; bytewise compare so the first differing byte inside it can be located.
;
        inc     rcx                     ; count the differing quadword as uncompared
        sub     rsi, 8                  ; back up source address
        sub     rdi, 8                  ; back up destination address
        shl     rcx, 3                  ; compute uncompared bytes
RlCM20: add     r8, rcx                 ; compute residual bytes to compare
        jz      short RlCM40            ; if z, all bytes compared equal
        mov     rcx, r8                 ; set remaining bytes to compare
   repe cmpsb                           ; compare bytes
        jz      short RlCM40            ; if z, all bytes compared equal
RlCM30: dec     rdi                     ; cmpsb stepped past the mismatch; back up onto it
RlCM40: sub     rdi, r9                 ; compute number of bytes matched
        mov     rax, rdi                ; set return value
        add     rsp, sizeof CmFrame - (2 * 8) ; deallocate stack frame
        pop     rsi                     ; restore nonvolatile register
        pop     rdi                     ;
        ret                             ; return

;
; Buffer alignment is incompatible or there is less than 8 bytes to compare.
;
; Here the result is computed from the counts: r8 holds the original length
; and rcx the count left over after the string compare stops.
;

RlCM50: test    r8, r8                  ; test if any bytes to compare
        jz      short RlCM60            ; if z, no bytes to compare
        mov     rcx, r8                 ; set number of bytes to compare
   repe cmpsb                           ; compare bytes
        jz      short RlCM60            ; if z, all bytes compared equal
        inc     rcx                     ; include the mismatching byte in remainder
        sub     r8, rcx                 ; compute number of bytes matched
RlCM60: mov     rax, r8                 ; set return value
        add     rsp, sizeof CmFrame - (2 * 8) ; deallocate stack frame
        pop     rsi                     ; restore nonvolatile register
        pop     rdi                     ;
        ret                             ; return

        NESTED_END RtlCompareMemory, _TEXT$00

        subttl  "Compare Memory 32-bits"
;++
;
; SIZE_T
; RtlCompareMemoryUlong (
;     IN PVOID Source,
;     IN SIZE_T Length,
;     IN ULONG Pattern
;     )
;
; Routine Description:
;
;   This function compares a block of dword aligned memory with a specified
;   pattern 32-bits at a time.
;
;   N.B. The low two bits of the length are assumed to be zero and are
;        ignored.
;
; Arguments:
;
;   Source (rcx) - Supplies a pointer to the block of memory to compare.
;
;   Length (rdx) - Supplies the length, in bytes, of the memory to compare.
;
;   Pattern (r8d) - Supplies the pattern to be compared against.
;
; Return Value:
;
;   The number of bytes that compared equal is returned as the function
;   value. If all bytes compared equal, then the length of the original
;   block of memory is returned.
;
;--

        NESTED_ENTRY RtlCompareMemoryUlong, _TEXT$00

        push_reg rdi                    ; rdi is nonvolatile and is used by scasd

        END_PROLOGUE

;
; rdx holds the length in dwords throughout and is reduced to the number of
; dwords that matched before being scaled back to bytes for the result.
;

        mov     rdi, rcx                ; rdi = scan cursor
        shr     rdx, 2                  ; bytes -> dwords (low two bits dropped)
        je      short RlCU10            ; nothing to scan, result is zero
        mov     eax, r8d                ; eax = pattern that scasd compares with
        mov     rcx, rdx                ; rcx = dwords to scan
   repe scasd                           ; scan while dwords equal the pattern
        je      short RlCU10            ; every dword matched
        lea     rcx, 1[rcx]             ; unscanned dwords plus the mismatching one
        sub     rdx, rcx                ; rdx = dwords that matched
RlCU10: mov     rax, rdx                ; scale matched dwords to bytes
        shl     rax, 2                  ;
        pop     rdi                     ; restore nonvolatile register
        ret                             ; return

        NESTED_END RtlCompareMemoryUlong, _TEXT$00

        subttl  "Copy Memory"
;++
;
; VOID
; RtlCopyMemory (
;     OUT VOID UNALIGNED *Destination,
;     IN CONST VOID UNALIGNED * Sources,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function copies a nonoverlapping block of memory from one unaligned
;   buffer to another.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the destination buffer.
;
;   Sources (rdx) - Supplies a pointer to the source buffer.
;
;   Length (r8) - Supplies the length, in bytes, of the copy operation.
;
; Return Value:
;
;   None.
;
;--

;
; Stack frame for RtlCopyMemory. rdi and rsi are pushed by the prologue and
; the remaining (sizeof CpFrame - 2 * 8) bytes, the Fill slot, are allocated.
;

CpFrame struct
        Fill    dq ?                    ; pad frame to 8 mod 16
        SavedRsi dq ?                   ; slot occupied by pushed rsi
        SavedRdi dq ?                   ; slot occupied by pushed rdi
CpFrame ends

        NESTED_ENTRY RtlCopyMemory, _TEXT$00

        push_reg rdi                    ; preserve nonvolatile rdi
        push_reg rsi                    ; preserve nonvolatile rsi
        alloc_stack (sizeof CpFrame - (2 * 8)) ; allocate the Fill slot

        END_PROLOGUE

;
; RtlMoveMemory builds the same frame and jumps here when a forward copy is
; safe, so everything from this point on must match that frame layout.
;

        ALTERNATE_ENTRY RtlCopyMemoryAlternate

        mov     rsi, rdx                ; rsi = source cursor
        mov     rdi, rcx                ; rdi = destination cursor
        xor     edx, ecx                ; compare low address bits of both buffers
        and     edx, 07h                ; nonzero if they can never be co-aligned
        jnz     short RlCP20            ; incompatible alignment, byte copy only
        cmp     r8, 8                   ; too short to be worth aligning?
        jb      short RlCP20            ; yes, byte copy only

;
; Both buffers reach an 8-byte boundary after the same number of bytes. Copy
; that many bytes; a string move with a zero count moves nothing, so the
; already aligned case needs no separate branch.
;

        neg     ecx                     ; ecx = bytes up to next 8-byte boundary
        and     ecx, 07h                ;
        sub     r8, rcx                 ; those bytes are no longer outstanding
    rep movsb                           ; copy up to the boundary

;
; Split the outstanding count into whole quadwords and a byte remainder.
;

        mov     rcx, r8                 ; rcx = number of whole quadwords
        shr     rcx, 3                  ;
        and     r8d, 07h                ; r8 = bytes left after the quadwords
    rep movsq                           ; copy quadwords (none if rcx is zero)

;
; Copy whatever bytes remain.
;

RlCP20: test    r8, r8                  ; anything left to copy?
        jz      short RlCP30            ; no, done
        mov     rcx, r8                 ; rcx = remaining byte count
    rep movsb                           ; copy the tail
RlCP30: add     rsp, sizeof CpFrame - (2 * 8) ; deallocate stack frame
        pop     rsi                     ; restore nonvolatile registers
        pop     rdi                     ;
        ret                             ; return

        NESTED_END RtlCopyMemory, _TEXT$00

        subttl  "Copy Memory NonTemporal"
;++
;
; VOID
; RtlCopyMemoryNonTemporal (
;     OUT VOID UNALIGNED *Destination,
;     IN CONST VOID UNALIGNED * Sources,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function copies a nonoverlapping block of memory from one buffer to
;   another using nontemporal moves that do not pollute the cache.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the destination buffer.
;
;   Sources (rdx) - Supplies a pointer to the source buffer.
;
;   Length (r8) - Supplies the length, in bytes, of the copy operation.
;
; Return Value:
;
;   None.
;
;--

;
; Stack frame for RtlCopyMemoryNonTemporal. rdi and rsi are pushed by the
; prologue and the remaining (sizeof NtFrame - 2 * 8) bytes are allocated.
;

NtFrame struct
        Fill    dq ?                    ; fill to 8 mod 16
        SavedRsi dq ?                   ; saved nonvolatile registers
        SavedRdi dq ?                   ;
NtFrame ends

        NESTED_ENTRY RtlCopyMemoryNonTemporal, _TEXT$00

        push_reg rdi                    ; save nonvolatile registers
        push_reg rsi                    ;
        alloc_stack (sizeof NtFrame - (2 * 8)) ; allocate stack frame

        END_PROLOGUE

        mov     rdi, rcx                ; set destination address
        mov     rsi, rdx                ; set source address
        cmp     r8, 16                  ; check if 16 bytes to move
        jb      RlNT50                  ; if b, less than 16 bytes to move

;
; Align the destination to a 16-byte boundary.
;
; The nontemporal 128-bit store used below requires a 16-byte aligned
; destination. Only the destination is aligned; the source is read with
; unaligned loads.
;

        neg     ecx                     ; compute alignment length
        and     ecx, 0fh                ;
        jz      short RlNT10            ; if z, destination already aligned
        sub     r8, rcx                 ; reduce count by align length
    rep movsb                           ; move bytes to alignment

;
; Move 64-byte blocks.
;
; rax counts the bytes to move in this loop and r8 keeps the remainder.
; The mask is written as -64 so that it is unambiguously a 64-bit mask that
; clears only the low six bits.
;

RlNT10: mov     rax, r8                 ; compute number of 64-byte blocks
        and     rax, -64                ;
        jz      short RlNT30            ; if z, no 64-byte blocks to move
        sub     r8, rax                 ; subtract 64-byte blocks from count
RlNT20: prefetchnta 0[rsi]              ; prefetch start of source block
        prefetchnta 63[rsi]             ; prefetch end of source block
        movdqu  xmm0, [rsi]             ; load 64-byte block (unaligned source)
        movdqu  xmm1, 16[rsi]           ;
        movdqu  xmm2, 32[rsi]           ;
        movdqu  xmm3, 48[rsi]           ;
        movntdq [rdi], xmm0             ; store block with nontemporal hint
        movntdq 16[rdi], xmm1           ;
        movntdq 32[rdi], xmm2           ;
        movntdq 48[rdi], xmm3           ;
        add     rdi, 64                 ; advance destination address
        add     rsi, 64                 ; advance source address
        sub     rax, 64                 ; subtract number of bytes moved
        jnz     short RlNT20            ; if nz, more 64-byte blocks to move

;
; Move 16-byte blocks.
;

RlNT30: mov     rax, r8                 ; compute number of 16-byte blocks
        and     rax, -16                ;
        jz      short RlNT50            ; if z, no 16-byte blocks
        sub     r8, rax                 ; subtract 16-byte blocks from count
RlNT40: movdqu  xmm0, [rsi]             ; load 16-byte block (unaligned source)
        movntdq [rdi], xmm0             ; store block with nontemporal hint
        add     rdi, 16                 ; advance destination address
        add     rsi, 16                 ; advance source address
        sub     rax, 16                 ; subtract number of bytes moved
        jnz     short RlNT40            ; if nz, more 16-byte blocks to move

;
; Move residual bytes.
;

RlNT50: test    r8, r8                  ; test if any bytes to move
        jz      short RlNT60            ; if z, no bytes to move
        mov     rcx, r8                 ; set residual bytes to move
    rep movsb                           ; move residual bytes
RlNT60: sfence                          ; make sure all stores complete
        add     rsp, sizeof NtFrame - (2 * 8) ; deallocate stack frame
        pop     rsi                     ; restore nonvolatile registers
        pop     rdi                     ;
        ret                             ; return

        NESTED_END RtlCopyMemoryNonTemporal, _TEXT$00

        subttl  "Fill Memory"
;++
;
; VOID
; RtlFillMemory (
;     IN VOID UNALIGNED *Destination,
;     IN SIZE_T Length,
;     IN UCHAR Fill
;     )
;
; Routine Description:
;
;   This function fills a block of unaligned memory with a specified pattern.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the memory to fill.
;
;   Length (rdx) - Supplies the length, in bytes, of the memory to fill.
;
;   Fill (r8d) - Supplies the value to fill memory with.
;
; Return Value:
;
;   None.
;
;--

        NESTED_ENTRY RtlFillMemory, _TEXT$00

        push_reg rdi                    ; rdi is nonvolatile and is used by stos

        END_PROLOGUE

        mov     rdi, rcx                ; rdi = store cursor
        mov     eax, r8d                ; al = fill byte for the byte stores
        cmp     rdx, 8                  ; too short to be worth aligning?
        jb      short RlFM20            ; yes, byte fill only

;
; Fill up to the next 8-byte boundary. A string store with a zero count
; stores nothing, so the already aligned case needs no separate branch.
;

        neg     ecx                     ; ecx = bytes up to next 8-byte boundary
        and     ecx, 07h                ;
        sub     rdx, rcx                ; those bytes are no longer outstanding
    rep stosb                           ; fill up to the boundary

;
; Fill whole quadwords with the byte replicated into all eight positions.
;

        mov     rcx, rdx                ; rcx = number of whole quadwords
        shr     rcx, 3                  ;
        jz      short RlFM20            ; none, only residual bytes remain
        and     edx, 07h                ; rdx = bytes left after the quadwords
        movzx   eax, r8b                ; discard everything above the fill byte
        imul    eax, eax, 01010101h     ; eax = fill byte in all four positions
        mov     r9, rax                 ; duplicate the dword into the high half
        shl     rax, 32                 ;
        or      rax, r9                 ;
    rep stosq                           ; fill quadwords

;
; Fill whatever bytes remain.
;

RlFM20: test    rdx, rdx                ; anything left to fill?
        jz      short RlFM30            ; no, done
        mov     rcx, rdx                ; rcx = remaining byte count
    rep stosb                           ; fill the tail
RlFM30: pop     rdi                     ; restore nonvolatile register
        ret                             ; return

        NESTED_END RtlFillMemory, _TEXT$00

        subttl  "Fill Memory 32-bits"
;++
;
; VOID
; RtlFillMemoryUlong (
;     IN PVOID Destination,
;     IN SIZE_T Length,
;     IN ULONG Fill
;     )
;
; Routine Description:
;
;   This function fills a block of dword aligned memory with a specified
;   pattern 32-bits at a time.
;
;   N.B. The low two bits of the length are assumed to be zero and are
;        ignored.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the memory to fill.
;
;   Length (rdx) - Supplies the length, in bytes, of the memory to fill.
;
;   Fill (r8d) - Supplies the value to fill memory with.
;
; Return Value:
;
;   None.
;
;--

        NESTED_ENTRY RtlFillMemoryUlong, _TEXT$00

        push_reg rdi                    ; save nonvolatile register

        END_PROLOGUE

        mov     rdi, rcx                ; set destination address
        mov     rcx, rdx                ; set length of fill in bytes
        shr     rcx, 2                  ; compute number of dwords
        jz      short RlFL10            ; if z, no dwords to fill
        mov     eax, r8d                ; set fill pattern
    rep stosd                           ; fill memory with pattern

;
; N.B. The label below is spelled exactly as the branch above references it,
;      so the routine also assembles when identifier case is significant.
;

RlFL10: pop     rdi                     ; restore nonvolatile register
        ret                             ; return

        NESTED_END RtlFillMemoryUlong, _TEXT$00

        subttl  "Fill Memory 64-bits"
;++
;
; VOID
; RtlFillMemoryUlonglong (
;     IN PVOID Destination,
;     IN SIZE_T Length,
;     IN ULONGLONG Fill
;     )
;
; Routine Description:
;
;   This function fills a block of qword aligned memory with a specified
;   pattern 64-bits at a time.
;
;   N.B. The low three bits of the length parameter are assumed to be zero
;        and are ignored.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the memory to fill.
;
;   Length (rdx) - Supplies the length, in bytes, of the memory to fill.
;
;   Fill (r8) - Supplies the value to fill memory with.
;
; Return Value:
;
;   None.
;
;--

        NESTED_ENTRY RtlFillMemoryUlonglong, _TEXT$00

        push_reg rdi                    ; rdi is nonvolatile and is used by stosq

        END_PROLOGUE

;
; The quadword count is produced by the shift, whose zero flag also decides
; whether the pattern is loaded. A string store with a zero count stores
; nothing, so no branch around it is required.
;

        mov     rdi, rcx                ; rdi = store cursor
        mov     rcx, rdx                ; rcx = length in bytes
        shr     rcx, 3                  ; bytes -> quadwords (low three bits dropped)
        cmovnz  rax, r8                 ; load pattern only if there is work to do
    rep stosq                           ; store the pattern rcx times
        pop     rdi                     ; restore nonvolatile register
        ret                             ; return

        NESTED_END RtlFillMemoryUlonglong, _TEXT$00

        subttl  "Move Memory"
;++
;
; VOID
; RtlMoveMemory (
;     OUT VOID UNALIGNED *Destination,
;     IN CONST VOID UNALIGNED * Sources,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function copies from one unaligned buffer to another.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the destination buffer.
;
;   Sources (rdx) - Supplies a pointer to the source buffer.
;
;   Length (r8) - Supplies the length, in bytes, of the copy operation.
;
; Return Value:
;
;   None.
;
;--

;
; Stack frame for RtlMoveMemory. This must have the same layout as the frame
; built by RtlCopyMemory, because control is transferred to
; RtlCopyMemoryAlternate, whose epilogue deallocates the frame built here.
;

MmFrame struct
        Fill    dq ?                    ; fill to 8 mod 16
        SavedRsi dq ?                   ; saved nonvolatile registers
        SavedRdi dq ?                   ;
MmFrame ends

        NESTED_ENTRY RtlMoveMemory, _TEXT$00

        push_reg rdi                    ; save nonvolatile registers
        push_reg rsi                    ;
        alloc_stack (sizeof MmFrame - (2 * 8)) ; allocate stack frame

        END_PROLOGUE

;
; A forward copy is safe unless the destination starts inside the source
; buffer above its first byte. Addresses are unsigned quantities, so both
; range checks use unsigned condition codes.
;

        cmp     rcx, rdx                ; check if destination at or below source
        jbe     RtlCopyMemoryAlternate  ; if be, forward copy is safe
        mov     rsi, rdx                ; compute ending source address
        add     rsi, r8                 ;
        dec     rsi                     ;
        cmp     rcx, rsi                ; check if destination beyond source end
        ja      RtlCopyMemoryAlternate  ; if a, no overlap possible

;
; The destination overlaps the tail of the source. Copy backward, from the
; last byte to the first, so source bytes are read before being overwritten.
;

        mov     rdi, rcx                ; compute ending destination address
        add     rdi, r8                 ;
        dec     rdi                     ;
        mov     rcx, r8                 ; set count of bytes to move
        std                             ; set direction flag
    rep movsb                           ; move bytes backward to destination
        cld                             ; clear direction flag
        add     rsp, sizeof MmFrame - (2 * 8) ; deallocate stack frame
        pop     rsi                     ; restore nonvolatile registers
        pop     rdi                     ;
        ret                             ; return

        NESTED_END RtlMoveMemory, _TEXT$00

        subttl  "Prefetch Memory NonTemporal"
;++
;
; VOID
; RtlPrefetchMemoryNonTemporal (
;     IN CONST PVOID Source,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function prefetches memory at Source, for Length bytes into the
;   closest cache to the processor.
;
; Arguments:
;
;   Source (rcx) - Supplies a pointer to the memory to be prefetched.
;
;   Length (rdx) - Supplies the length, in bytes, of the operation.
;
; Return Value:
;
;   None.
;
;--

        LEAF_ENTRY RtlPrefetchMemoryNonTemporal, _TEXT$00

;
; The prefetch is issued before the remaining length is tested, so one
; prefetch is always issued, even when Length is zero. The address is
; advanced in 64-byte steps until the remaining length is used up.
;

RlPF10: prefetchnta [rcx]               ; issue prefetch for current address
        sub     rdx, 64                 ; account for the 64 bytes just covered
        lea     rcx, 64[rcx]            ; step to next address, flags untouched
        ja      RlPF10                  ; if above zero, more bytes to prefetch
        ret                             ; return

        LEAF_END RtlPrefetchMemoryNonTemporal, _TEXT$00

        subttl  "Zero Memory"
;++
;
; VOID
; RtlZeroMemory (
;     IN VOID UNALIGNED *Destination,
;     IN SIZE_T Length
;     )
;
; Routine Description:
;
;   This function fills a block of unaligned memory with zero.
;
; Arguments:
;
;   Destination (rcx) - Supplies a pointer to the memory to fill.
;
;   Length (rdx) - Supplies the length, in bytes, of the memory to fill.
;
; Return Value:
;
;   None.
;
;--

        LEAF_ENTRY RtlZeroMemory, _TEXT$00

;
; Destination (rcx) and Length (rdx) are already in the registers that
; RtlFillMemory expects, so only the fill value has to be supplied.
;

        xor     r8d, r8d                ; fill value = 0 (clears all of r8)
        jmp     RtlFillMemory           ; tail-jump into the fill routine

        LEAF_END RtlZeroMemory, _TEXT$00

        end