You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
290 lines
6.5 KiB
290 lines
6.5 KiB
title "Hal Copy using Movnti"
|
|
;++
|
|
;
|
|
;Copyright (c) 2000 Microsoft Corporation
|
|
;
|
|
;Module Name:
|
|
;
|
|
; ixmovnti.asm
|
|
;
|
|
;Abstract:
|
|
;
|
|
; HAL routine that uses the movnti instruction to copy a buffer.
; Similar to RtlMoveMemory, but does not support backward or
; overlapping moves.
; Based on a previously tested fast copy by Jim Crossland.
|
|
;Author:
|
|
; Gautham chinya
|
|
; Intel Corp
|
|
;
|
|
;Revision History:
|
|
;
|
|
;--
|
|
|
|
.386p
|
|
|
|
.xlist
|
|
include callconv.inc ; calling convention macros
|
|
include mac386.inc
|
|
.list
|
|
;
; Register numbers for the hand-encoded instruction macros.
;
; Each value is added into the low three bits (r/m field) of a ModRM byte
; by the macros in this file, so it is written here as a 3-bit field.
;

rEAX            equ     000b
rECX            equ     001b
rEDX            equ     010b
rEBX            equ     011b
rESP            equ     100b
rEBP            equ     101b
rESI            equ     110b
rEDI            equ     111b
|
|
|
|
;
; Block sizes used by the copy routine, expressed as a shift count and the
; matching "remainder" mask (block size - 1).
;
;   level 0: 64-byte blocks  (shift 6, mask 63)
;   level 1:  4-byte dwords  (shift 2, mask 3)
;

MEMORY_ALIGNMENT_LOG2_0 = 6
MEMORY_ALIGNMENT_MASK0  = (1 SHL MEMORY_ALIGNMENT_LOG2_0) - 1

MEMORY_ALIGNMENT_LOG2_1 = 2
MEMORY_ALIGNMENT_MASK1  = (1 SHL MEMORY_ALIGNMENT_LOG2_1) - 1
|
|
|
|
;
; sfence
;
;   Emits the three-byte SFENCE (store fence) encoding 0F AE F8 as raw
;   data instead of using an assembler mnemonic.
;
sfence  macro
        db      0FH, 0AEH               ; two-byte opcode 0F AE
        db      0F8H                    ; ModRM: mod = 11b, reg = /7, r/m = 000b
endm
|
|
|
|
;
; prefetchnta_short GeneralReg, Offset
;
;   Emits "prefetchnta [GeneralReg + Offset]" (opcode 0F 18 /0) as raw bytes.
;
;   GeneralReg - register number (rEAX..rEDI) placed in the ModRM r/m field.
;                rESP cannot be used: r/m = 100b selects a SIB byte, which
;                this macro does not emit.
;   Offset     - assembly-time constant displacement.
;
;   The one-byte displacement form is sign-extended by the processor, so it
;   can only express -128..127.  An Offset such as 128 or 192 would silently
;   encode as -128 / -64 and prefetch *behind* the base register.  Offsets
;   outside the disp8 range are therefore emitted with the 32-bit
;   displacement form (same encoding as prefetchnta_long) so the requested
;   address is always the one that is prefetched.
;
prefetchnta_short macro GeneralReg, Offset
    if ((Offset) gt 127) or ((Offset) lt -128)
        db      0FH, 018H, 080H + GeneralReg    ; mod = 10b: disp32 follows
        dd      Offset
    else
        db      0FH, 018H, 040H + GeneralReg, Offset ; mod = 01b: disp8
    endif
endm
|
|
|
|
;
; prefetchnta_long GeneralReg, Offset
;
;   Emits "prefetchnta [GeneralReg + Offset]" (opcode 0F 18 /0) with a
;   32-bit displacement, as raw bytes.
;
;   GeneralReg - register number (rEAX..rEDI) placed in the ModRM r/m field.
;   Offset     - assembly-time constant, stored as a dword displacement.
;
prefetchnta_long macro GeneralReg, Offset
        db      0FH, 018H               ; opcode 0F 18
        db      080h + GeneralReg       ; ModRM: mod = 10b, reg = /0, r/m = GeneralReg
        dd      Offset                  ; disp32
endm
|
|
|
|
;
; movnti_eax GeneralReg, Offset
;
;   Emits "movnti [GeneralReg + Offset], eax" (opcode 0F C3 /r) as raw
;   bytes, using the one-byte displacement form.
;
;   GeneralReg - register number (rEAX..rEDI) placed in the ModRM r/m field.
;   Offset     - assembly-time constant emitted as a single displacement byte.
;
movnti_eax macro GeneralReg, Offset
        db      0FH, 0C3H               ; opcode 0F C3
        db      040H + GeneralReg       ; ModRM: mod = 01b, reg = 000b (eax)
        db      Offset                  ; disp8
endm
|
|
|
|
;
; movnti_eax_0_disp GeneralReg
;
;   Emits "movnti [GeneralReg], eax" (opcode 0F C3 /r) as raw bytes, with
;   no displacement byte.
;
;   GeneralReg - register number placed in the ModRM r/m field.
;
movnti_eax_0_disp macro GeneralReg
        db      0FH, 0C3H               ; opcode 0F C3
        db      000H + GeneralReg       ; ModRM: mod = 00b, reg = 000b (eax)
endm
|
|
|
|
;
; movnti_ebx GeneralReg, Offset
;
;   Emits "movnti [GeneralReg + Offset], ebx" (opcode 0F C3 /r) as raw
;   bytes, using the one-byte displacement form.
;
;   GeneralReg - register number (rEAX..rEDI) placed in the ModRM r/m field.
;   Offset     - assembly-time constant emitted as a single displacement byte.
;
movnti_ebx macro GeneralReg, Offset
        db      0FH, 0C3H               ; opcode 0F C3
        db      058H + GeneralReg       ; ModRM: mod = 01b, reg = 011b (ebx)
        db      Offset                  ; disp8
endm
|
|
|
|
;
; movnticopy64bytes
;
;   Copies the 64 bytes at [esi] to [edi] as eight 8-byte pairs.  Each pair
;   is loaded into eax/ebx with ordinary loads and written with two movnti
;   stores.
;
;   In:       esi = source, edi = destination
;   Clobbers: eax, ebx
;   esi and edi are NOT advanced; the caller adds 64 afterwards.
;
movnticopy64bytes macro

        ;
        ; First pair: the eax store uses the no-displacement encoding.
        ;
        mov     eax, [esi]
        mov     ebx, [esi + 4]
        movnti_eax_0_disp rEDI
        movnti_ebx rEDI, 4

        ;
        ; Remaining seven pairs at byte offsets 8, 16, ... 56.
        ;
    irp PairOffset, <8, 16, 24, 32, 40, 48, 56>
        mov     eax, [esi + PairOffset]
        mov     ebx, [esi + PairOffset + 4]
        movnti_eax rEDI, PairOffset
        movnti_ebx rEDI, PairOffset+4
    endm

endm
|
|
|
|
|
|
|
|
_TEXT$03 SEGMENT DWORD PUBLIC 'CODE'
|
|
ASSUME DS:FLAT, ES:FLAT, SS:NOTHING, FS:NOTHING, GS:NOTHING
|
|
page ,132
|
|
subttl "HalpMovntiCopyBuffer"
|
|
;++
;
; VOID
; HalpMovntiCopyBuffer(
;    IN PVOID Destination,
;    IN PVOID Source,
;    IN ULONG Length
;    )
;
; Routine Description:
;
;    Copies Length bytes from Source to Destination, from low address to
;    high address, using non-temporal stores (movnti) for every whole
;    dword:
;
;      - whole 64-byte blocks are copied with movnticopy64bytes, with
;        prefetchnta hints issued ahead of the source pointer;
;      - the remaining whole dwords are copied one movnti at a time;
;      - the last 0-3 bytes are copied with rep movsb.
;
;    Requests shorter than 4 bytes go straight to rep movsb.  An sfence is
;    executed on every exit path.
;
;    The copy only runs forward, so overlapping buffers that need a
;    backward move are not supported.
;
; Arguments:
;
;    Destination - Supplies a pointer to the destination of the move.
;
;    Source - Supplies a pointer to the memory to move.
;
;    Length - Supplies the Length, in bytes, of the memory to be moved.
;             Treated as an unsigned value.
;
; Return Value:
;
;    None.
;
; Register usage:
;
;    esi, edi, ebx, ebp are saved on entry and restored on exit.
;    eax, ecx, edx and the flags are modified.
;
;--

cPublicProc _HalpMovntiCopyBuffer ,3

; Definitions of arguments
; (TOS) = Return address
;
; After "push ebp / mov ebp, esp", [ebp] holds the saved ebp and
; [ebp + 4] the return address, so the first argument is at [ebp + 8].

EmmDestination  equ     [ebp + 4 + 4]
EmmSource       equ     [ebp + 4 + 8]
EmmLength       equ     [ebp + 4 + 12]

        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        push    ebx

        mov     esi, EmmSource          ; esi = read pointer
        mov     edi, EmmDestination     ; edi = write pointer
        mov     ecx, EmmLength          ; ecx = bytes to copy

;
; Every path can end in rep movsb, so select forward string direction
; before the first branch (cld does not disturb the flags tested below).
;
        cld

;
; Can't use movnti for this wee-quest.
; Length is a ULONG, so the comparison is unsigned.
;
        cmp     ecx, 4
        jb      RemainingBytes

;
; Before prefetching we must guarantee the TLB is valid.
; (Touch the first source dword with an ordinary load.)
;
        mov     eax, [esi]

;
; Split the length: edx = number of whole 64-byte blocks,
;                   ecx = bytes left over (0..63).
; Then dispatch on the block count so that exactly edx blocks get copied:
; 0 -> Copy4, 1 -> copy64, 2 -> copy128, 3 -> copy192, more -> copyLoop.
;
        mov     edx, ecx
        and     ecx, MEMORY_ALIGNMENT_MASK0
        shr     edx, MEMORY_ALIGNMENT_LOG2_0
        jz      Copy4
        dec     edx
        jz      copy64

;
; Offsets 128 and 192 do not fit in a sign-extended 8-bit displacement
; (they would encode as -128 / -64), so the 32-bit displacement form is
; required to prefetch ahead of esi.
;
        prefetchnta_long rESI, 128
        dec     edx
        jz      copy128

        prefetchnta_long rESI, 192
        dec     edx
        jz      copy192

;
; Four or more blocks: edx = block count - 3 here (at least 1).  Each pass
; copies one block and prefetches 256 bytes ahead; the last three blocks
; are handled by falling through copy192 / copy128 / copy64.
;
copyLoop:

        prefetchnta_long rESI, 256

        movnticopy64bytes
        lea     esi, [esi + 64]
        lea     edi, [edi + 64]

        dec     edx
        jnz     copyLoop

copy192:

        movnticopy64bytes
        lea     esi, [esi + 64]
        lea     edi, [edi + 64]

copy128:

        movnticopy64bytes
        lea     esi, [esi + 64]
        lea     edi, [edi + 64]

copy64:

        movnticopy64bytes

        test    ecx, ecx                ; anything less than 64 to do?
        jz      ExitRoutine

;
; esi still points at the block just copied; the leftover bytes start
; 64 bytes further on, so that is the line worth prefetching.
;
        prefetchnta_short rESI, 64

;
; Update pointers for last copy
;
        lea     esi, [esi + 64]
        lea     edi, [edi + 64]

;
; Handle extra bytes here in 32-bit chunks and then 8-bit bytes.
;   edx = number of whole dwords, ecx = trailing bytes (0..3).
;
Copy4:
        mov     edx, ecx
        and     ecx, MEMORY_ALIGNMENT_MASK1
        shr     edx, MEMORY_ALIGNMENT_LOG2_1

;
; If the number of 32-bit words to move is non-zero, then do it
; (ZF comes from the shr above).
;
        jz      RemainingBytes

Copy4Loop:
        mov     eax, [esi]
        movnti_eax_0_disp rEDI
        lea     esi, [esi + 4]
        lea     edi, [edi + 4]
        dec     edx
        jnz     Copy4Loop

;
; Copy the final ecx bytes (if any) with ordinary byte moves.
;
RemainingBytes:
        test    ecx, ecx
        jz      ExitRoutine
        rep     movsb

ExitRoutine:

        sfence                          ; Make all stores globally visible
        pop     ebx
        pop     edi
        pop     esi
        pop     ebp
        stdRET  _HalpMovntiCopyBuffer

stdENDP _HalpMovntiCopyBuffer
|
|
|
|
_TEXT$03 ends
|
|
end
|