|
|
title "Compute Checksum"
;/*++ ; ; Copyright (c) Microsoft Corporation. All rights reserved. ; ; Module Name: ; ; xsum.x86 ; ; Abstract: ; ; This module implements a function to compute the checksum of a buffer. ; ; Author: ; ; David N. Cutler (davec) 27-Jan-1992 ; ; Revision History: ; ; Who When What ; -------- -------- ---------------------------------------------- ; mikeab 01-22-94 Pentium optimization ; ; Environment: ; ; Any mode. ; ; Revision History: ; ;--*/
LOOP_UNROLLING_BITS equ 5 LOOP_UNROLLING equ (1 SHL LOOP_UNROLLING_BITS)
.386 .model small,c
assume cs:FLAT,ds:FLAT,es:FLAT,ss:FLAT assume fs:nothing,gs:nothing
.xlist include callconv.inc include ks386.inc .list
.code
;++ ; ; ULONG ; tcpxsum( ; IN ULONG cksum, ; IN PUCHAR buf, ; IN ULONG len ; ) ; ; Routine Description: ; ; This function computes the checksum of the specified buffer. ; ; Arguments: ; ; cksum - Suppiles the initial checksum value, in 16-bit form, ; with the high word set to 0. ; ; buf - Supplies a pointer to the buffer to the checksum buffer. ; ; len - Supplies the length of the buffer in bytes. ; ; Return Value: ; ; The computed checksum in 32-bit two-partial-accumulators form, added to ; the initial checksum, is returned as the function value. ; ;--
cksum equ 12 ; stack offset to initial checksum buf equ 16 ; stack offset to source address len equ 20 ; stack offset to length in words
to_checksum_last_word: jmp checksum_last_word
to_checksum_done: jmp checksum_done
to_checksum_dword_loop_done: jmp checksum_dword_loop_done
cPublicProc tcpxsum,3
; FPO = 0 dwords locals allocated in prolog ; 3 dword parameters ; 2 bytes in prolog ; 2 registers saved ; 0 EBP is not used ; 0 frame type = FPO .FPO (0,3,2,2,0,0)
push ebx ; save nonvolatile register push esi ; save nonvolatile register
mov ecx,[esp + len] ; get length in bytes sub eax,eax ; clear computed checksum test ecx,ecx ; any bytes to checksum at all? jz short to_checksum_done ; no bytes to checksum
; ; if the checksum buffer is not word aligned, then add the first byte of ; the buffer to the input checksum. ;
mov esi,[esp + buf] ; get source address sub edx,edx ; set up to load word into EDX below test esi,1 ; check if buffer word aligned jz short checksum_word_aligned ; if zf, buffer word aligned mov ah,[esi] ; get first byte (we know we'll have ; to swap at the end) inc esi ; increment buffer address dec ecx ; decrement number of bytes jz short to_checksum_done ; if zf set, no more bytes
; ; If the buffer is not an even number of of bytes, then initialize ; the computed checksum with the last byte of the buffer. ;
checksum_word_aligned: ; shr ecx,1 ; convert to word count jnc short checksum_start ; if nc, even number of bytes mov al,[esi+ecx*2] ; initialize the computed checksum jz short to_checksum_done ; if zf set, no more bytes
; ; Compute checksum in large blocks of dwords, with one partial word up front if ; necessary to get dword alignment, and another partial word at the end if ; needed. ;
; ; Compute checksum on the leading word, if that's necessary to get dword ; alignment. ;
checksum_start: ; test esi,02h ; check if source dword aligned jz short checksum_dword_aligned ; source is already dword aligned mov dx,[esi] ; get first word to checksum add esi,2 ; update source address add eax,edx ; update partial checksum ; (no carry is possible, because EAX ; and EDX are both 16-bit values) dec ecx ; count off this word (zero case gets ; picked up below)
; ; Checksum as many words as possible by processing a dword at a time. ;
checksum_dword_aligned: push ecx ; so we can tell if there's a trailing ; word later shr ecx,1 ; # of dwords to checksum jz short to_checksum_last_word ; no dwords to checksum
mov edx,[esi] ; preload the first dword add esi,4 ; point to the next dword dec ecx ; count off the dword we just loaded jz short to_checksum_dword_loop_done ; skip the loop if that was the only ; dword mov ebx,ecx ; EBX = # of dwords left to checksum add ecx,LOOP_UNROLLING-1 ; round up loop count shr ecx,LOOP_UNROLLING_BITS ; convert from word count to unrolled ; loop count and ebx,LOOP_UNROLLING-1 ; # of partial dwords to do in first ; loop jz short checksum_dword_loop ; special-case when no partial loop, ; because fixup below doesn't work ; in that case (carry flag is ; cleared at this point, as required ; at loop entry) lea esi,[esi+ebx*4-(LOOP_UNROLLING*4)] ; adjust buffer pointer back to ; compensate for hardwired displacement ; at loop entry point ; ***doesn't change carry flag*** jmp loop_entry[ebx*4] ; enter the loop to do the first, ; partial iteration, after which we can ; just do 64-word blocks ; ***doesn't change carry flag***
checksum_dword_loop:
DEFLAB macro pre,suf pre&suf: endm
TEMP=0 REPT LOOP_UNROLLING deflab loop_entry_,%TEMP adc eax,edx mov edx,[esi + TEMP] TEMP=TEMP+4 ENDM
checksum_dword_loop_end:
lea esi,[esi + LOOP_UNROLLING * 4] ; update source address ; ***doesn't change carry flag*** dec ecx ; count off unrolled loop iteration ; ***doesn't change carry flag*** jnz checksum_dword_loop ; do more blocks
checksum_dword_loop_done label proc adc eax,edx ; finish dword checksum mov edx,0 ; prepare to load trailing word adc eax,edx
; ; Compute checksum on the trailing word, if there is one. ; High word of EDX = 0 at this point ; Carry flag set iff there's a trailing word to do at this point ;
checksum_last_word label proc ; "proc" so not scoped to function pop ecx ; get back word count test ecx,1 ; is there a trailing word? jz short checksum_done ; no trailing word add ax,[esi] ; add in the trailing word adc eax,0 ;
checksum_done label proc ; "proc" so not scoped to function mov ecx,eax ; fold the checksum to 16 bits ror ecx,16 add eax,ecx mov ebx,[esp + buf] shr eax,16 test ebx,1 ; check if buffer word aligned jz short checksum_combine ; if zf set, buffer word aligned ror ax,8 ; byte aligned--swap bytes back checksum_combine label proc ; "proc" so not scoped to function add ax,word ptr [esp + cksum] ; combine checksums pop esi ; restore nonvolatile register adc eax,0 ; pop ebx ; restore nonvolatile register stdRET tcpxsum
REFLAB macro pre,suf dd pre&suf endm
align 4 loop_entry label dword dd 0 TEMP=LOOP_UNROLLING*4 REPT LOOP_UNROLLING-1 TEMP=TEMP-4 reflab loop_entry_,%TEMP ENDM
stdENDP tcpxsum
ifndef NO_XMMI
LOOP_UNROLLING_BITS_XMMI equ 4 LOOP_UNROLLING_XMMI equ (1 SHL LOOP_UNROLLING_BITS_XMMI)
;VRSTEST EQU 0 ifdef VRSTEST ; ; Test tcpxsum_xmmi for correctness. tcksum equ 8[ebp] ; stack offset to initial checksum tbuf equ 12[ebp] ; stack offset to source address tlen equ 16[ebp] ; stack offset to length in words align cPublicProc tcpxsum_xmmi,3 ;int 3 push ebp mov ebp, esp push ebx push esi mov ebx, offset tcpxsum mov esi, offset tcpxsum_xmmi1 ; Get a "random" number .586p rdtsc .386p and eax, 10H jz old_then_new ; Swap which routine is called first push ebx mov ebx, esi pop esi old_then_new: ; Call the first routine push tlen push tbuf push tcksum call ebx ; Save the answer push eax ; Call the second routine push tlen push tbuf push tcksum call esi ; Check the answer cmp eax, [esp] jnz different_xsum ; Same answer, we are done pop eax pop esi pop ebx pop ebp stdRET tcpxsum_xmmi align ; Different answers, need to debug the problem different_xsum: ; Get both checksums onto the stack push eax ; ... and bugcheck ;EXTRNP _KeBugCheck,1,IMPORT ;stdCall _KeBugCheck, <0> again: int 3 jmp again stdENDP tcpxsum_xmmi endif
;++ ; ; ULONG ; tcpxsum_xmmi( ; IN ULONG cksum, ; IN PUCHAR buf, ; IN ULONG len ; ) ; ; Routine Description: ; ; This function computes the checksum of the specified buffer. ; It uses Processor's prefetch instruction. ; ; Arguments: ; ; cksum - Suppiles the initial checksum value, in 16-bit form, ; with the high word set to 0. ; ; buf - Supplies a pointer to the buffer to the checksum buffer. ; ; len - Supplies the length of the buffer in bytes. ; ; Return Value: ; ; The computed checksum in 32-bit two-partial-accumulators form, added to ; the initial checksum, is returned as the function value. ; ;--
cksum equ 12 ; stack offset to initial checksum buf equ 16 ; stack offset to source address len equ 20 ; stack offset to length in words
to_checksum_last_word_xmmi: jmp checksum_last_word_xmmi
to_checksum_done_xmmi: jmp checksum_done_xmmi
to_checksum_dword_loop_done_xmmi: jmp checksum_dword_loop_done_xmmi
ifdef VRSTEST cPublicProc tcpxsum_xmmi1,3 else cPublicProc tcpxsum_xmmi,3 endif
; FPO = 0 dwords locals allocated in prolog ; 3 dword parameters ; 2 bytes in prolog ; 2 registers saved ; 0 EBP is not used ; 0 frame type = FPO .FPO (0,3,2,2,0,0)
push ebx ; save nonvolatile register push esi ; save nonvolatile register
mov ecx,[esp + len] ; get length in bytes sub eax,eax ; clear computed checksum test ecx,ecx ; any bytes to checksum at all? jz short to_checksum_done_xmmi ; no bytes to checksum
; ; if the checksum buffer is not word aligned, then add the first byte of ; the buffer to the input checksum. ;
mov esi,[esp + buf] ; get source address sub edx,edx ; set up to load word into EDX below test esi,1 ; check if buffer word aligned jz short checksum_word_aligned ; if zf, buffer word aligned mov ah,[esi] ; get first byte (we know we'll have ; to swap at the end) inc esi ; increment buffer address dec ecx ; decrement number of bytes jz short to_checksum_done_xmmi ; if zf set, no more bytes
; ; If the buffer is not an even number of of bytes, then initialize ; the computed checksum with the last byte of the buffer. ;
checksum_word_aligned: ; shr ecx,1 ; convert to word count jnc short checksum_start ; if nc, even number of bytes mov al,[esi+ecx*2] ; initialize the computed checksum jz short to_checksum_done_xmmi ; if zf set, no more bytes
; ; Compute checksum in large blocks of dwords, with one partial word up front if ; necessary to get dword alignment, and another partial word at the end if ; needed. ;
; ; Compute checksum on the leading word, if that's necessary to get dword ; alignment. ;
checksum_start: ; test esi,02h ; check if source dword aligned jz short checksum_dword_aligned ; source is already dword aligned mov dx,[esi] ; get first word to checksum add esi,2 ; update source address add eax,edx ; update partial checksum ; (no carry is possible, because EAX ; and EDX are both 16-bit values) dec ecx ; count off this word (zero case gets ; picked up below)
; ; Checksum as many words as possible by processing a dword at a time. ;
checksum_dword_aligned: push ecx ; so we can tell if there's a trailing ; word later shr ecx,1 ; # of dwords to checksum jz short to_checksum_last_word_xmmi ; no dwords to checksum
mov edx,[esi] ; preload the first dword add esi,4 ; point to the next dword dec ecx ; count off the dword we just loaded jz short to_checksum_dword_loop_done_xmmi ; skip the loop if that was the only ; dword mov ebx,ecx ; EBX = # of dwords left to checksum add ecx,LOOP_UNROLLING_XMMI-1 ; round up loop count shr ecx,LOOP_UNROLLING_BITS_XMMI ; convert from word count to unrolled ; loop count and ebx,LOOP_UNROLLING_XMMI-1 ; # of partial dwords to do in first ; loop jz short checksum_dword_loop ; special-case when no partial loop, ; because fixup below doesn't work ; in that case (carry flag is ; cleared at this point, as required ; at loop entry) lea esi,[esi+ebx*4-(LOOP_UNROLLING_XMMI*4)] ; adjust buffer pointer back to ; compensate for hardwired displacement ; at loop entry point ; ***doesn't change carry flag*** jmp xmmi_loop_entry[ebx*4] ; enter the loop to do the first, ; partial iteration, after which we can ; just do 64-word blocks ; ***doesn't change carry flag***
checksum_dword_loop: ; prefetch the 32-byte cache line from [esi+0] db 0fH db 18H db 46H db 00H
; prefetch the 32-byte cache line from [esi+20h] db 0fH db 18H db 46H db 20H
; prefetch the 32-byte cache line from [esi+40h] db 0fH db 18H db 46H db 40H
; prefetch the 32-byte cache line from [esi+60h] db 0fH db 18H db 46H db 60H
DEFLAB macro pre,suf pre&suf: endm
TEMP=0 REPT LOOP_UNROLLING_XMMI deflab xmmi_loop_entry_,%TEMP adc eax,edx mov edx,[esi + TEMP] TEMP=TEMP+4 ENDM
checksum_dword_loop_end:
lea esi,[esi + LOOP_UNROLLING_XMMI * 4] ; update source address ; ***doesn't change carry flag*** dec ecx ; count off unrolled loop iteration ; ***doesn't change carry flag*** jnz checksum_dword_loop ; do more blocks
checksum_dword_loop_done_xmmi label proc adc eax,edx ; finish dword checksum mov edx,0 ; prepare to load trailing word adc eax,edx
; ; Compute checksum on the trailing word, if there is one. ; High word of EDX = 0 at this point ; Carry flag set iff there's a trailing word to do at this point ;
checksum_last_word_xmmi label proc ; "proc" so not scoped to function pop ecx ; get back word count test ecx,1 ; is there a trailing word? jz short checksum_done_xmmi; no trailing word add ax,[esi] ; add in the trailing word adc eax,0 ;
checksum_done_xmmi label proc ; "proc" so not scoped to function mov ecx,eax ; fold the checksum to 16 bits ror ecx,16 add eax,ecx mov ebx,[esp + buf] shr eax,16 test ebx,1 ; check if buffer word aligned jz short checksum_combine_xmmi ; if zf set, buffer word aligned ror ax,8 ; byte aligned--swap bytes back checksum_combine_xmmi label proc ; "proc" so not scoped to function add ax,word ptr [esp + cksum] ; combine checksums pop esi ; restore nonvolatile register adc eax,0 ; pop ebx ; restore nonvolatile register stdRET tcpxsum
REFLAB macro pre,suf dd pre&suf endm
align 4 xmmi_loop_entry label dword dd 0 TEMP=LOOP_UNROLLING_XMMI*4 REPT LOOP_UNROLLING_XMMI-1 TEMP=TEMP-4 reflab xmmi_loop_entry_,%TEMP ENDM
ifdef VRSTEST stdENDP tcpxsum_xmmi1 else stdENDP tcpxsum_xmmi endif
endif ; NO_XMMI
ifndef NO_OLD_FLUSHSLIST
;++ ; ; PSINGLE_LIST_ENTRY ; FASTCALL ; InterlockedFlushSList ( ; IN PSINGLE_LIST_ENTRY ListHead ; ) ; ; Routine Description: ; ; This function removes the entire list from a sequenced singly ; linked list so that access to the list is synchronized in an MP system. ; If there are no entries in the list, then a value of NULL is returned. ; Otherwise, the address of the entry at the top of the list is removed ; and returned as the function value and the list header is set to point ; to NULL. ; ; N.B. The cmpxchg8b instruction is only supported on some processors. ; If the host processor does not support this instruction, then ; then following code is patched to contain a jump to the normal ; pop entry code which has a compatible calling sequence and data ; structure. ; ; Arguments: ; ; (ecx) = ListHead - Supplies a pointer to the sequenced listhead from ; which the list is to be flushed. ; ; Return Value: ; ; The address of the entire current list, or NULL if the list is ; empty. ; ;--
cPublicProc InterlockedFlushSList, 1
; ; Save nonvolatile registers and read the listhead sequence number followed ; by the listhead next link. ; ; N.B. These two dwords MUST be read exactly in this order. ;
push ecx
push ebx ; save nonvolatile registers push ebp ; mov ecx, [esp+16] mov ebp, ecx ; save listhead address mov edx, [ebp] + 4 ; get current sequence number mov eax, [ebp] + 0 ; get current next link
; ; N.B. The following code is the retry code should the compare ; part of the compare exchange operation fail ; ; If the list is empty, then there is nothing that can be removed. ;
Efls10: or eax, eax ; check if list is empty jz short Efls20 ; if z set, list is empty mov ecx, 0 ; clear sequence number and depth mov ebx, 0 ; clear successor entry pointer
.586 ifndef NT_UP
lock cmpxchg8b qword ptr [ebp] ; compare and exchange
else
cmpxchg8b qword ptr [ebp] ; compare and exchange
endif .386
jnz short Efls10 ; if z clear, exchange failed
; ; Restore nonvolatile registers and return result. ;
Efls20: pop ebp ; restore nonvolatile registers pop ebx ; pop ecx
stdRET InterlockedFlushSList
stdENDP InterlockedFlushSList
endif ; NO_OLD_FLUSHSLIST
end
|