title  "Compute Checksum"
;++
;
; Copyright (c) Microsoft Corporation.  All rights reserved.
;
; Module Name:
;
;   xsum.amd
;
; Abstract:
;
;   This module implements the platform specific function to compute the
;   checksum of a buffer.
;
; Author:
;
;   David N. Cutler (davec) 6-Jul-2000
;
; Environment:
;
;   Any mode.
;
;--

include ksamd64.inc

;++
;
; ULONG
; tcpxsum(
;    IN ULONG Checksum,
;    IN PUCHAR Source,
;    IN ULONG Length
;    )
;
; Routine Description:
;
;    This function computes the checksum of the specified buffer and combines
;    the computed checksum with the specified checksum.
;
; Arguments:
;
;    Checksum (ecx) - Suppiles the initial checksum value, in 16-bit form,
;            with the high word set to 0.
;
;    Source (rdx) - Supplies a pointer to the checksum buffer.
;
;    Length (r8d) - Supplies the length of the buffer in bytes.
;
; Return Value:
;
;    The computed checksum, in 16-bit form, with the high word set to 0.
;
;--

        NESTED_ENTRY tcpxsum, _TEXT$00

        push_reg rbx                    ; save nonvolatile register

        END_PROLOGUE

        mov     r11, rdx                ; save initial buffer address
        mov     bx, cx                  ; save initial checksum
        mov     r10, rdx                ; set checksum buffer address
        mov     ecx, r8d                ; set buffer length
        xor     eax, eax                ; clear computed checksum
        test    ecx, ecx                ; test if any bytes to checksum
        jz      combine                 ; if z, no bytes to checksum

;
; If the checksum buffer is not word aligned, then add the first byte of
; the buffer to the checksum.
;
; N.B. First buffer address check is done using rdx rather than r10 so
;      the register ah can be used.
;

        test    dl, 1                   ; test if buffer word aligned
        jz      short word_aligned      ; if z, buffer word aligned
        mov     ah, [rdx]               ; get first byte of checksum
        inc     r10                     ; increment buffer address
        dec     ecx                     ; decrement number of bytes
        jz      done                    ; if z set, no more bytes

;
; If the buffer is not an even number of bytes, then add the last byte of
; the buffer to the checksum.
;

word_aligned:                           ;
        shr     ecx, 1                  ; convert to word count
        jnc     short word_count        ; if nc, even number of bytes
        mov     al, [r10][rcx * 2]      ; initialize the computed checksum
        jz      done                    ; if z set, no more bytes

;
; If the buffer is not quadword aligned, then add words to the checksum until
; the buffer is quadword aligned.
;

word_count:                             ;
        test    r10b, 6                 ; test if buffer quadword aligned
        jz      short qword_aligned     ; if z, buffer quadword aligned
qword_align:                            ;
        add     ax, [r10]               ; add next word of checksum
        adc     eax, 0                  ; propagate carry
        add     r10, 2                  ; increment buffer address
        dec     ecx                     ; decrement number of words
        jz      done                    ; if z, no more words
        test    r10b, 6                 ; test if buffer qword aligned
        jnz     short qword_align       ; if nz, buffer not qword aligned

;
; Compute checksum in large blocks of qwords.
;

qword_aligned:                          ;
        mov     edx, ecx                ; copy number or words remaining
        shr     edx, 2                  ; compute number of quadwords
        jz      residual_words          ; if z, no quadwords to checksum
        mov     r8d, edx                ; compute number of loop iterations
        shr     r8d, 4                  ;
        and     edx, 16 - 1             ; isolate partial loop iteration
        jz      short checksum_loop     ; if z, no partial loop iteration
        sub     rdx, 16                 ; compute negative loop top offset
        lea     r10, [r10][rdx * 8]     ; bias initial buffer address
        neg     rdx                     ; compute positive loop top offset
        add     r8d, 1                  ; increment loop iteration count

        ;
        ; ASSEMBLER WORKAROUND - when fixed, remove the following data
        ; byte
        ;

        db      04ch

        lea     r9, checksum_start      ; get address of checksum array
        lea     r9, [r9][rdx * 4]       ; compute initial iteration address
        jmp     r9                      ; start checksum

;
; Checksum quadwords.
;
; N.B. This loop is entered with carry clear.
;

        align   16
checksum_loop:                          ;
        prefetchnta 0[r10]              ; prefetch start of 128-byte block
        prefetchnta 120[r10]            ; prefetch end of 128-byte block

;
; N.B. The first 16 of following instructions are exactly 4 bytes long.
;

checksum_start:

;       adc     rax, 0[r10]             ; Compute checksum
                                        ;
        db      049h                    ; Manually encode the 4-byte
        db      013h                    ; version of the instruction
        db      042h                    ;
        db      000h                    ; adc rax, 0[r10]

        adc     rax, 8[r10]             ;
        adc     rax, 16[r10]            ;
        adc     rax, 24[r10]            ;
        adc     rax, 32[r10]            ;
        adc     rax, 40[r10]            ;
        adc     rax, 48[r10]            ;
        adc     rax, 56[r10]            ;
        adc     rax, 64[r10]            ;
        adc     rax, 72[r10]            ;
        adc     rax, 80[r10]            ;
        adc     rax, 88[r10]            ;
        adc     rax, 96[r10]            ;
        adc     rax, 104[r10]           ;
        adc     rax, 112[r10]           ;
        adc     rax, 120[r10]           ;

        .errnz  (($ - checksum_start) - (4 * 16))

        lea     r10, 128[r10]           ; update source address
        dec     r8d                     ; decrement loop count
        jnz     short checksum_loop     ; if nz, more iterations
        adc     rax, 0                  ; propagate last carry

;
; Compute checksum of residual words.
;

residual_words:                         ;
        and     ecx, 3                  ; isolate residual words
        jz      short done              ; if z, no residual words
add_word:                               ;
        add     ax, [r10]               ; add word to checksum
        adc     ax, 0                   ; propagate carry
        add     r10, 2                  ; increment buffer address
        dec     ecx                     ; decrement word count
        jnz     short add_word          ; if nz, more words remaining

;
; Fold the computed checksum to 32-bits and then to 16-bits.
;

done:                                   ;
        mov     rcx, rax                ; fold the checksum to 32-bits
        ror     rcx, 32                 ; swap high and low dwords
        add     rax, rcx                ; produce sum + carry in high 32-bits
        shr     rax, 32                 ; extract 32-bit checksum
        mov     ecx, eax                ; fold the checksum to 16-bits
        ror     ecx, 16                 ; swap high and low words
        add     eax, ecx                ; produce sum + carry in high 16-bits
        shr     eax, 16                 ; extract 16-bit check sum
        test    r11b, 1                 ; test if buffer word aligned
        jz      short combine           ; if z set, buffer word aligned
        ror     ax, 8                   ; swap checksum bytes

;
; Combine the input checksum with the computed checksum.
;

combine:                                ;
        add     ax, bx                  ; combine checksums
        adc     eax, 0                  ; add carry to low 16-bits
        pop     rbx                     ; restore nonvolatile register
        retq                            ; return

        NESTED_END tcpxsum, _TEXT$00

        end