//      TITLE("Compute Checksum")
//++
//
// Copyright (c) Microsoft Corporation.  All rights reserved.
//
// Module Name:
//
//    xsum.s
//
// Abstract:
//
//    This module implements a function to compute the checksum of a buffer.
//
// Author:
//
//    John Vert (jvert) 11-Jul-1994
//
// Environment:
//
// Revision History:
//
//--

#include "ksalpha.h"


        SBTTL("Compute Checksum")
//++
//
// ULONG
// tcpxsum (
//    IN ULONG Checksum,
//    IN PUSHORT Source,
//    IN ULONG Length
//    )
//
// Routine Description:
//
//    This function computes the checksum of the specified buffer.
//
// Arguments:
//
//    Checksum (a0) - Supplies the initial checksum value.
//
//    Source (a1) - Supplies a pointer to the checksum buffer
//
//    Length (a2) - Supplies the length of the buffer in bytes.
//
// Return Value:
//
//    The computed checksum is returned as the function value.
//
//--

        LEAF_ENTRY(tcpxsum)

//
// Register usage:
//
//    a0 - initial checksum (low 32 bits only, high half cleared below)
//    a1 - current buffer address
//    a2 - count of bytes remaining to checksum
//    v0 - 64-bit running sum (with carries folded back in)
//    t6 - original buffer address (low bit tested later to decide on
//         a final byte swap for odd-aligned buffers)
//    t0-t5 - scratch
//

        zap     a0, 0xf0, a0            // clear high half of a0
        bis     a1, zero, t6            // save initial buffer address
        bis     zero, zero, v0          // clear accumulated checksum

//
// Check if the buffer is quadword aligned.
//
// If the buffer is not quadword aligned, then add the leading words to the
// checksum.
//
        ldq_u   t0, 0(a1)               // get containing quadword of first part
        blbc    a1, 10f                 // check for word alignment
        beq     a2, 65f                 // if zero bytes, don't do anything
        extbl   t0, a1, t1              // get leading byte
        sll     t1, 8, v0               // shift it to correct spot for later byte swap
        addq    a1, 1, a1               // increment buffer to first full word
        subq    a2, 1, a2               // decrement byte count

10:
        and     a1, 6, t2               // check if buffer quadword aligned
        beq     t2, 20f                 // if eq, quadword aligned
        extql   t0, t2, t0              // extract bytes to checksum
        and     a1, 7, t3               // compute bytes summed
        subq    zero, t3, t3            // t3 = -(a1 & 7)
        addq    t3, 8, t3               // t3 = 8 - (a1 & 7) = bytes in partial qword
        addq    a1, 8, a1               // advance buffer address to next qword
        bic     a1, 7, a1               // round buffer address down to qword boundary
        subq    a2, t3, t2              // t2 = bytes remaining after partial qword
        blt     t2, 55f                 // if ltz, too many, jump to residual code

        addq    v0, t0, v0              // add bytes to partial checksum
        cmpult  v0, t0, t1              // generate carry
        addq    t1, v0, v0              // add carry back into checksum

        bis     t2, zero, a2            // reduce count of bytes to checksum
        beq     t2, 60f                 // if eq, no more bytes

20:
//
// Compute the checksum in 64-byte blocks
//
        bic     a2, 7, t4               // subtract out residual bytes
        beq     t4, 40f                 // if eq, no quadwords to checksum
        subq    zero, t4, t2            // compute negative of byte count
        and     t2, 15 << 2, t3         // compute bytes in first iteration
        ldq     t0, 0(a1)               // get first quadword to checksum
        beq     t3, 35f                 // if eq, full 64-byte block
        subq    a1, t3, a1              // bias buffer address by offset
        bic     t4, 64-1, t4            // subtract out bytes in first iteration
        lda     t2, 30f                 // get base address of code vector
        addl    t3, t3, t3              // each 8 data bytes = 4 insts (16 code bytes), so offset = t3 * 2
        addq    t3, t2, t2              // compute code vector offset
        bis     t0, zero, t1            // copy first quadword to checksum
        jmp     (t2)                    // dispatch


30:
//
// The following code vector computes the checksum of a 64-byte block.
// Each 4-instruction group adds one quadword into v0 with end-around
// carry; the dispatch above may enter at any group boundary.  Order
// must be preserved exactly, hence noreorder.
//
.set noreorder
        ldq     t1, 8(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 16(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 24(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 32(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 40(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 48(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 56(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        addq    a1, 64, a1
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0
.set reorder

        beq     t4, 40f                 // if zero, end of block

35:
        ldq     t0, 0(a1)
//
// The following loop is allowed to be reordered by the assembler for
// optimal scheduling.  It is never branched into.
//
        subq    t4, 64, t4              // reduce byte count of longwords

        ldq     t1, 8(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 16(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 24(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 32(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 40(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 48(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 56(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        addq    a1, 64, a1
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        bne     t4, 35b                 // if ne zero, not end of block

40:
//
// Check for any remaining bytes.
//
        and     a2, 7, a2               // isolate residual bytes
        beq     a2, 60f                 // if eq, no residual bytes
50:
//
// Checksum remaining bytes.
//
// The technique we use here is to load the final quadword, then
// zero out the bytes that are not included.
//
        ldq     t0, 0(a1)               // get quadword surrounding remainder
55:
// At this point a2 < 8 is the count of valid low-order bytes in t0
// (the 55f entry from above arrives with t0 already extracted).
        ornot   zero, zero, t1          // get FF mask
        sll     t1, a2, t2              // shift to produce byte mask
        zap     t0, t2, t0              // zero out bytes past end of buffer
        addq    v0, t0, v0              // add quadword to partial checksum
        cmpult  v0, t0, t1              // generate carry
        addq    t1, v0, v0              // add carry back into checksum
60:
//
// Byte swap the 64-bit checksum if the start of the buffer was not word aligned
// (the ones-complement sum is byte-order insensitive within each 16-bit word,
// so an odd starting address just swaps the two byte lanes).
//
        blbc    t6, 65f                 // if low bit clear, buffer was word aligned
        zap     v0, 0xAA, t0            // isolate even bytes
        sll     t0, 8, t0               // shift even bytes into odd positions
        srl     v0, 8, t1               // shift odd bytes into even positions
        zap     t1, 0xAA, t1            // isolate odd bytes
        bis     t0, t1, v0              // merge bytes back together

65:
//
// add computed checksum to original checksum, and fold the 64-bit
// result down to 16 bits.
//
        addq    v0, a0, v0              // add computed checksum to original
        cmpult  v0, a0, t0              // generate carry
        addq    v0, t0, v0              // add carry back into checksum

//
// swap the longwords in order to sum two longwords and their carry in one add.
//
        sll     v0, 32, t0              // shift low longword into high
        srl     v0, 32, t1              // shift high longword into low
        bis     t1, t0, t5              // merge back together

        addq    v0, t5, t0              // produce sum + carry in high longword
        srl     t0, 32, t1              // shift back down to low half
//
// swap the words in order to sum two words and their carry in one add
//
        sll     t1, 16, t2              // shift high word into low
        srl     t1, 16, t3              // shift low word into high
        bis     t2, t3, t4              // merge back together
        addq    t4, t1, t2              // produce sum and carry in high word
        extwl   t2, 2, v0               // extract result.
        ret     zero, (ra)              // return

        .end    tcpxsum