windows-server-2003/base/published/xsum_axp


								//      TITLE("Compute Checksum")

								//++

								//

								// Copyright (c) Microsoft Corporation.  All rights reserved.

								//

								// Module Name:

								//

								//    xsum.s

								//

								// Abstract:

								//

								//    This module implements a function to compute the checksum of a buffer.

								//

								// Author:

								//

								//    John Vert (jvert) 11-Jul-1994

								//

								// Environment:

								//

								// Revision History:

								//

								//--


								#include "ksalpha.h"


								        SBTTL("Compute Checksum")

								//++

								//

								// ULONG

								// tcpxsum (

								//    IN ULONG Checksum,

								//    IN PUSHORT Source,

								//    IN ULONG Length

								//    )

								//

								// Routine Description:

								//

								//    This function computes the checksum of the specified buffer.

								//

								// Arguments:

								//

								//    Checksum (a0) - Supplies the initial checksum value.

								//

								//    Source (a1) - Supplies a pointer to the checksum buffer

								//

								//    Length (a2) - Supplies the length of the buffer in bytes.

								//

								// Return Value:

								//

								//    The computed checksum is returned as the function value.

								//

								//--


								        LEAF_ENTRY(tcpxsum)

								//
								// Register usage:
								//
								//    a0 - initial checksum, reduced to its low 32 bits
								//    a1 - running buffer address
								//    a2 - count of bytes still to be summed
								//    t6 - original buffer address; bit 0 selects the final byte swap
								//    v0 - 64-bit partial sum, carries folded back in after every add
								//
								// The buffer is read only when the byte count is nonzero, and then only
								// in aligned quadwords that each contain at least one byte of the buffer.
								//

								        zap     a0, 0xf0, a0            // clear high half of a0
								        bis     a1, zero, t6            // save initial buffer address
								        bis     zero, zero, v0          // clear accumulated checksum

								//
								// A zero-length buffer contributes nothing.  Test for it before the
								// first load so that the source address is never dereferenced when
								// there is nothing to sum.
								//
								        beq     a2, 65f                 // if zero bytes, don't touch the buffer

								//
								// Check if the buffer is quadword aligned.
								//
								// If the buffer is not quadword aligned, then add the leading bytes to
								// the checksum.
								//
								        ldq_u   t0, 0(a1)               // get containing quadword of first part
								        blbc    a1, 10f                 // if low bit clear, address is word aligned
								        extbl   t0, a1, t1              // get leading byte
								        sll     t1, 8, v0               // place it in byte 1; the swap at 60 moves it to byte 0
								        addq    a1, 1, a1               // increment buffer to first full word
								        subq    a2, 1, a2               // decrement byte count

								10:
								//
								// a1 is even here, so bits 1-2 alone decide quadword alignment.
								//
								        and     a1, 6, t2               // check if buffer quadword aligned
								        beq     t2, 20f                 // if eq, quadword aligned
								        extql   t0, t2, t0              // shift out bytes below a1, keep the rest of the quadword
								        and     a1, 7, t3               // t3 = offset of a1 within its quadword
								        subq    zero, t3, t3
								        addq    t3, 8, t3               // t3 = 8 - offset = bytes held in t0
								        addq    a1, 8, a1               // advance buffer address to next qword
								        bic     a1, 7, a1               //
								        subq    a2, t3, t2              // t2 = bytes left after this partial quadword
								        blt     t2, 55f                 // if ltz, buffer ends inside t0; mask it at 55

								        addq    v0, t0, v0              // add bytes to partial checksum
								        cmpult  v0, t0, t1              // generate carry
								        addq    t1, v0, v0              // add carry back into checksum

								        bis     t2, zero, a2            // reduce count of bytes to checksum
								        beq     t2, 60f                 // if eq, no more bytes

								20:
								//
								// Compute the checksum in 64-byte blocks.
								//
								// When the quadword byte count is not a multiple of 64, the first block
								// is a partial one: the buffer address is biased downward by the number
								// of bytes missing from that block and control is dispatched part way
								// into the code vector at 30.
								//
								        bic     a2, 7, t4               // t4 = bytes in whole quadwords
								        beq     t4, 40f                 // if eq, no quadwords to checksum
								        subq    zero, t4, t2            // compute negative of byte count
								        and     t2, 15 << 2, t3         // t3 = bytes to skip in the first block (multiple of 8)
								        ldq     t0, 0(a1)               // get first quadword to checksum
								        beq     t3, 35f                 // if eq, only full 64-byte blocks
								        subq    a1, t3, a1              // bias buffer address by offset
								        bic     t4, 64-1, t4            // t4 = bytes in the full blocks that follow
								        lda     t2, 30f                 // get base address of code vector
								        addl    t3, t3, t3              // 16 code bytes per 8 data bytes
								        addq    t3, t2, t2              // compute code vector offset
								        bis     t0, zero, t1            // copy first quadword; entries alternate between t0 and t1
								        jmp     (t2)                    // dispatch

								30:
								//
								// The following code vector computes the checksum of a 64-byte block.
								//
								// Each quadword takes exactly four instructions, so the dispatch above
								// depends on this layout; the assembler must not reorder it.  The first
								// group is never a dispatch target because a zero offset goes to 35.
								//
								.set noreorder
								        ldq     t1, 8(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 16(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 24(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 32(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 40(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 48(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 56(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        addq    a1, 64, a1
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0
								.set reorder

								        beq     t4, 40f                 // if zero, no full blocks remain

								35:
								        ldq     t0, 0(a1)
								//
								// The following loop is allowed to be reordered by the assembler for
								// optimal scheduling.  It is never branched into.
								//
								        subq    t4, 64, t4              // reduce byte count by one 64-byte block

								        ldq     t1, 8(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 16(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 24(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 32(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 40(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        ldq     t0, 48(a1)
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        ldq     t1, 56(a1)
								        addq    v0, t0, v0
								        cmpult  v0, t0, t2
								        addq    v0, t2, v0

								        addq    a1, 64, a1
								        addq    v0, t1, v0
								        cmpult  v0, t1, t2
								        addq    v0, t2, v0

								        bne     t4, 35b                 // if ne zero, not end of block

								40:
								//
								// Check for any remaining bytes.
								//
								        and     a2, 7, a2               // isolate residual bytes
								        beq     a2, 60f                 // if eq, no residual bytes
								50:
								//
								// Checksum remaining bytes.
								//
								// The technique we use here is to load the final quadword, then
								// zero out the bytes that are not included.
								//
								        ldq     t0, 0(a1)               // get quadword surrounding remainder
								55:
								//
								// On entry t0 holds the final bytes in its low-order end and a2 is the
								// number of them that belong to the buffer.
								//
								        ornot   zero, zero, t1          // get FF mask
								        sll     t1, a2, t2              // low 8 bits now have bits a2..7 set
								        zap     t0, t2, t0              // zero out bytes past end of buffer
								        addq    v0, t0, v0              // add quadword to partial checksum
								        cmpult  v0, t0, t1              // generate carry
								        addq    t1, v0, v0              // add carry back into checksum
								60:
								//
								// Byte swap the 64-bit checksum if the start of the buffer was not word
								// aligned.
								//
								        blbc    t6, 65f
								        zap     v0, 0xAA, t0            // isolate even bytes
								        sll     t0, 8, t0               // shift even bytes into odd positions
								        srl     v0, 8, t1               // shift odd bytes into even positions
								        zap     t1, 0xAA, t1            // isolate odd bytes
								        bis     t0, t1, v0              // merge bytes back together

								65:
								//
								// Add computed checksum to original checksum, and fold the 64-bit
								// result down to 16 bits.
								//
								        addq    v0, a0, v0              // add computed checksum to original
								        cmpult  v0, a0, t0              // generate carry
								        addq    v0, t0, v0              // add carry back into checksum

								//
								// Swap the longwords in order to sum two longwords and their carry in
								// one add.
								//
								        sll     v0, 32, t0              // shift low longword into high
								        srl     v0, 32, t1              // shift high longword into low
								        bis     t1, t0, t5              // merge back together

								        addq    v0, t5, t0              // produce sum + carry in high longword
								        srl     t0, 32, t1              // shift back down to low half
								//
								// Swap the words in order to sum two words and their carry in one add.
								//
								        sll     t1, 16, t2              // shift low word into high
								        srl     t1, 16, t3              // shift high word into low
								        bis     t2, t3, t4              // merge back together
								        addq    t4, t1, t2              // produce sum + carry in bits 16..31
								        extwl   t2, 2, v0               // extract result.
								        ret     zero, (ra)              // return

								        .end    tcpxsum