#include "ksia64.h"
//++
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Routine:
//
//    ULONG
//    tcpxsum(
//        IN ULONG ChkSum,
//        IN PUCHAR Buffer,
//        IN ULONG BufferLength
//        )
//
// Routine Description:
//
//    This function computes the checksum of the specified buffer.
//
// Arguments:
//
//    a0: supplies the initial checksum value, in 16-bit form,
//        with the high word set to 0.
//
//    a1: supplies a pointer to the buffer.
//
//    a2: supplies the length of the buffer in bytes.
//
// Return Value:
//
//    The computed checksum, in 32-bit two-partial-accumulators form,
//    added to the initial checksum, is returned as the function value.
//
// Author:
//
//    Thierry Fevrier (Hewlett-Packard) for Microsoft Corporation.
//
// Notes:
//
//    !!WARNING!! - Thierry - 07/10/2000
//    The following code has been carefully optimized.
//    Please consider this before making any modifications... Thank you.
//
//--
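//
// For orientation only: a rough C sketch of the ones'-complement summing and
// folding this routine performs, kept in comments so the file still assembles.
// The helper name is hypothetical, the NT types (ULONG, USHORT, PUCHAR) are
// assumed from the usual headers, a little-endian view of memory is assumed
// (as on IA-64), and the odd-start-address byte swap handled by the code
// below is not modeled here.
//
//      ULONG
//      tcpxsum_sketch(ULONG ChkSum, PUCHAR Buffer, ULONG BufferLength)
//      {
//          ULONGLONG Sum = 0;
//
//          // Add the buffer as 16-bit words in memory order.
//          while (BufferLength > 1) {
//              Sum += *(USHORT UNALIGNED *)Buffer;
//              Buffer += 2;
//              BufferLength -= 2;
//          }
//
//          // Pick up a trailing odd byte, if any, in the low position.
//          if (BufferLength != 0) {
//              Sum += *Buffer;
//          }
//
//          // Fold the wide accumulator into 16 bits, add the seed, then
//          // fold once more, mirroring the order used by the code below.
//          while (Sum >> 16) {
//              Sum = (Sum & 0xffff) + (Sum >> 16);
//          }
//          Sum += ChkSum;
//          while (Sum >> 16) {
//              Sum = (Sum & 0xffff) + (Sum >> 16);
//          }
//
//          return (ULONG)Sum;
//      }
//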
LEAF_ENTRY(tcpxsum)
        and     t1 = -4, a1
        and     t2 = -4, a1
        brp.dptk.imp xUA, UAbrlabel

        and     t17 = -8, a1                    // mod 8 the address
        cmp.gtu pt2, pt3 = 96, a2               // is size < 96?
        ;;
        add     t3 = 8, t2

(pt3)   ld8     t16 = [t17], 64                 // load first data needed for loop
        cmp.eq  pt0, pt1 = 20, a2               // is length 20 bytes?
        nop.i   0

        mov     t4 = 128;;
        nop.m   0
        cmp.gtu pt2 = a2, t4;;                  // is a2 > 128?

(pt2)   lfetch  [t17], 64;;                     // if yes, you can prefetch
(pt2)   lfetch  [t17], 64                       // do prefetches of data needed
        nop.i   0;;

        nop.m   0
        nop.i   0
(pt1)   br.cond.dptk.few x32start;;
        ld4     t11 = [t2], 4
        tbit.nz pt9 = a1, 0
        nop.b   0

        ld4     t12 = [t3], 4
        cmp.ltu pt1 = t1, a1                    // if not 4 byte aligned
(pt1)   br.cond.dpnt.few x32start;;

        ld4     t13 = [t2], 4
        ld4     t14 = [t3], 4
        nop.i   0;;

        ld4     t15 = [t3]
        add     t20 = t11, t12
        add     t21 = t13, t14;;

        add     t20 = t20, t21;;
        add     t20 = t20, t15
        nop.i   0;;
xfold:
        addl    t10 = 0xffff, zero              // fold 64 bits into 16 bits
        dep     t0 = -1, zero, 0, 32
        nop.i   0;;

        and     t1 = t20, t0
        extr.u  t2 = t20, 32, 32;;
        add     t20 = t1, t2;;

        and     t1 = t20, t0
        extr.u  t2 = t20, 32, 32;;
        add     t20 = t1, t2;;

        and     t2 = t20, t10
        extr.u  t1 = t20, 16, 16;;
        add     t20 = t1, t2;;

        and     t2 = t20, t10
        extr.u  t1 = t20, 16, 1;;
        add     t20 = t1, t2;;

(pt9)   nop.m   0                               // swap bytes if necessary
(pt9)   extr.u  t1 = t20, 8, 8
(pt9)   nop.i   0;;

(pt9)   nop.m   0
(pt9)   dep     t20 = t20, t1, 8, 8
(pt9)   nop.i   0;;

        add     t20 = a0, t20                   // add seed, fold again
        nop.i   0
        nop.i   0;;

        extr.u  t1 = t20, 32, 1
        extr.u  t2 = t20, 0, 32;;
        add     t20 = t1, t2;;

        and     t1 = t20, t10
        extr.u  t2 = t20, 16, 16;;
        add     t20 = t1, t2;;

        and     t1 = t20, t10
        extr.u  t2 = t20, 16, 1;;
        add     t20 = t1, t2;;

        add     v0 = zero, t20
        nop.i   0
        br.ret.sptk.few b0;;
x32start: // not 20 bytes
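        // General case setup: t1 = 8-byte-aligned start address,
        // t10 = one past the end of the buffer, t20 = running sum,
        // pt9 = buffer starts at an odd address (if so, the 16-bit
        // result is byte swapped in xfold).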
        and     t1 = -8, a1
        cmp.eq  pt3 = 1, zero
        cmp.eq  pt4 = 1, zero

        add     t10 = a1, a2
        mov     t20 = zero
        tbit.nz pt9 = a1, 0;;

        cmp.ltu pt1 = t1, a1
        brp.sptk.imp x32startA, x32Abrlabel
UAbrlabel:
(pt1)   br.cond.dptk.few xUA;;
x32startA: // now it is 8 byte aligned
        and     t10 = -8, t10
        dep     t9 = zero, a2, 0, 6             // make last 6 bits of count 0
                                                // 6 bits => 64 = # bytes consumed
                                                // in one iteration
        adds    t2 = 8, t1;;

        cmp.gtu pt2 = 96, a2                    // count < 96
        add     t5 = t1, t9
(pt2)   br.cond.dpnt.few xLT32;;

        ld8     t3 = [t1], 16                   // initial load can be eliminated.  It may
                                                // no longer be valid if alignment occurred;
                                                // it was there to provide order
        mov     t4 = 128;;
        cmp.gtu pt2 = a2, t4;;                  // is a2 > 128?

        ld8     t4 = [t2], 16
(pt2)   lfetch  [t17], 64
        mov     t14 = zero;;

(pt2)   lfetch  [t17], 64
        mov     t11 = zero
        mov     t13 = zero

        ld8     t18 = [t1], 16
        ld8     t19 = [t2], 16
        mov     t12 = zero;;
x32loop:                                        // t5 = address to stop fetching at
                                                // t17 = next addr to prefetch
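        // Each iteration consumes 64 bytes through the two pointers
        // t1 and t2, keeping four independent partial sums (t11-t14)
        // so the adds can issue in parallel; every cmp.ltu/adds pair
        // folds the carry out of the preceding add back into its
        // accumulator (end-around carry).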
        ld8     t6 = [t1], 16                   // modified main loop; unrolled a little more
                                                // and using prefetches
        ld8     t7 = [t2], 16
        add     t11 = t11, t3
        add     t12 = t12, t4
        add     t13 = t13, t18
        add     t14 = t14, t19;;

        ld8     t8 = [t1], 16
        ld8     t9 = [t2], 16
        cmp.ltu pt1 = t11, t3

        cmp.ltu pt2 = t12, t4
        cmp.ltu pt3 = t13, t18
        cmp.ltu pt4 = t14, t19;;

        cmp.ltu pt0 = t1, t5
        cmp.ltu pt5 = t17, t5
(pt1)   adds    t11 = 1, t11

(pt2)   adds    t12 = 1, t12
(pt3)   adds    t13 = 1, t13
(pt4)   adds    t14 = 1, t14;;

(pt0)   ld8     t3 = [t1], 16
(pt5)   lfetch  [t17], 64
        add     t11 = t11, t6
        add     t12 = t12, t7
        add     t13 = t13, t8
        add     t14 = t14, t9;;

(pt0)   ld8     t4 = [t2], 16
(pt0)   ld8     t18 = [t1], 16
        cmp.ltu pt1 = t11, t6

        cmp.ltu pt2 = t12, t7
        cmp.ltu pt3 = t13, t8
        cmp.ltu pt4 = t14, t9;;

(pt0)   ld8     t19 = [t2], 16
(pt1)   adds    t11 = 1, t11
(pt2)   adds    t12 = 1, t12

(pt3)   adds    t13 = 1, t13
(pt4)   adds    t14 = 1, t14
(pt0)   br.cond.dptk.many x32loop;;

        // merge parallel adds
        add     t21 = t11, t12;;
        nop.m   0
        cmp.ltu pt8 = t21, t11;;

(pt8)   adds    t21 = 1, t21;;
        nop.m   0
        add     t20 = t20, t21;;

        cmp.ltu pt1 = t20, t21;;
        add     t21 = t13, t14
(pt1)   adds    t20 = 1, t20;;

        cmp.ltu pt2 = t21, t13
        nop.i   0;;
(pt2)   adds    t21 = 1, t21;;

        add     t20 = t20, t21
        nop.i   0;;
        cmp.ltu pt1 = t20, t21;;

(pt1)   adds    t20 = 1, t20
        nop.i   0
        nop.i   0
xLT32: // < 32
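        // Add the remaining whole 8-byte words one at a time, with
        // end-around carry, until t1 reaches the aligned end (t10).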
        nop.m   0
        cmp.ltu pt0, pt1 = t1, t10
(pt1)   br.cond.dpnt.few xtail

        ld8     t11 = [t1], 8;;
        add     t20 = t20, t11
        nop.i   0;;

        cmp.ltu pt0 = t20, t11;;
(pt0)   adds    t20 = 1, t20
        nop.i   0;;

        nop.m   0
        nop.f   0
        br.cond.sptk.many xLT32
xtail: // < 8
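        // Fewer than 8 bytes remain: load the last aligned word and
        // mask off the bytes that lie past the end of the buffer
        // before adding it in.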
        and     t5 = 7, a2;;
        cmp.eq  pt0 = zero, t5
        nop.i   0

        nop.m   0
        nop.f   0
(pt0)   br.cond.sptk.many xfold

        ld8     t11 = [t1]
        sub     t6 = 8, t5
        adds    t7 = -1, zero;;

        nop.m   0
        shl     t6 = t6, 3
        nop.b   0;;

        nop.m   0
        shr.u   t7 = t7, t6;;
        and     t11 = t11, t7;;

        add     t20 = t20, t11
        nop.i   0;;
        cmp.ltu pt0 = t20, t11;;

(pt0)   adds    t20 = 1, t20
        nop.f   0
        br.cond.sptk.many xfold
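        // Unaligned start: load the aligned word containing the first
        // byte, mask off the bytes that precede the buffer (and, for
        // very short buffers, those past its end), seed the running
        // sum with the remainder, adjust a1/a2, and rejoin the aligned
        // path at x32startA.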
xUA:                                            // unaligned
        and     t5 = 7, a1
        dep     t1 = zero, a1, 0, 3
        adds    t6 = -1, zero;;

        ld8     t11 = [t1], 8
        sub     t7 = 8, t5;;
        cmp.ltu pt0, pt1 = a2, t7;;

(pt0)   sub     t9 = t7, a2
        shl     t8 = t5, 3;;
(pt0)   shl     t12 = t9, 3;;

        nop.m   0
(pt0)   shr.u   t14 = t6, t12
        shl     t13 = t6, t8;;

        and     t20 = t11, t13;;
(pt0)   and     t20 = t20, t14
(pt0)   mov     a2 = zero

(pt1)   sub     a2 = a2, t7
        mov     a1 = t1
x32Abrlabel:
        br.cond.sptk.many x32startA

LEAF_EXIT(tcpxsum)