Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

341 lines
7.6 KiB

  1. #include "ksia64.h"
  2. //++
  3. //
  4. // Copyright (c) Microsoft Corporation. All rights reserved.
  5. //
  6. // Routine:
  7. //
  8. // ULONG
  9. // tcpxsum(
  10. // IN ULONG ChkSum,
  11. // IN PUCHAR Buffer,
  12. // IN ULONG BufferLength
  13. // )
  14. //
  15. // Routine Description:
  16. //
  17. // This function computes the checksum of the specified buffer.
  18. //
  19. // Arguments:
  20. //
  21. // a0: supplies the initial checksum value, in 16-bit form,
  22. // with the high word set to 0.
  23. //
  24. // a1: supplies a pointer to the buffer.
  25. //
  26. // a2: supplies the length of the buffer in bytes.
  27. //
  28. //
  29. // Return Value:
  30. //
  31. // The checksum of the buffer, added to the initial checksum and
  32. // folded to 16 bits, is returned as the function value.
  33. //
  34. // Author:
  35. //
  36. // Thierry Fevrier (Hewlett-Packard) for Microsoft Corporation.
  37. //
  38. // Notes:
  39. //
  40. // !!WARNING!! - Thierry - 07/10/2000
  41. // The following code has been carefully optimized.
  42. // Please consider this before making any modifications... Thank you.
  43. //
  44. //--
  45. LEAF_ENTRY(tcpxsum)
  46. and t1 = -4, a1 // t1 = a1 rounded down to a 4-byte boundary
  47. and t2 = -4, a1 // t2 = same address; first load pointer for the 20-byte path
  48. brp.dptk.imp xUA, UAbrlabel // prediction hint for the branch to xUA at UAbrlabel
  49. and t17 = -8, a1 // t17 = a1 rounded down to 8 bytes; used as the prefetch pointer
  50. cmp.gtu pt2, pt3 = 96, a2 // pt2 = (a2 < 96), pt3 = (a2 >= 96)
  51. ;;
  52. add t3 = 8, t2 // t3 = t2 + 8; second load pointer for the 20-byte path
  53. (pt3) ld8 t16 = [t17], 64 // a2 >= 96: touch the first word (t16 is not read below), t17 += 64
  54. cmp.eq pt0, pt1 = 20, a2 // pt0 = (a2 == 20), pt1 = (a2 != 20)
  55. nop.i 0
  56. mov t4 = 128;;
  57. nop.m 0
  58. cmp.gtu pt2 = a2, t4;; // pt2 = (a2 > 128)
  59. (pt2) lfetch [t17], 64;; // if so, prefetch and advance t17 by 64
  60. (pt2) lfetch [t17], 64 // second prefetch of data needed later
  61. nop.i 0;;
  62. nop.m 0
  63. nop.i 0
  64. (pt1) br.cond.dptk.few x32start;; // length is not 20: take the general path
  65. ld4 t11 = [t2], 4 // 20-byte path: 4-byte word at offset 0
  66. tbit.nz pt9 = a1, 0 // pt9 = bit 0 of a1 is set (odd start address)
  67. nop.b 0
  68. ld4 t12 = [t3], 4 // word at offset 8
  69. cmp.ltu pt1 = t1, a1 // pt1 = a1 is not 4-byte aligned
  70. (pt1) br.cond.dpnt.few x32start;; // not aligned: take the general path
  71. ld4 t13 = [t2], 4 // word at offset 4
  72. ld4 t14 = [t3], 4 // word at offset 12
  73. nop.i 0;;
  74. ld4 t15 = [t3] // word at offset 16
  75. add t20 = t11, t12 // sum the five words into t20
  76. add t21 = t13, t14;;
  77. add t20 = t20, t21;;
  78. add t20 = t20, t15
  79. nop.i 0;;
  80. xfold: // reduce the 64-bit sum in t20 to 16 bits, then add the seed
  81. addl t10 = 0xffff, zero // fold 64 bit into 16 bits; t10 = 0xffff
  82. dep t0 = -1, zero, 0, 32 // t0 = mask of the low 32 bits
  83. nop.i 0;;
  84. and t1 = t20, t0 // low 32 bits
  85. extr.u t2 = t20, 32, 32;; // high 32 bits
  86. add t20 = t1, t2;;
  87. and t1 = t20, t0 // repeat once to absorb a carry into bit 32
  88. extr.u t2 = t20, 32, 32;;
  89. add t20 = t1, t2;;
  90. and t2 = t20, t10 // low 16 bits
  91. extr.u t1 = t20, 16, 16;; // bits 16..31
  92. add t20 = t1, t2;;
  93. and t2 = t20, t10
  94. extr.u t1 = t20, 16, 1;; // bit 16: carry out of the previous 16-bit add
  95. add t20 = t1, t2;;
  96. (pt9) nop.m 0 // swap bytes if necessary (pt9: odd start address)
  97. (pt9) extr.u t1 = t20, 8, 8 // t1 = bits 8..15 of the sum
  98. (pt9) nop.i 0;;
  99. (pt9) nop.m 0
  100. (pt9) dep t20 = t20, t1, 8, 8 // t20 = t1 with the low byte of the sum placed in bits 8..15
  101. (pt9) nop.i 0;;
  102. add t20 = a0, t20 // add seed, fold again
  103. nop.i 0
  104. nop.i 0;;
  105. extr.u t1 = t20, 32, 1 // bit 32 of the sum
  106. extr.u t2 = t20, 0, 32;; // low 32 bits
  107. add t20 = t1, t2;;
  108. and t1 = t20, t10 // low 16 bits
  109. extr.u t2 = t20, 16, 16;; // bits 16..31
  110. add t20 = t1, t2;;
  111. and t1 = t20, t10
  112. extr.u t2 = t20, 16, 1;; // bit 16: carry out of the previous 16-bit add
  113. add t20 = t1, t2;;
  114. add v0 = zero, t20 // copy the result to v0
  115. nop.i 0
  116. br.ret.sptk.few b0;;
  117. x32start: // general path: length is not 20, or a1 is not 4-byte aligned
  118. and t1 = -8, a1 // t1 = a1 rounded down to 8 bytes
  119. cmp.eq pt3 = 1, zero // pt3 = (1 == 0)
  120. cmp.eq pt4 = 1, zero // pt4 = (1 == 0)
  121. add t10 = a1, a2 // t10 = address just past the end of the buffer
  122. mov t20 = zero // t20 = running sum
  123. tbit.nz pt9 = a1, 0;; // pt9 = bit 0 of a1 is set (odd start address)
  124. cmp.ltu pt1 = t1, a1 // pt1 = a1 is not 8-byte aligned
  125. brp.sptk.imp x32startA, x32Abrlabel // prediction hint for the branch at x32Abrlabel
  126. UAbrlabel:
  127. (pt1) br.cond.dptk.few xUA;; // unaligned start: consume the partial first word
  128. x32startA: // t1 is 8-byte aligned from here on
  129. and t10 = -8, t10 // round the end address down to 8 bytes
  130. dep t9 = zero, a2, 0, 6 // make last 6 bits of count 0
  131. // 6 bits => 64 = # bytes consumed
  132. // in one iteration
  133. adds t2 = 8, t1;; // t2 = t1 + 8; second load pointer
  134. cmp.gtu pt2 = 96, a2 // pt2 = (count < 96)
  135. add t5 = t1, t9 // t5 = address at which the 64-byte loop stops
  136. (pt2) br.cond.dpnt.few xLT32;; // short count: skip the unrolled loop
  137. ld8 t3 = [t1], 16 // initial load can be eliminated. It may no
  138. // longer be valid if alignment occurred, it
  139. // was there to provide order
  140. mov t4 = 128;;
  141. cmp.gtu pt2 = a2, t4;; // is a2 > 128? (t4 holds 128 here)
  142. ld8 t4 = [t2], 16 // t4 is reused as a data register from here on
  143. (pt2) lfetch [t17], 64
  144. mov t14 = zero;; // t11..t14 are four partial sums, cleared before the loop
  145. (pt2) lfetch [t17], 64
  146. mov t11 = zero
  147. mov t13 = zero
  148. ld8 t18 = [t1], 16
  149. ld8 t19 = [t2], 16
  150. mov t12 = zero;;
  151. x32loop: // t5 = address to stop fetching at
  152. // t17 = next addr to prefetch
  153. ld8 t6 = [t1], 16 // modified main loop; unrolled a little more
  154. // and using prefetches
  155. ld8 t7 = [t2], 16
  156. add t11 = t11, t3 // add the four preloaded words into separate partial sums
  157. add t12 = t12, t4
  158. add t13 = t13, t18
  159. add t14 = t14, t19;;
  160. ld8 t8 = [t1], 16
  161. ld8 t9 = [t2], 16
  162. cmp.ltu pt1 = t11, t3 // sum < addend means the 64-bit add wrapped
  163. cmp.ltu pt2 = t12, t4
  164. cmp.ltu pt3 = t13, t18
  165. cmp.ltu pt4 = t14, t19;;
  166. cmp.ltu pt0 = t1, t5 // pt0 = stop address not reached yet
  167. cmp.ltu pt5 = t17, t5 // pt5 = prefetch pointer still below the stop address
  168. (pt1) adds t11 = 1, t11 // add the wrapped carry back in
  169. (pt2) adds t12 = 1, t12
  170. (pt3) adds t13 = 1, t13
  171. (pt4) adds t14 = 1, t14;;
  172. (pt0) ld8 t3 = [t1], 16 // preload for the next iteration
  173. (pt5) lfetch [t17], 64
  174. add t11 = t11, t6 // add the second group of four words
  175. add t12 = t12, t7
  176. add t13 = t13, t8
  177. add t14 = t14, t9;;
  178. (pt0) ld8 t4 = [t2], 16 // preload for the next iteration
  179. (pt0) ld8 t18 = [t1], 16
  180. cmp.ltu pt1 = t11, t6 // detect wrap of the second adds
  181. cmp.ltu pt2 = t12, t7
  182. cmp.ltu pt3 = t13, t8
  183. cmp.ltu pt4 = t14, t9;;
  184. (pt0) ld8 t19 = [t2], 16
  185. (pt1) adds t11 = 1, t11 // add the wrapped carry back in
  186. (pt2) adds t12 = 1, t12
  187. (pt3) adds t13 = 1, t13
  188. (pt4) adds t14 = 1, t14
  189. (pt0) br.cond.dptk.many x32loop;; // loop while pt0; afterwards merge parallel adds
  190. add t21 = t11, t12;; // t21 = t11 + t12
  191. nop.m 0
  192. cmp.ltu pt8 = t21, t11;; // wrapped?
  193. (pt8) adds t21 = 1, t21;; // add the carry back in
  194. nop.m 0
  195. add t20 = t20, t21;; // fold into the running sum
  196. cmp.ltu pt1 = t20, t21;;
  197. add t21 = t13, t14 // t21 = t13 + t14
  198. (pt1) adds t20 = 1, t20;;
  199. cmp.ltu pt2 = t21, t13
  200. nop.i 0;;
  201. (pt2) adds t21 = 1, t21;;
  202. add t20 = t20, t21 // fold into the running sum
  203. nop.i 0;;
  204. cmp.ltu pt1 = t20, t21;;
  205. (pt1) adds t20 = 1, t20
  206. nop.i 0
  207. nop.i 0
  208. xLT32: // add remaining whole 8-byte words one at a time (a2 < 96, or after x32loop)
  209. nop.m 0
  210. cmp.ltu pt0, pt1 = t1, t10 // pt1 = t1 has reached the rounded-down end address
  211. (pt1) br.cond.dpnt.few xtail // no whole word left
  212. ld8 t11 = [t1], 8;;
  213. add t20 = t20, t11
  214. nop.i 0;;
  215. cmp.ltu pt0 = t20, t11;; // wrapped?
  216. (pt0) adds t20 = 1, t20 // add the carry back in
  217. nop.i 0;;
  218. nop.m 0
  219. nop.f 0
  220. br.cond.sptk.many xLT32
  221. xtail: // < 8 bytes left
  222. and t5 = 7, a2;; // t5 = a2 mod 8
  223. cmp.eq pt0 = zero, t5
  224. nop.i 0
  225. nop.m 0
  226. nop.f 0
  227. (pt0) br.cond.sptk.many xfold // nothing left: go fold
  228. ld8 t11 = [t1] // load the full 8-byte word holding the remaining bytes
  229. sub t6 = 8, t5 // t6 = number of bytes to discard
  230. adds t7 = -1, zero;; // t7 = all ones
  231. nop.m 0
  232. shl t6 = t6, 3 // bytes -> bits
  233. nop.b 0;;
  234. nop.m 0
  235. shr.u t7 = t7, t6;; // t7 = mask keeping the low-order t5 bytes
  236. and t11 = t11, t7;;
  237. add t20 = t20, t11
  238. nop.i 0;;
  239. cmp.ltu pt0 = t20, t11;; // wrapped?
  240. (pt0) adds t20 = 1, t20 // add the carry back in
  241. nop.f 0
  242. br.cond.sptk.many xfold
  243. xUA: // unaligned: a1 is not 8-byte aligned
  244. and t5 = 7, a1 // t5 = a1 mod 8
  245. dep t1 = zero, a1, 0, 3 // t1 = a1 with the low 3 bits cleared
  246. adds t6 = -1, zero;; // t6 = all ones
  247. ld8 t11 = [t1], 8 // load the aligned word containing a1; t1 advances to the next word
  248. sub t7 = 8, t5 ;; // t7 = bytes from a1 up to the next 8-byte boundary
  249. cmp.ltu pt0, pt1 = a2, t7;; // pt0 = (a2 < t7), pt1 = (a2 >= t7)
  250. (pt0) sub t9 = t7, a2 // t9 = t7 - a2, high-order bytes of the word to drop
  251. shl t8 = t5, 3;; // t8 = t5 in bits
  252. (pt0) shl t12 = t9, 3;; // t12 = t9 in bits
  253. nop.m 0
  254. (pt0) shr.u t14 = t6, t12 // t14 = mask clearing the high-order t9 bytes
  255. shl t13 = t6, t8;; // t13 = mask clearing the low-order t5 bytes
  256. and t20 = t11, t13;; // running sum starts as the masked first word
  257. (pt0) and t20 = t20, t14
  258. (pt0) mov a2 = zero // a2 < t7: no bytes remain
  259. (pt1) sub a2 = a2, t7 // otherwise a2 = bytes remaining after this word
  260. mov a1 = t1 // a1 = t1 (already advanced past the first word)
  261. x32Abrlabel:
  262. br.cond.sptk.many x32startA // continue on the aligned path
  263. LEAF_EXIT(tcpxsum)