Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

334 lines
8.3 KiB

  1. //##########################################################################
  2. //**
  3. //** Copyright (C) 1996-2000 Intel Corporation. All rights reserved.
  4. //**
  5. //** The information and source code contained herein is the exclusive
  6. //** property of Intel Corporation and may not be disclosed, examined
  7. //** or reproduced in whole or in part without explicit written authorization
  8. //** from the company.
  9. //**
  10. //###########################################################################
  11. .file "rerun.s"
  // Everything that follows is emitted into the .text section, starting on a
  // 32-byte boundary.
  12. .section .text
  13. .align 32
  14. .proc _xrun1args#
  15. .global _xrun1args#
  16. .align 32
  // _xrun1args: execute one single-source parallel FP instruction, chosen by
  // an opcode, under a caller-supplied FPSR value.
  //   in  r32 = opcode: 1 fprsqrta, 2 fpcvt.fx, 3 fpcvt.fxu,
  //             4 fpcvt.fx.trunc, 5 fpcvt.fxu.trunc. Any other value makes
  //             the routine store f0 as the result.
  //   in  r33 = address of an 8-byte FPSR image. It is loaded into ar40
  //             before the operation and overwritten with the contents of
  //             ar40 after the operation.
  //   in  r34 = address that receives the result (written with stf.spill)
  //   in  r35 = address of the operand (read with ldf.fill)
  // The FPSR and the predicate registers found on entry are put back before
  // returning. f8 and f9 are overwritten.
  // NOTE(review): presumably called from C with these four arguments in this
  // order - confirm the prototype against the caller.
  17. _xrun1args:
  18. alloc r31=ar.pfs,4,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39
  19. // OpCode is in r32
  20. // &fpsr is in r33
  21. // &fr1 (output) is in r34
  22. // &fr2 (input) is in r35
  23. // save old FPSR in r36
  24. mov.m r36 = ar40
  25. // save predicates in r38
  26. mov r38 = pr;;
  27. // load fpsr in r37
  28. ld8 r37 = [r33];;
  29. // set new value of FPSR
  30. mov ar40 = r37;;
  31. // clear predicates
  // r39 has only bit 0 set, so every predicate selected by the mask below is
  // written with 0 apart from p0.
  32. movl r39 = 0x0000000000000001;;
  33. // load clear predicates from r39
  34. mov pr = r39,0x1ffff;;
  35. // load input argument into f8
  36. ldf.fill f8 = [r35];;
  // Opcode decode. Each compare after the first is qualified by the "no match
  // so far" predicate of the previous one. The .unc form writes 0 to both of
  // its targets when that qualifying predicate is 0, so at most one of
  // p1..p6 ends up set. p6 is set only when r32 matched none of 1..5.
  37. cmp4.eq p1, p2 = 1, r32;; // fprsqrta [not used]
  38. (p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpcvt_fx
  39. (p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpcvt_fxu
  40. (p4) cmp4.eq.unc p4, p5 = 4, r32;; // fpcvt_fx_trunc
  41. (p5) cmp4.eq.unc p5, p6 = 5, r32;; // fpcvt_fxu_trunc
  // Only the instruction whose predicate is set executes. The p7 output of
  // fprsqrta is not read afterwards.
  42. (p1) fprsqrta.s0 f9,p7 = f8;; // 1/sqrt(f8) in f9
  43. (p2) fpcvt.fx.s0 f9 = f8;;
  44. (p3) fpcvt.fxu.s0 f9 = f8;;
  45. (p4) fpcvt.fx.trunc.s0 f9 = f8;;
  46. (p5) fpcvt.fxu.trunc.s0 f9 = f8;;
  47. (p6) mov f9 = f0 // return 0
  48. // restore predicates from r38
  49. mov pr = r38,0x1ffff;;
  50. // store result
  51. stf.spill [r34] = f9;;
  52. // save FPSR
  // The FPSR as left by the operation is handed back through [r33].
  53. mov.m r37 = ar40;;
  54. st8 [r33] = r37
  55. // restore FPSR
  56. mov ar40 = r36;;
  57. // return
  58. br.ret.sptk b0
  59. .endp _xrun1args
  60. .proc _xrun2args#
  61. .global _xrun2args#
  62. .align 32
  // _xrun2args: execute one two-source parallel FP instruction, chosen by an
  // opcode, under a caller-supplied FPSR value.
  //   in  r32 = opcode: 1 fprcpa, 2 fpcmp.eq, 3 fpcmp.lt, 4 fpcmp.le,
  //             5 fpcmp.unord, 6 fpcmp.neq, 7 fpcmp.nlt, 8 fpcmp.nle,
  //             9 fpcmp.ord, 10 fpmin, 11 fpmax, 12 fpamin, 13 fpamax.
  //             Any other value makes the routine store f0 as the result.
  //   in  r33 = address of an 8-byte FPSR image. It is loaded into ar40
  //             before the operation and overwritten with the contents of
  //             ar40 after the operation.
  //   in  r34 = address that receives the result (written with stf.spill)
  //   in  r35 = address of the first operand (read with ldf.fill)
  //   in  r36 = address of the second operand (read with ldf.fill)
  // The FPSR and the predicate registers found on entry are put back before
  // returning. f8, f9 and f10 are overwritten.
  // NOTE(review): presumably called from C with these five arguments in this
  // order - confirm the prototype against the caller.
  63. _xrun2args:
  64. alloc r31=ar.pfs,5,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39, r40
  65. // OpCode is in r32
  66. // &fpsr is in r33
  67. // &fr1 (output) is in r34
  68. // &fr2 (input) is in r35
  69. // &fr3 (input) is in r36
  70. // save old FPSR in r37
  71. mov r37 = ar40
  72. // save predicates in r39
  73. mov r39 = pr;;
  74. // load fpsr in r38
  75. ld8 r38 = [r33];;
  76. // set new value of FPSR
  77. mov ar40 = r38;;
  78. // clear predicates
  // r40 has only bit 0 set, so every predicate selected by the mask below is
  // written with 0 apart from p0.
  79. movl r40 = 0x0000000000000001;;
  80. // load clear predicates from r40
  81. mov pr = r40,0x1ffff;;
  82. // load first input argument into f8
  83. ldf.fill f8 = [r35]
  84. // load second input argument into f9
  85. ldf.fill f9 = [r36];;
  // Opcode decode. Each compare after the first is qualified by the "no match
  // so far" predicate of the previous one. The .unc form writes 0 to both of
  // its targets when that qualifying predicate is 0, so at most one of
  // p1..p14 ends up set. p14 is set only when r32 matched none of 1..13.
  86. cmp4.eq p1, p2 = 1, r32;; // fprcpa [not used - fprcpa not re-executed]
  87. (p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpcmp_eq
  88. (p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpcmp_lt
  89. (p4) cmp4.eq.unc p4, p5 = 4, r32;; // fpcmp_le
  90. (p5) cmp4.eq.unc p5, p6 = 5, r32;; // fpcmp_unord
  91. (p6) cmp4.eq.unc p6, p7 = 6, r32;; // fpcmp_neq
  92. (p7) cmp4.eq.unc p7, p8 = 7, r32;; // fpcmp_nlt
  93. (p8) cmp4.eq.unc p8, p9 = 8, r32;; // fpcmp_nle
  94. (p9) cmp4.eq.unc p9, p10 = 9, r32;; // fpcmp_ord
  95. (p10) cmp4.eq.unc p10, p11 = 10, r32;; // fpmin
  96. (p11) cmp4.eq.unc p11, p12 = 11, r32;; // fpmax
  97. (p12) cmp4.eq.unc p12, p13 = 12, r32;; // fpamin
  98. (p13) cmp4.eq.unc p13, p14 = 13, r32;; // fpamax
  // Only the instruction whose predicate is set executes, and the result
  // always lands in f10. The p15 output of fprcpa is not read afterwards.
  99. (p1) fprcpa.s0 f10 , p15 = f8, f9;; // approx. 1 / f9 in f10
  100. (p2) fpcmp.eq.s0 f10 = f8, f9;;
  101. (p3) fpcmp.lt.s0 f10 = f8, f9;;
  102. (p4) fpcmp.le.s0 f10 = f8, f9;;
  103. (p5) fpcmp.unord.s0 f10 = f8, f9;;
  104. (p6) fpcmp.neq.s0 f10 = f8, f9;;
  105. (p7) fpcmp.nlt.s0 f10 = f8, f9;;
  106. (p8) fpcmp.nle.s0 f10 = f8, f9;;
  107. (p9) fpcmp.ord.s0 f10 = f8, f9;;
  108. (p10) fpmin.s0 f10 = f8, f9;;
  109. (p11) fpmax.s0 f10 = f8, f9;;
  110. (p12) fpamin.s0 f10 = f8, f9;;
  111. (p13) fpamax.s0 f10 = f8, f9;;
  112. (p14) mov f10 = f0 // return 0
  113. // restore predicates from r39
  114. mov pr = r39,0x1ffff;;
  115. // store result
  116. stf.spill [r34] = f10
  117. // save FPSR
  // The FPSR as left by the operation is handed back through [r33].
  118. mov.m r38 = ar40;;
  119. st8 [r33] = r38
  120. // restore FPSR
  121. mov ar40 = r37;;
  122. // return
  123. br.ret.sptk b0
  124. .endp _xrun2args
  125. .proc _xrun3args#
  126. .global _xrun3args#
  127. .align 32
  // _xrun3args: execute one three-source parallel FP multiply-add style
  // instruction, chosen by an opcode, under a caller-supplied FPSR value.
  //   in  r32 = opcode: 1 fpma, 2 fpms, 3 fpnma. Any other value makes the
  //             routine store f0 as the result.
  //   in  r33 = address of an 8-byte FPSR image. It is loaded into ar40
  //             before the operation and overwritten with the contents of
  //             ar40 after the operation.
  //   in  r34 = address that receives the result (written with stf.spill)
  //   in  r35 = address of the first operand (read with ldf.fill)
  //   in  r36 = address of the second operand (read with ldf.fill)
  //   in  r37 = address of the third operand (read with ldf.fill)
  // The FPSR and the predicate registers found on entry are put back before
  // returning. f8, f9, f10 and f11 are overwritten.
  // NOTE(review): presumably called from C with these six arguments in this
  // order - confirm the prototype against the caller.
  128. _xrun3args:
  129. alloc r31=ar.pfs,6,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39, r40, r41
  130. // OpCode is in r32
  131. // &fpsr is in r33
  132. // &fr1 (output) is in r34
  133. // &fr2 (input) is in r35
  134. // &fr3 (input) is in r36
  135. // &fr4 (input) is in r37
  136. // save old FPSR in r38
  137. mov r38 = ar40
  138. // save predicates in r40
  139. mov r40 = pr;;
  140. // load fpsr in r39
  141. ld8 r39 = [r33];;
  142. // set new value of FPSR
  143. mov ar40 = r39;;
  144. // clear predicates
  // r41 has only bit 0 set, so every predicate selected by the mask below is
  // written with 0 apart from p0.
  145. movl r41 = 0x0000000000000001;;
  146. // load clear predicates from r41
  147. mov pr = r41,0x1ffff;;
  148. // load first input argument into f8
  149. ldf.fill f8 = [r35]
  150. // load second input argument into f9
  151. ldf.fill f9 = [r36];;
  152. // load third input argument into f10
  153. ldf.fill f10 = [r37];;
  // Opcode decode. Each compare after the first is qualified by the "no match
  // so far" predicate of the previous one. The .unc form writes 0 to both of
  // its targets when that qualifying predicate is 0, so at most one of
  // p1..p4 ends up set. p4 is set only when r32 matched none of 1..3.
  154. cmp4.eq p1, p2 = 1, r32;; // fpma
  155. (p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpms
  156. (p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpnma
  // Only the instruction whose predicate is set executes, and the result
  // always lands in f11.
  157. (p1) fpma.s0 f11 = f8, f9, f10;; // f11 = f8 * f9 + f10
  158. (p2) fpms.s0 f11 = f8, f9, f10;; // f11 = f8 * f9 - f10
  159. (p3) fpnma.s0 f11 = f8, f9, f10;; // f11 = -(f8 * f9) + f10
  160. (p4) mov f11 = f0 // return 0
  161. // restore predicates from r40
  162. mov pr = r40,0x1ffff;;
  163. // store result
  164. stf.spill [r34] = f11
  165. // save FPSR
  // The FPSR as left by the operation is handed back through [r33].
  166. mov.m r39 = ar40;;
  167. st8 [r33] = r39
  168. // restore FPSR
  169. mov ar40 = r38
  170. // return
  171. br.ret.sptk b0
  172. .endp _xrun3args
  173. .proc _thmB#
  174. .global _thmB#
  175. .align 32
  // _thmB: single-precision divide a / b, built from an frcpa seed followed
  // by a fixed sequence of fma/fnma refinement steps.
  //   in  r32 = address of a (loaded with ldfs)
  //   in  r33 = address of b (loaded with ldfs)
  //   in  r34 = address that receives the quotient (stored with stfs)
  //   in  r35 = address of an 8-byte FPSR image. It is loaded into ar40
  //             before the computation and overwritten with the contents of
  //             ar40 afterwards.
  // Steps (2)-(8) run in status field 1 (.s1). The seed in step (1) and the
  // final single-precision rounding in step (9) use status field 0 (.s0).
  // When frcpa leaves p6 clear, steps (2)-(9) are all skipped and the f8
  // value produced by frcpa itself is what gets stored.
  // The FPSR found on entry is put back before returning. p6 is not saved.
  176. _thmB:
  177. alloc r31=ar.pfs,4,2,0,0 // r32, r33, r34, r35, r36, r37
  178. // &a is in r32
  179. // &b is in r33
  180. // &div is in r34 (the address of the divide result)
  181. // &fpsr is in r35
  182. // general registers used: r31, r32, r33, r34, r35, r36, r37
  183. // predicate registers used: p6
  184. // floating-point registers used: f6, f7, f8
  185. // save old FPSR in r36
  186. mov r36 = ar40
  187. // load fpsr in r37
  188. ld8 r37 = [r35];;
  189. // set new value of FPSR
  190. mov ar40 = r37
  191. // load a, the first argument, in f6
  192. ldfs f6 = [r32];;
  193. // load b, the second argument, in f7
  194. ldfs f7 = [r33];;
  195. // Step (1)
  196. // y0 = 1 / b in f8
  // p6 gates every refinement step below.
  197. frcpa.s0 f8,p6=f6,f7;;
  198. // Step (2)
  199. // q0 = a * y0 in f6
  200. (p6) fma.s1 f6=f6,f8,f0
  201. // Step (3)
  202. // e0 = 1 - b * y0 in f7
  203. (p6) fnma.s1 f7=f7,f8,f1;;
  204. // Step (4)
  205. // q1 = q0 + e0 * q0 in f6
  206. (p6) fma.s1 f6=f7,f6,f6
  207. // Step (5)
  208. // e1 = e0 * e0 in f7
  209. (p6) fma.s1 f7=f7,f7,f0;;
  210. // Step (6)
  211. // q2 = q1 + e1 * q1 in f6
  212. (p6) fma.s1 f6=f7,f6,f6
  213. // Step (7)
  214. // e2 = e1 * e1 in f7
  215. (p6) fma.s1 f7=f7,f7,f0;;
  216. // Step (8)
  217. // q3 = q2 + e2 * q2 in f6
  // This step alone uses the .d (double precision) completer.
  218. (p6) fma.d.s1 f6=f7,f6,f6;;
  219. // Step (9)
  220. // q3' = q3 in f8
  // q3 * 1 + 0 with the .s completer: rounds q3 to single precision under
  // status field 0.
  221. (p6) fma.s.s0 f8=f6,f1,f0;;
  222. // store result
  223. stfs [r34]=f8
  224. // save fpsr
  // The FPSR as left by the computation is handed back through [r35].
  225. mov.m r37 = ar40;;
  226. st8 [r35] = r37
  227. // restore FPSR
  228. mov ar40 = r36;;
  229. // return
  230. br.ret.sptk b0
  231. .endp _thmB
  232. .proc _thmH#
  233. .global _thmH#
  234. .align 32
  // _thmH: single-precision square root of a, built from an frsqrta seed
  // followed by a fixed sequence of fma/fnma refinement steps.
  //   in  r32 = address of a (loaded with ldfs)
  //   in  r33 = address that receives the square root (stored with stfs)
  //   in  r34 = address of an 8-byte FPSR image. It is loaded into ar40
  //             before the computation and overwritten with the contents of
  //             ar40 afterwards.
  // Steps (2)-(13) run in status field 1 (.s1). The seed in step (1) and the
  // final single-precision rounding in step (14) use status field 0 (.s0).
  // When frsqrta leaves p6 clear, steps (2)-(14) are all skipped and the f8
  // value produced by frsqrta itself is what gets stored.
  // The FPSR found on entry is put back before returning. r2 and p6 are
  // overwritten and not saved.
  235. _thmH:
  236. alloc r31=ar.pfs,3,2,0,0 // r32, r33, r34, r35, r36
  237. // &a is in r32
  238. // &sqrt is in r33 (the address of the sqrt result)
  239. // &fpsr in r34
  240. // general registers used: r2, r31, r32, r33, r34, r35, r36
  241. // predicate registers used: p6
  242. // floating-point registers used: f6, f7, f8, f9, f10, f11, f12
  243. // save old FPSR in r35
  244. mov r35 = ar40
  245. // load fpsr in r36
  246. ld8 r36 = [r34];;
  247. // set new value of FPSR
  248. mov ar40 = r36
  249. // exponent of +1/2 in r2
  250. movl r2 = 0x0fffe;;
  251. // +1/2 in f7
  252. setf.exp f7 = r2
  253. // load the argument a in f6
  254. ldfs f6 = [r32];;
  255. // Step (1)
  256. // y0 = 1/sqrt(a) in f8
  // p6 gates every refinement step below.
  257. frsqrta.s0 f8,p6=f6;;
  258. // Step (2)
  259. // h = +1/2 * a in f9
  260. (p6) fma.s1 f9=f7,f6,f0
  261. // Step (3)
  262. // t1 = y0 * y0 in f10
  263. (p6) fma.s1 f10=f8,f8,f0;;
  264. // Step (4)
  265. // t2 = 1/2 - t1 * h in f10
  266. (p6) fnma.s1 f10=f10,f9,f7;;
  267. // Step (5)
  268. // y1 = y0 + t2 * y0 in f8
  269. (p6) fma.s1 f8=f10,f8,f8;;
  270. // Step (6)
  271. // S = a * y1 in f10
  272. (p6) fma.s1 f10=f6,f8,f0
  273. // Step (7)
  274. // t3 = y1 * h in f9
  275. (p6) fma.s1 f9=f8,f9,f0
  276. // Step (8)
  277. // H = 1/2 * y1 in f11
  278. (p6) fma.s1 f11=f7,f8,f0;;
  279. // Step (9)
  280. // d = a - S * S in f12
  281. (p6) fnma.s1 f12=f10,f10,f6
  282. // Step (10)
  283. // t4 = 1/2 - t3 * y1 in f7
  // f7 stops holding the constant 1/2 from here on.
  284. (p6) fnma.s1 f7=f9,f8,f7;;
  285. // Step (11)
  286. // S1 = S + d * H in f8
  // f8 switches from holding y1 to holding S1.
  287. (p6) fma.s.s1 f8=f12,f11,f10
  288. // Step (12)
  289. // H1 = H + t4 * H in f7
  290. (p6) fma.s1 f7=f7,f11,f11;;
  291. // Step (13)
  292. // d1 = a - S1 * S1 in f6
  293. (p6) fnma.s1 f6=f8,f8,f6;;
  294. // Step (14)
  295. // R = S1 + d1 * H1 in f8
  // Final result, rounded to single precision under status field 0.
  296. (p6) fma.s.s0 f8=f6,f7,f8;;
  297. // store result
  298. stfs [r33]=f8
  299. // save fpsr
  // The FPSR as left by the computation is handed back through [r34].
  300. mov.m r36 = ar40;;
  301. st8 [r34] = r36
  302. // restore FPSR
  303. mov ar40 = r35;;
  304. // return
  305. br.ret.sptk b0
  306. .endp _thmH