Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

890 lines
24 KiB

  1. .file "atan2f.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 6/1/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. // History
  26. //==============================================================
  27. // 6/01/00 Initial version
  28. // 8/15/00 Bundle added after call to __libm_error_support to properly
  29. // set [the previously overwritten] GR_Parameter_RESULT.
  30. // 8/17/00 Changed predicate register macro-usage to direct predicate
  31. // names due to an assembler bug.
  32. // 1/05/01 Fixed flag settings for denormal input.
  33. // 1/19/01 Added documentation
  34. // 1/30/01 Improved speed
  35. // Description
  36. //=========================================
  37. // The atan2 function computes the principal value of the arc tangent of y/x using
  38. // the signs of both arguments to determine the quadrant of the return value.
  39. // A domain error may occur if both arguments are zero.
  40. // The atan2 function returns the arc tangent of y/x in the range [-pi,+pi] radians.
  41. //..
  42. //..Let (v,u) = (y,x) if |y| <= |x|, and (v,u) = (x,y) otherwise. Note that
  43. //..v and u can be negative. We state the relationship between atan2(y,x) and
  44. //..atan(v/u).
  45. //..
  46. //..Let swap = false if v = y, and swap = true if v = x.
  47. //..Define C according to the matrix
  48. //..
  49. //.. TABLE FOR C
  50. //.. x +ve x -ve
  51. //.. no swap (swap = false) sgn(y)*0 sgn(y)*pi
  52. //.. swap (swap = true ) sgn(y)*pi/2 sgn(y)*pi/2
  53. //..
  54. //.. atan2(y,x) = C + atan(v/u) if no swap
  55. //.. atan2(y,x) = C - atan(v/u) if swap
  56. //..
  57. //..This relationship is more efficient to compute because we accommodate signs in v and u,
  58. //..saving the need to obtain the absolute value before computation can proceed.
  59. //..
  60. //..Suppose (v,u) = (y,x), we calculate atan(v/u) as follows:
  61. //..A = y * frcpa(x) (so A = (y/x)(1 - beta))
  62. //..atan(y/x) = atan(A) + atan( ((y/x)-A)/(1 + (y/x)A) ), the second term is
  63. //..a correction.
  64. //..atan(A) is approximated by a polynomial
  65. //..A + p1 A^3 + p2 A^5 + ... + p10 A^21,
  66. //..atan(G) is approximated as follows:
  67. //..Let G = (y - Ax)/(x + Ay), atan(G) can be approximated by G + p1 * g^3
  68. //..where g is a limited precision approximation to G via g = (y - Ax)*frcpa(x + Ay).
  69. //..
  70. //..Suppose (v,u) = (x,y), we calculate atan(v/u) as follows:
  71. //..Z = x * frcpa(y) (so Z = (x/y)(1 - beta))
  72. //..atan(x/y) = atan(Z) + atan( ((x/y)-Z)/(1 + (x/y)Z) ), the second term is
  73. //..a correction.
  74. //..atan(Z) is approximated by a polynomial
  75. //..Z + p1 Z^3 + p2 Z^5 + ... + p10 Z^21,
  76. //..atan(T) is approximated as follows:
  77. //..Let T = (x - Zy)/(y + Zx), atan(T) can be approximated by T + p1 * t^3
  78. //..where t is a limited precision approximation to T via t = (x - Zy)*frcpa(y + Zx).
  79. //..
  80. //..
  81. //..A = y * frcpa(x)
  82. //..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
  83. //..
  84. //..This polynomial is computed as follows:
  85. //..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
  86. //..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
  87. //..
  88. //..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
  89. //..poly_A1 = poly_A2 + A4 * poly_A1
  90. //..poly_A1 = poly_A3 + A4 * poly_A1
  91. //..
  92. //..poly_A4 = p1 * A
  93. //..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
  94. //..poly_A5 = p2 + Asq * poly_A5
  95. //..poly_A4 = poly_A4 + A5 * poly_A5
  96. //..
  97. //..atan_A = poly_A4 + A11 * poly_A1
  98. //..
  99. //..atan(G) is approximated as follows:
  100. //..G_numer = y - A*x, G_denom = x + A*y
  101. //..H1 = frcpa(G_denom)
  102. //..H_beta = 1 - H1 * G_denom
  103. //..H2 = H1 + H1 * H_beta
  104. //..H_beta2 = H_beta*H_beta
  105. //..H3 = H2 + H2*H_beta2
  106. //..g = H1 * G_numer; gsq = g*g; atan_G = g*p1, atan_G = atan_G*gsq
  107. //..atan_G = G_numer*H3 + atan_G
  108. //..
  109. //..
  110. //..A = y * frcpa(x)
  111. //..atan(A) ~=~ A + p1 A^3 + ... + P10 A^21
  112. //..
  113. //..This polynomial is computed as follows:
  114. //..Asq = A*A; Acub = A*Asq, A4 = Asq*Asq
  115. //..A5 = Asq*Acub, A6 = Asq*A4; A11 = A5 * A6
  116. //..
  117. //..poly_A1 = p9 + Asq*p10, poly_A2 = p7 + Asq*p8, poly_A3 = p5 + Asq*p6
  118. //..poly_A1 = poly_A2 + A4 * poly_A1
  119. //..poly_A1 = poly_A3 + A4 * poly_A1
  120. //..
  121. //..poly_A4 = p1 * A
  122. //..poly_A5 = p3 + Asq * p4, poly_A4 = A + Asq*poly_A4
  123. //..poly_A5 = p2 + Asq * poly_A5
  124. //..poly_A4 = poly_A4 + A5 * poly_A5
  125. //..
  126. //..atan_A = poly_A4 + A11 * poly_A1
  127. //..
  128. //..
  129. //..====================================================================
  130. //.. COEFFICIENTS USED IN THE COMPUTATION
  131. //..====================================================================
  132. //coef_pj, j = 1,2,...,10; atan(A) ~=~ A + p1 A^3 + p2 A^5 + ... + p10 A^21
  133. //
  134. // coef_p1 = -.3333332707155439167401311806315789E+00
  135. // coef_p1 in dbl = BFD5 5555 1219 1621
  136. //
  137. // coef_p2 = .1999967670926658391827857030875748E+00
  138. // coef_p2 in dbl = 3FC9 997E 7AFB FF4E
  139. //
  140. // coef_p3 = -.1427989384500152360161563301087296E+00
  141. // coef_p3 in dbl = BFC2 473C 5145 EE38
  142. //
  143. // coef_p4 = .1105852823460720770079031213661163E+00
  144. // coef_p4 in dbl = 3FBC 4F51 2B18 65F5
  145. //
  146. // coef_p5 = -.8811839915595312348625710228448363E-01
  147. // coef_p5 in dbl = BFB6 8EED 6A8C FA32
  148. //
  149. // coef_p6 = .6742329836955067042153645159059714E-01
  150. // coef_p6 in dbl = 3FB1 42A7 3D7C 54E3
  151. //
  152. // coef_p7 = -.4468571068774672908561591262231909E-01
  153. // coef_p7 in dbl = BFA6 E10B A401 393F
  154. //
  155. // coef_p8 = .2252333246746511135532726960586493E-01
  156. // coef_p8 in dbl = 3F97 105B 4160 F86B
  157. //
  158. // coef_p9 = -.7303884867007574742501716845542314E-02
  159. // coef_p9 in dbl = BF7D EAAD AA33 6451
  160. //
  161. // coef_p10 = .1109686868355312093949039454619058E-02
  162. // coef_p10 in dbl = 3F52 2E5D 33BC 9BAA
  163. //
  164. // Special values
  165. //==============================================================
  166. // Y x Result
  167. // +number +inf +0
  168. // -number +inf -0
  169. // +number -inf +pi
  170. // -number -inf -pi
  171. //
  172. // +inf +number +pi/2
  173. // -inf +number -pi/2
  174. // +inf -number +pi/2
  175. // -inf -number -pi/2
  176. //
  177. // +inf +inf +pi/4
  178. // -inf +inf -pi/4
  179. // +inf -inf +3pi/4
  180. // -inf -inf -3pi/4
  181. //
  182. // +1 +1 +pi/4
  183. // -1 +1 -pi/4
  184. // +1 -1 +3pi/4
  185. // -1 -1 -3pi/4
  186. //
  187. // +number +0 +pi/2 // does not raise DBZ
  188. // -number +0 -pi/2 // does not raise DBZ
  189. // +number -0 +pi/2 // does not raise DBZ
  190. // -number -0 -pi/2 // does not raise DBZ
  191. //
  192. // +0 +number +0
  193. // -0 +number -0
  194. // +0 -number +pi
  195. // -0 -number -pi
  196. //
  197. // +0 +0 +0 // does not raise invalid
  198. // -0 +0 -0 // does not raise invalid
  199. // +0 -0 +pi // does not raise invalid
  200. // -0 -0 -pi // does not raise invalid
  201. //
  202. // NaN anything quiet Y
  203. // anything NaN quiet X
  204. // atan2(+-0/+-0) sets double error tag to 37
  205. // atan2f(+-0/+-0) sets single error tag to 38
  206. // These are domain errors.
  207. //
  208. // Assembly macros
  209. //=========================================
  210. // integer registers
  // atan2f_GR_Addr_* walk the coefficient tables in atan2f. In the code shown,
  // GR_SAVE_* and GR_Parameter_* are used only by __libm_error_region around
  // the call to __libm_error_support. r33-r37 are locals and r38-r41 are
  // outputs of the frame created by the alloc in atan2f.
  211. atan2f_GR_Addr_1 = r33 // pointer into atan2f_coef_table1
  212. atan2f_GR_Addr_2 = r34 // pointer into atan2f_coef_table2 (table1 + 0x30)
  213. GR_SAVE_B0 = r35 // copy of b0 held across the error-support call
  214. GR_SAVE_PFS = r36 // copy of ar.pfs held across the error-support call
  215. GR_SAVE_GP = r37 // copy of gp held across the error-support call
  216. GR_Parameter_X = r38 // address of the stack slot holding f8
  217. GR_Parameter_Y = r39 // address of the stack slot holding f9
  218. GR_Parameter_RESULT = r40 // address of the stack slot holding the result
  219. GR_Parameter_TAG = r41 // error tag passed to __libm_error_support
  220. // floating point registers
  // In atan2f, f8 holds y and f9 holds x. Below, swap means |y| > |x|.
  221. atan2f_coef_p1 = f32 // polynomial coefficient p1
  222. atan2f_coef_p10 = f33 // polynomial coefficient p10
  223. atan2f_coef_p7 = f34 // polynomial coefficient p7
  224. atan2f_coef_p6 = f35 // polynomial coefficient p6
  225. atan2f_coef_p3 = f36 // polynomial coefficient p3
  226. atan2f_coef_p2 = f37 // polynomial coefficient p2
  227. atan2f_coef_p9 = f38 // polynomial coefficient p9
  228. atan2f_coef_p8 = f39 // polynomial coefficient p8
  229. atan2f_coef_p5 = f40 // polynomial coefficient p5
  230. atan2f_coef_p4 = f41 // polynomial coefficient p4
  231. atan2f_const_piby2 = f42 // pi/2
  232. atan2f_const_pi = f43 // pi
  233. atan2f_const_piby4 = f44 // pi/4
  234. atan2f_const_3piby4 = f45 // 3pi/4
  235. atan2f_xsq = f46 // x*x
  236. atan2f_ysq = f47 // y*y
  237. atan2f_xy = f48 // x*y
  238. atan2f_const_1 = f49 // sgn_Y*0 if x>=0, sgn_Y*1 if x<0
  239. atan2f_sgn_Y = f50 // 1.0 carrying the sign of y
  240. atan2f_Z0 = f51 // frcpa approximation to 1/y
  241. atan2f_A0 = f52 // frcpa approximation to 1/x
  242. atan2f_Z = f53 // x*Z0
  243. atan2f_A = f54 // y*A0
  244. atan2f_C = f55 // quadrant constant: sgn_Y*pi/2 on swap, const_1*pi otherwise
  245. atan2f_U = f56 // polynomial argument: -Z on swap, A otherwise
  246. atan2f_Usq = f57 // U*U
  247. atan2f_U4 = f58 // Usq*Usq
  248. atan2f_U6 = f59 // U4*Usq
  249. atan2f_U8 = f60 // U4*U4
  250. atan2f_poly_u109 = f61 // p9 + p10*Usq
  251. atan2f_poly_u87 = f62 // p7 + p8*Usq
  252. atan2f_poly_u65 = f63 // p5 + p6*Usq
  253. atan2f_poly_u43 = f64 // p3 + p4*Usq
  254. atan2f_poly_u21 = f65 // p1 + p2*Usq
  255. atan2f_poly_u10to7 = f66 // poly_u87 + U4*poly_u109
  256. atan2f_poly_u6to3 = f67 // poly_u43 + U4*poly_u65
  257. atan2f_poly_u10to3 = f68 // poly_u6to3 + U8*poly_u10to7
  258. atan2f_poly_u10to0 = f69 // poly_u210 + U6*poly_u10to3
  259. atan2f_poly_u210 = f70 // 1 + Usq*poly_u21
  260. atan2f_T_numer = f71 // x - Z0*xy
  261. atan2f_T_denom = f72 // y + Z0*xsq
  262. atan2f_G_numer = f73 // y - A0*xy
  263. atan2f_G_denom = f74 // x + A0*ysq
  264. atan2f_p1rnum = f75 // p1*R_numer
  265. atan2f_R_denom = f76 // T_denom on swap, G_denom otherwise
  266. atan2f_R_numer = f77 // -T_numer on swap, G_numer otherwise
  267. atan2f_pR = f78 // p1rnum*Q1
  268. atan2f_pRC = f79 // C + rsq*pR
  269. atan2f_pQRC = f80 // pRC + R_numer*Q3
  270. atan2f_Q1 = f81 // frcpa approximation to 1/R_denom
  271. atan2f_Q_beta = f82 // 1 - Q1*R_denom
  272. atan2f_Q2 = f83 // Q1 + Q1*Q_beta
  273. atan2f_Q_beta2 = f84 // Q_beta*Q_beta
  274. atan2f_Q3 = f85 // Q2 + Q2*Q_beta2
  275. atan2f_r = f86 // Q1*R_numer
  276. atan2f_rsq = f87 // r*r
  277. atan2f_poly_atan_U = f88 // not referenced in the code shown
  278. // predicate registers
  // The names below are documentation only: the code uses p6-p9 directly
  // (see the 8/17/00 history entry about an assembler bug with predicate macros).
  279. //atan2f_Pred_Swap = p6 // |y| > |x|
  280. //atan2f_Pred_noSwap = p7 // |y| <= |x|
  281. //atan2f_Pred_Xpos = p8 // x >= 0
  282. //atan2f_Pred_Xneg = p9 // x < 0
  283. .data
  284. .align 16
  // Constants stored as 64-bit double bit patterns. The order matters: atan2f
  // reads them with ldfpd, two adjacent entries at a time, stepping each
  // pointer by 16 bytes. atan2f_coef_table2 must start exactly 0x30 bytes
  // (six data8 entries) after atan2f_coef_table1, because atan2f computes its
  // address as table1 + 0x30 instead of loading it separately.
  285. atan2f_coef_table1:
  286. data8 0xBFD5555512191621 // p1
  287. data8 0x3F522E5D33BC9BAA // p10
  288. data8 0xBFA6E10BA401393F // p7
  289. data8 0x3FB142A73D7C54E3 // p6
  290. data8 0xBFC2473C5145EE38 // p3
  291. data8 0x3FC9997E7AFBFF4E // p2
  292. atan2f_coef_table2:
  293. data8 0xBF7DEAADAA336451 // p9
  294. data8 0x3F97105B4160F86B // p8
  295. data8 0xBFB68EED6A8CFA32 // p5
  296. data8 0x3FBC4F512B1865F5 // p4
  297. data8 0x3ff921fb54442d18 // pi/2
  298. data8 0x400921fb54442d18 // pi
  299. data8 0x3fe921fb54442d18 // pi/4
  300. data8 0x4002d97c7f3321d2 // 3pi/4
  301. .global atan2f
  302. .text
  303. .proc atan2f
  304. .align 32
  // atan2f: single precision arc tangent of y/x.
  // In:  f8 = y, f9 = x.  Out: f8 = result.
  // The main path below handles x and y that are both finite and nonzero.
  // If either input is zero, infinite or NaN the code branches to
  // ATAN2F_XY_INF_NAN_ZERO; from there the case x zero and y zero branches on
  // to __libm_error_region.
  // Predicates on the main path: p6 = swap (|y| > |x|), p7 = no swap,
  // p8 = x >= 0, p9 = x < 0.
  305. atan2f:
  306. { .mfi
  307. alloc r32 = ar.pfs,1,5,4,0 // frame: r32-r37 input and locals, r38-r41 outputs; r32 receives old ar.pfs
  308. frcpa.s1 atan2f_Z0,p0 = f1,f8 // Approx to 1/y
  309. nop.i 999
  310. }
  311. { .mfi
  312. addl atan2f_GR_Addr_1 = @ltoff(atan2f_coef_table1),gp // address of the linkage table entry for table1
  313. fma.s1 atan2f_xsq = f9,f9,f0 // xsq = x*x
  314. nop.i 999 ;;
  315. }
  316. { .mfi
  317. ld8 atan2f_GR_Addr_1 = [atan2f_GR_Addr_1] // Addr_1 = address of atan2f_coef_table1
  318. frcpa.s1 atan2f_A0,p0 = f1,f9 // Approx to 1/x
  319. nop.i 999
  320. }
  321. { .mfi
  322. nop.m 999
  323. fma.s1 atan2f_ysq = f8,f8,f0 // ysq = y*y
  324. nop.i 999 ;;
  325. }
  326. { .mfi
  327. nop.m 999
  328. fcmp.ge.s1 p8,p9 = f9,f0 // Set p8 if x>=0, p9 if x<0
  329. nop.i 999
  330. }
  331. { .mfi
  332. nop.m 999
  333. fma.s1 atan2f_xy = f9,f8,f0 // xy = x*y
  334. nop.i 999 ;;
  335. }
  336. { .mfi
  337. add atan2f_GR_Addr_2 = 0x30, atan2f_GR_Addr_1 // Addr_2 = table1 + 0x30 = atan2f_coef_table2
  338. fmerge.s atan2f_sgn_Y = f8,f1 // sgn_Y = 1.0 with the sign of y
  339. nop.i 999 ;;
  340. }
  341. { .mmf
  342. ldfpd atan2f_coef_p1,atan2f_coef_p10 = [atan2f_GR_Addr_1],16 // load p1,p10; advance Addr_1
  343. ldfpd atan2f_coef_p9,atan2f_coef_p8 = [atan2f_GR_Addr_2],16 // load p9,p8; advance Addr_2
  344. fclass.m p10,p0 = f9,0xe7 // Test x @inf|@snan|@qnan|@zero
  345. }
  346. ;;
  347. { .mfi
  348. ldfpd atan2f_coef_p7,atan2f_coef_p6 = [atan2f_GR_Addr_1],16 // load p7,p6; advance Addr_1
  349. fma.s1 atan2f_T_denom = atan2f_Z0,atan2f_xsq,f8 // T_denom = y + Z0*xsq
  350. nop.i 999
  351. }
  352. { .mfi
  353. ldfpd atan2f_coef_p5,atan2f_coef_p4 = [atan2f_GR_Addr_2],16 // load p5,p4; advance Addr_2
  354. fma.s1 atan2f_Z = atan2f_Z0,f9,f0 // Z = x*Z0
  355. nop.i 999 ;;
  356. }
  357. { .mfi
  358. ldfpd atan2f_coef_p3,atan2f_coef_p2 = [atan2f_GR_Addr_1],16 // load p3,p2
  359. fma.s1 atan2f_G_denom = atan2f_A0,atan2f_ysq,f9 // G_denom = x + A0*ysq
  360. nop.i 999
  361. }
  362. { .mfi
  363. ldfpd atan2f_const_piby2,atan2f_const_pi = [atan2f_GR_Addr_2],16 // load pi/2, pi; advance Addr_2
  364. fma.s1 atan2f_A = atan2f_A0,f8,f0 // A = y*A0
  365. nop.i 999 ;;
  366. }
  367. { .mfi
  368. ldfpd atan2f_const_piby4,atan2f_const_3piby4 = [atan2f_GR_Addr_2] // load pi/4, 3pi/4
  369. fclass.m p11,p0 = f8,0xe7 // Test y @inf|@snan|@qnan|@zero
  370. nop.i 999
  371. }
  372. { .mfb
  373. nop.m 999
  374. fnma.s1 atan2f_T_numer = atan2f_Z0,atan2f_xy,f9 // T_numer = x - Z0*xy
  375. (p10) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on x nan,inf,zero
  376. }
  377. // p6 if |y|>|x|, p7 if |x|>=|y| , use xsq and ysq for test
  378. { .mfi
  379. nop.m 999
  380. fcmp.gt.s1 p6,p7 = atan2f_ysq,atan2f_xsq
  381. nop.i 999
  382. }
  383. { .mfb
  384. nop.m 999
  385. fnma.s1 atan2f_G_numer = atan2f_A0,atan2f_xy,f8 // G_numer = y - A0*xy
  386. (p11) br.cond.spnt ATAN2F_XY_INF_NAN_ZERO ;; // Branch on y nan,inf,zero
  387. }
  388. { .mfi
  389. nop.m 999
  390. (p8) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f0,f0 // x>=0: const_1 = sgn_Y*0
  391. nop.i 999
  392. }
  393. { .mfi
  394. nop.m 999
  395. (p9) fma.s1 atan2f_const_1 = atan2f_sgn_Y,f1,f0 // x<0: const_1 = sgn_Y*1
  396. nop.i 999 ;;
  397. }
  398. { .mfi
  399. nop.m 999
  400. (p6) fnma.s1 atan2f_U = atan2f_Z,f1,f0 // swap: U = -Z
  401. nop.i 999
  402. }
  403. { .mfi
  404. nop.m 999
  405. (p6) fma.s1 atan2f_Usq = atan2f_Z,atan2f_Z,f0 // swap: Usq = Z*Z
  406. nop.i 999 ;;
  407. }
  408. { .mfi
  409. nop.m 999
  410. (p7) fma.s1 atan2f_U = atan2f_A,f1,f0 // no swap: U = A
  411. nop.i 999
  412. }
  413. { .mfi
  414. nop.m 999
  415. (p7) fma.s1 atan2f_Usq = atan2f_A,atan2f_A,f0 // no swap: Usq = A*A
  416. nop.i 999 ;;
  417. }
  418. { .mfi
  419. nop.m 999
  420. (p6) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_T_denom // swap: Q1 = approx to 1/T_denom
  421. nop.i 999
  422. }
  423. { .mfi
  424. nop.m 999
  425. (p6) fma.s1 atan2f_R_denom = atan2f_T_denom,f1,f0 // swap: R_denom = T_denom
  426. nop.i 999 ;;
  427. }
  428. { .mfi
  429. nop.m 999
  430. (p7) frcpa.s1 atan2f_Q1,p0 = f1,atan2f_G_denom // no swap: Q1 = approx to 1/G_denom
  431. nop.i 999
  432. }
  433. { .mfi
  434. nop.m 999
  435. (p7) fma.s1 atan2f_R_denom = atan2f_G_denom,f1,f0 // no swap: R_denom = G_denom
  436. nop.i 999 ;;
  437. }
  438. { .mfi
  439. nop.m 999
  440. (p6) fnma.s1 atan2f_R_numer = atan2f_T_numer,f1,f0 // swap: R_numer = -T_numer
  441. nop.i 999
  442. }
  443. { .mfi
  444. nop.m 999
  445. (p7) fma.s1 atan2f_R_numer = atan2f_G_numer,f1,f0 // no swap: R_numer = G_numer
  446. nop.i 999 ;;
  447. }
  448. { .mfi
  449. nop.m 999
  450. (p6) fnma.s1 atan2f_p1rnum = atan2f_T_numer,atan2f_coef_p1,f0 // swap: p1rnum = -p1*T_numer
  451. nop.i 999 ;;
  452. }
  453. { .mfi
  454. nop.m 999
  455. (p7) fma.s1 atan2f_p1rnum = atan2f_G_numer,atan2f_coef_p1,f0 // no swap: p1rnum = p1*G_numer
  456. nop.i 999 ;;
  457. }
  458. { .mfi
  459. nop.m 999
  460. fma.s1 atan2f_U4 = atan2f_Usq,atan2f_Usq,f0 // U4 = Usq*Usq
  461. nop.i 999
  462. }
  463. { .mfi
  464. nop.m 999
  465. fma.s1 atan2f_poly_u109 = atan2f_Usq,atan2f_coef_p10,atan2f_coef_p9 // p9 + p10*Usq
  466. nop.i 999 ;;
  467. }
  468. { .mfi
  469. nop.m 999
  470. fma.s1 atan2f_poly_u87 = atan2f_Usq,atan2f_coef_p8,atan2f_coef_p7 // p7 + p8*Usq
  471. nop.i 999
  472. }
  473. { .mfi
  474. nop.m 999
  475. fma.s1 atan2f_poly_u65 = atan2f_Usq,atan2f_coef_p6,atan2f_coef_p5 // p5 + p6*Usq
  476. nop.i 999 ;;
  477. }
  478. { .mfi
  479. nop.m 999
  480. fma.s1 atan2f_poly_u43 = atan2f_Usq,atan2f_coef_p4,atan2f_coef_p3 // p3 + p4*Usq
  481. nop.i 999
  482. }
  483. { .mfi
  484. nop.m 999
  485. fnma.s1 atan2f_Q_beta = atan2f_Q1,atan2f_R_denom,f1 // Q_beta = 1 - Q1*R_denom
  486. nop.i 999 ;;
  487. }
  488. { .mfi
  489. nop.m 999
  490. fma.s1 atan2f_poly_u21 = atan2f_Usq,atan2f_coef_p2,atan2f_coef_p1 // p1 + p2*Usq
  491. nop.i 999
  492. }
  493. { .mfi
  494. nop.m 999
  495. fma.s1 atan2f_r = atan2f_Q1,atan2f_R_numer,f0 // r = Q1*R_numer, limited precision quotient
  496. nop.i 999 ;;
  497. }
  498. { .mfi
  499. nop.m 999
  500. (p6) fma.s1 atan2f_C = atan2f_sgn_Y,atan2f_const_piby2,f0 // swap: C = sgn_Y*pi/2
  501. nop.i 999
  502. }
  503. { .mfi
  504. nop.m 999
  505. (p7) fma.s1 atan2f_C = atan2f_const_1,atan2f_const_pi,f0 // no swap: C = const_1*pi
  506. nop.i 999 ;;
  507. }
  508. { .mfi
  509. nop.m 999
  510. fma.s1 atan2f_U6 = atan2f_U4,atan2f_Usq,f0 // U6 = U4*Usq
  511. nop.i 999
  512. }
  513. { .mfi
  514. nop.m 999
  515. fma.s1 atan2f_U8 = atan2f_U4,atan2f_U4,f0 // U8 = U4*U4
  516. nop.i 999 ;;
  517. }
  518. { .mfi
  519. nop.m 999
  520. fma.s1 atan2f_poly_u10to7 = atan2f_U4,atan2f_poly_u109,atan2f_poly_u87 // poly_u87 + U4*poly_u109
  521. nop.i 999
  522. }
  523. { .mfi
  524. nop.m 999
  525. fma.s1 atan2f_pR = atan2f_p1rnum,atan2f_Q1,f0 // pR = p1rnum*Q1
  526. nop.i 999 ;;
  527. }
  528. { .mfi
  529. nop.m 999
  530. fma.s1 atan2f_poly_u6to3 = atan2f_U4,atan2f_poly_u65,atan2f_poly_u43 // poly_u43 + U4*poly_u65
  531. nop.i 999
  532. }
  533. { .mfi
  534. nop.m 999
  535. fma.s1 atan2f_Q2 = atan2f_Q1,atan2f_Q_beta,atan2f_Q1 // Q2 = Q1 + Q1*Q_beta
  536. nop.i 999 ;;
  537. }
  538. { .mfi
  539. nop.m 999
  540. fma.s1 atan2f_Q_beta2 = atan2f_Q_beta,atan2f_Q_beta,f0 // Q_beta2 = Q_beta*Q_beta
  541. nop.i 999
  542. }
  543. { .mfi
  544. nop.m 999
  545. fma.s1 atan2f_rsq = atan2f_r,atan2f_r,f0 // rsq = r*r
  546. nop.i 999 ;;
  547. }
  548. { .mfi
  549. nop.m 999
  550. fma.s1 atan2f_poly_u210 = atan2f_Usq,atan2f_poly_u21,f1 // 1 + Usq*poly_u21
  551. nop.i 999 ;;
  552. }
  553. { .mfi
  554. nop.m 999
  555. fcmp.eq.s0 p8,p0 = f8,f9 // Dummy op to set flag on denormal inputs
  556. nop.i 999
  557. }
  558. { .mfi
  559. nop.m 999
  560. fma.s1 atan2f_poly_u10to3 = atan2f_U8,atan2f_poly_u10to7,atan2f_poly_u6to3 // poly_u6to3 + U8*poly_u10to7
  561. nop.i 999 ;;
  562. }
  563. { .mfi
  564. nop.m 999
  565. fma.s1 atan2f_Q3 = atan2f_Q2,atan2f_Q_beta2,atan2f_Q2 // Q3 = Q2 + Q2*Q_beta2
  566. nop.i 999
  567. }
  568. { .mfi
  569. nop.m 999
  570. fma.s1 atan2f_pRC = atan2f_rsq,atan2f_pR,atan2f_C // pRC = C + rsq*pR
  571. nop.i 999 ;;
  572. }
  573. { .mfi
  574. nop.m 999
  575. fma.s1 atan2f_poly_u10to0 = atan2f_U6,atan2f_poly_u10to3,atan2f_poly_u210 // poly_u210 + U6*poly_u10to3
  576. nop.i 999 ;;
  577. }
  578. { .mfi
  579. nop.m 999
  580. fma.s1 atan2f_pQRC = atan2f_R_numer,atan2f_Q3,atan2f_pRC // pQRC = pRC + R_numer*Q3
  581. nop.i 999 ;;
  582. }
  583. { .mfb
  584. nop.m 999
  585. fma.s.s0 f8 = atan2f_U,atan2f_poly_u10to0,atan2f_pQRC // result = U*poly_u10to0 + pQRC, rounded to single
  586. br.ret.sptk b0 ;;
  587. }
  // Special-case path: reached when x or y is zero, infinite or NaN.
  // Cases are peeled off in this order: y NaN, x NaN, x +inf, y inf with x
  // not -inf, x -inf, then the zero cases. Predicates p6-p15 are reused here
  // with meanings different from the main path.
  588. ATAN2F_XY_INF_NAN_ZERO:
  589. { .mfi
  590. nop.m 999
  591. fclass.m p10,p0 = f8,0xc3 // Is y nan
  592. nop.i 999
  593. }
  594. ;;
  595. { .mfi
  596. nop.m 999
  597. fclass.m p12,p0 = f9,0xc3 // Is x nan
  598. nop.i 999
  599. }
  600. ;;
  601. { .mfi
  602. nop.m 999
  603. fclass.m p6,p0 = f9,0x21 // Is x +inf
  604. nop.i 999
  605. }
  606. { .mfb
  607. nop.m 999
  608. (p10) fma.s f8 = f9,f8,f0 // Result quietized y if y is nan
  609. (p10) br.ret.spnt b0 // Exit if y is nan
  610. }
  611. ;;
  612. { .mfi
  613. nop.m 999
  614. (p6) fclass.m.unc p7,p8 = f8,0x23 // x +inf, is y inf
  615. nop.i 999
  616. }
  617. { .mfb
  618. nop.m 999
  619. (p12) fnorm.s f8 = f9 // Result quietized x if x is nan, y not nan
  620. (p12) br.ret.spnt b0 // Exit if x is nan, y not nan
  621. }
  622. ;;
  623. // Here if x or y inf, or x or y zero
  624. { .mfi
  625. nop.m 999
  626. fcmp.eq.s0 p15,p0 = f8,f9 // Dummy op to set flag on denormal inputs
  627. nop.i 999
  628. }
  629. ;;
  630. { .mfi
  631. nop.m 999
  632. fclass.m p11,p12 = f9,0x22 // Is x -inf
  633. nop.i 999
  634. }
  635. { .mfb
  636. nop.m 999
  637. (p7) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4
  638. (p7) br.ret.spnt b0 // Exit if x +inf and y inf
  639. }
  640. ;;
  641. { .mfb
  642. nop.m 999
  643. (p8) fmerge.s f8 = f8,f0 // If x +inf and y not inf, result +-0
  644. (p8) br.ret.spnt b0 // Exit if x +inf and y not inf
  645. }
  646. ;;
  647. { .mfi
  648. nop.m 999
  649. (p12) fclass.m.unc p13,p0 = f8,0x23 // x not -inf, is y inf
  650. nop.i 999
  651. }
  652. ;;
  653. { .mfi
  654. nop.m 999
  655. (p11) fclass.m.unc p14,p15 = f8,0x23 // x -inf, is y inf
  656. nop.i 999
  657. }
  658. ;;
  659. { .mfi
  660. nop.m 999
  661. fclass.m p6,p7 = f9,0x7 // Is x zero
  662. nop.i 999
  663. }
  664. { .mfb
  665. nop.m 999
  666. (p13) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2
  667. (p13) br.ret.spnt b0 // Exit if x not -inf and y inf
  668. }
  669. ;;
  670. { .mfi
  671. nop.m 999
  672. (p14) fma.s f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4
  673. nop.i 999
  674. }
  675. { .mfb
  676. nop.m 999
  677. (p15) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi
  678. (p11) br.ret.spnt b0 // Exit if x -inf
  679. }
  680. ;;
  681. // Here if x or y zero
  682. { .mfi
  683. nop.m 999
  684. (p7) fclass.m.unc p8,p9 = f9,0x19 // x not zero, y zero, is x > zero
  685. nop.i 999
  686. }
  687. ;;
  688. { .mfi
  689. nop.m 999
  690. (p6) fclass.m.unc p10,p11 = f8,0x7 // x zero, is y zero
  691. nop.i 999
  692. }
  693. ;;
  694. { .mfi
  695. nop.m 999
  696. (p8) fmerge.s f8 = f8, f0 // x > zero and y zero, result is +-zero
  697. nop.i 999
  698. }
  699. { .mfb
  700. nop.m 999
  701. (p9) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi
  702. (p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero
  703. }
  704. ;;
  705. { .mfb
  706. nop.m 999
  707. (p11) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero
  708. br.ret.sptk b0 // Final special case exit
  709. }
  710. ;;
  711. .endp atan2f
  712. .proc __libm_error_region
  // Entered by branch from atan2f when x and y are both zero.
  // Computes the default result in f10, stores f8, f9 and f10 in a 64-byte
  // stack frame, calls __libm_error_support with error tag 38 in
  // GR_Parameter_TAG, then reloads the result slot into f8 and returns.
  // There is no alloc here: the GR_SAVE_* and GR_Parameter_* registers belong
  // to the frame allocated in atan2f. atan2f_sgn_Y and atan2f_const_pi are
  // still live from atan2f.
  // Slots relative to the new sp: +16 holds f8, +32 holds f9, +48 holds the result.
  713. __libm_error_region:
  714. .prologue
  715. mov GR_Parameter_TAG = 38 // error tag for atan2f(+-0,+-0), see header notes
  716. fclass.m p10,p11 = f9,0x5 // @zero | @pos
  717. ;;
  718. (p10) fmerge.s f10 = f8, f0 // x is +0: default result is zero with the sign of y
  719. (p11) fma.s f10 = atan2f_sgn_Y, atan2f_const_pi,f0 // x is not +0: default result is sgn_Y*pi
  720. ;;
  721. { .mfi
  722. add GR_Parameter_Y=-32,sp // Parameter 2 value
  723. nop.f 999
  724. .save ar.pfs,GR_SAVE_PFS
  725. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  726. }
  727. { .mfi
  728. .fframe 64
  729. add sp=-64,sp // Create new stack
  730. nop.f 0
  731. mov GR_SAVE_GP=gp // Save gp
  732. }
  733. ;;
  734. { .mmi
  735. stfs [GR_Parameter_Y] = f9,16 // Store Parameter 2 on stack
  736. add GR_Parameter_X = 16,sp // Parameter 1 address
  737. .save b0, GR_SAVE_B0
  738. mov GR_SAVE_B0=b0 // Save b0
  739. }
  740. ;;
  741. .body
  742. { .mib
  743. stfs [GR_Parameter_X] = f8 // Store Parameter 1 on stack
  744. add GR_Parameter_RESULT = 0,GR_Parameter_Y
  745. nop.b 0 // Parameter 3 address
  746. }
  747. { .mib
  748. stfs [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
  749. add GR_Parameter_Y = -16,GR_Parameter_Y
  750. br.call.sptk b0=__libm_error_support# // Call error handling function
  751. }
  752. ;;
  753. { .mmi
  754. nop.m 0
  755. nop.m 0
  756. add GR_Parameter_RESULT = 48,sp // recompute result address, overwritten by the call (see 8/15/00 history entry)
  757. };;
  758. { .mmi
  759. ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
  760. .restore
  761. add sp = 64,sp // Restore stack pointer
  762. mov b0 = GR_SAVE_B0 // Restore return address
  763. }
  764. ;;
  765. { .mib
  766. mov gp = GR_SAVE_GP // Restore gp
  767. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  768. br.ret.sptk b0 // Return
  769. }
  770. ;;
  771. .endp __libm_error_region
  // __libm_error_support is not defined in this file; it is declared here as
  // a global function symbol so the br.call in __libm_error_region can reference it.
  772. .type __libm_error_support#,@function
  773. .global __libm_error_support#