Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1004 lines
28 KiB

  1. .file "atan2.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00 Initial version
  29. // 4/04/00 Unwind support added
  30. // 8/15/00 Bundle added after call to __libm_error_support to properly
  31. // set [the previously overwritten] GR_Parameter_RESULT.
  32. // 8/17/00 Changed predicate register macro-usage to direct predicate
  33. // names due to an assembler bug.
  34. // 9/28/00 Updated to set invalid on SNaN inputs
  35. // 1/19/01 Fixed flags for small results
  36. // 4/13/01 Rescheduled to make all paths faster
  37. //
  38. // API
  39. //==============================================================
  40. // double atan2(double Y, double X)
  41. //
  42. // Overview of operation
  43. //==============================================================
  44. //
  45. // There are two basic paths: swap true and swap false.
  46. // atan2(Y,X) ==> atan(V/U) where |U| >= |V|. If |Y| > |X|, we must swap.
  47. //
  48. // p6 swap True |Y| > |X|
  49. // p7 swap False |Y| <= |X|
  50. // p8 X+ (If swap=True p8=p9=0)
  51. // p9 X-
  52. //
  53. // all the other predicates p10 thru p15 are false for the main path
  54. //
  55. // Simple trigonometric identities show
  56. // Region 1 (-45 to +45 degrees):
  57. // X>0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (0 + atan(V/U))
  58. //
  59. // Region 2 (-90 to -45 degrees, and +45 to +90 degrees):
  60. // X>0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U))
  61. //
  62. // Region 3 (-135 to -90 degrees, and +90 to +135 degrees):
  63. // X<0, |Y|>|X|, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 + atan(V/U))
  64. //
  65. // Region 4 (-180 to -135 degrees, and +135 to +180 degrees):
  66. // X<0, |Y|<=|X|, V=Y, U=X atan2(Y,X) = sgnY * (pi - atan(V/U))
  67. //
  68. // So the result is always of the form atan2(Y,X) = P + sgnXY * atan(V/U)
  69. //
  70. // We compute atan(V/U) from the identity
  71. // atan(V/U) = atan(z) + atan([(V/U)-z] / [1+(V/U)z])
  72. // where z is a limited precision approximation (16 bits) to V/U
  73. //
  74. // z is calculated with the assistance of the frcpa instruction.
  75. //
  76. // atan(z) is calculated by a polynomial z + z^3 * p(w), w=z^2
  77. // where p(w) = P0+P1*w+...+P22*w^22
  78. //
  79. // Let d = [(V/U)-z] / [1+(V/U)z] = (V-U*z)/(U+V*z)
  80. //
  81. // Approximate atan(d) by d + P0*d^3
  82. // Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8.
  83. // Compute q(a) = 1 + a + ... + a^5.
  84. // Then F*q(a) approximates the reciprocal to more than 50 bits.
  85. // Special values
  86. //==============================================================
  87. // Y x Result
  88. // +number +inf +0
  89. // -number +inf -0
  90. // +number -inf +pi
  91. // -number -inf -pi
  92. //
  93. // +inf +number +pi/2
  94. // -inf +number -pi/2
  95. // +inf -number +pi/2
  96. // -inf -number -pi/2
  97. //
  98. // +inf +inf +pi/4
  99. // -inf +inf -pi/4
  100. // +inf -inf +3pi/4
  101. // -inf -inf -3pi/4
  102. //
  103. // +1 +1 +pi/4
  104. // -1 +1 -pi/4
  105. // +1 -1 +3pi/4
  106. // -1 -1 -3pi/4
  107. //
  108. // +number +0 +pi/2
  109. // -number +0 -pi/2
  110. // +number -0 +pi/2
  111. // -number -0 -pi/2
  112. //
  113. // +0 +number +0
  114. // -0 +number -0
  115. // +0 -number +pi
  116. // -0 -number -pi
  117. //
  118. // +0 +0 +0
  119. // -0 +0 -0
  120. // +0 -0 +pi
  121. // -0 -0 -pi
  122. //
  123. // Nan anything quiet Y
  124. // anything NaN quiet X
  125. // atan2(+-0/+-0) sets double error tag to 37
  126. // Assembly macros
  127. //==============================================================
  // Symbolic names for the registers used by atan2 and by the
  // error-reporting stub that follows it.
  //
  // General registers. atan2 issues "alloc r32 = ar.pfs,1,5,4,0"
  // (1 input, 5 locals, 4 outputs), so r33-r37 fall in the local area
  // and r38-r41 in the output area of the register frame.
  // NOTE: atan2_GR_sml_exp and GR_SAVE_B0 both name r35;
  // atan2_GR_sml_exp is not referenced by the code in this file section.
  128. EXP_AD_P1 = r33 // pointer stepped through atan2_tb1
  129. EXP_AD_P2 = r34 // pointer stepped through atan2_tb2
  130. atan2_GR_sml_exp = r35
  131. GR_SAVE_B0 = r35 // b0 saved around the error-support call
  132. GR_SAVE_GP = r36 // gp saved around the error-support call
  133. GR_SAVE_PFS = r37 // ar.pfs saved around the error-support call
  134. GR_Parameter_X = r38 // address of the stack slot holding f9
  135. GR_Parameter_Y = r39 // address of the stack slot holding f8
  136. GR_Parameter_RESULT = r40 // address of the stack slot holding the result
  137. atan2_GR_tag = r41 // error tag (set to 37 for y=0, x=0)
  // Floating-point arguments: f8 also receives the final result.
  138. atan2_Y = f8
  139. atan2_X = f9
  // Reciprocal approximations and the refinement terms derived from them.
  140. atan2_u1_X = f32
  141. atan2_u1_Y = f33
  142. atan2_z2_X = f34
  143. atan2_z2_Y = f35
  144. atan2_two = f36
  145. atan2_B1sq_Y = f37
  146. atan2_z1_X = f38
  147. atan2_z1_Y = f39
  148. atan2_B1X = f40
  149. atan2_B1Y = f41
  150. atan2_wp_X = f42
  151. atan2_B1sq_X = f43
  152. atan2_z = f44
  153. atan2_w = f45
  // Polynomial coefficients P0..P22, loaded from atan2_tb1 / atan2_tb2.
  154. atan2_P0 = f46
  155. atan2_P1 = f47
  156. atan2_P2 = f48
  157. atan2_P3 = f49
  158. atan2_P4 = f50
  159. atan2_P5 = f51
  160. atan2_P6 = f52
  161. atan2_P7 = f53
  162. atan2_P8 = f54
  163. atan2_P9 = f55
  164. atan2_P10 = f56
  165. atan2_P11 = f57
  166. atan2_P12 = f58
  167. atan2_P13 = f59
  168. atan2_P14 = f60
  169. atan2_P15 = f61
  170. atan2_P16 = f62
  171. atan2_P17 = f63
  172. atan2_P18 = f64
  173. atan2_P19 = f65
  174. atan2_P20 = f66
  175. atan2_P21 = f67
  176. atan2_P22 = f68
  177. atan2_pi_by_2 = f69
  // Temporaries for the polynomial evaluation (V* / W* partial sums)
  // and for the reciprocal correction (E, F, alpha, C, d).
  178. atan2_V13 = f70
  179. atan2_W11 = f71
  180. atan2_E = f72
  181. atan2_wp_Y = f73
  182. atan2_V11 = f74
  183. atan2_V12 = f75
  184. atan2_V7 = f76
  185. atan2_V8 = f77
  186. atan2_W7 = f78
  187. atan2_W8 = f79
  188. atan2_W3 = f80
  189. atan2_W4 = f81
  190. atan2_V3 = f82
  191. atan2_V4 = f83
  192. atan2_F = f84
  193. atan2_gV = f85
  194. atan2_V10 = f86
  195. atan2_zcub = f87
  196. atan2_V6 = f88
  197. atan2_V9 = f89
  198. atan2_W10 = f90
  199. atan2_W6 = f91
  200. atan2_W2 = f92
  201. atan2_V2 = f93
  202. atan2_alpha = f94
  203. atan2_alpha_1 = f95
  204. atan2_gVF = f96
  205. atan2_V5 = f97
  206. atan2_W12 = f98
  207. atan2_W5 = f99
  208. atan2_alpha_sq = f100
  209. atan2_Cp = f101
  210. atan2_V1 = f102
  211. atan2_ysq = f103
  212. atan2_W1 = f104
  213. atan2_alpha_cub = f105
  214. atan2_C = f106
  215. atan2_xsq = f107
  216. atan2_d = f108
  217. atan2_A_hi = f109
  218. atan2_dsq = f110
  219. atan2_pd = f111
  220. atan2_A_lo = f112
  221. atan2_A = f113
  222. atan2_Pp = f114
  // Sign carrier and the remaining constants loaded from the tables.
  223. atan2_sgnY = f115
  224. atan2_pi = f116
  225. atan2_3pi_by_4 = f117
  226. atan2_pi_by_4 = f118
  227. // These coefficients are for atan2.
  228. // You can also use this set to substitute those used in the |X| <= 1 case for atan;
  229. // BUT NOT vice versa.
  230. /////////////////////////////////////////////////////////////
  // Table layout (do not reorder or resize without updating atan2):
  //  - every entry is a pair of data8 words, i.e. one 16-byte slot;
  //  - atan2 reads each table front to back with ldfe and a 16-byte
  //    post-increment, so the entry order here is the load order;
  //  - atan2 computes the address of atan2_tb2 as atan2_tb1 + 0xd0,
  //    which is exactly the 13 entries of atan2_tb1, so the two tables
  //    must remain contiguous.
  231. .data
  232. .align 16
  // atan2_tb1: P11 down to P0, then pi.
  233. atan2_tb1:
  234. data8 0xA21922DC45605EA1 , 0x00003FFA // P11
  235. data8 0xB199DD6D2675C40F , 0x0000BFFA // P10
  236. data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9
  237. data8 0xD78F28FC2A592781 , 0x0000BFFA // P8
  238. data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7
  239. data8 0x88887EBB209E3543 , 0x0000BFFB // P6
  240. data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5
  241. data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4
  242. data8 0xE38E38E320A8A098 , 0x00003FFB // P3
  243. data8 0x9249249247E37913 , 0x0000BFFC // P2
  244. data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1
  245. data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0
  246. data8 0xC90FDAA22168C235 , 0x00004000 // pi
  // atan2_tb2: P21 down to P12, then P22 (out of sequence), then
  // pi/2, pi/4 and 3pi/4.
  247. atan2_tb2:
  248. data8 0xCE585A259BD8374C , 0x00003FF0 // P21
  249. data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20
  250. data8 0x9D3436AABE218776 , 0x00003FF5 // P19
  251. data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18
  252. data8 0xF396268151CFB11C , 0x00003FF7 // P17
  253. data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16
  254. data8 0xA2270D30A90AA220 , 0x00003FF9 // P15
  255. data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14
  256. data8 0x80D601879218B53A , 0x00003FFA // P13
  257. data8 0x9297B23CCFFB291F , 0x0000BFFA // P12
  258. data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22
  259. data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2
  260. data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4
  261. data8 0x96cbe3f9990e91a8 , 0x00004000 // 3pi/4
  262. .align 32
  // ---------------------------------------------------------------
  // double atan2(double Y, double X)
  //   In:  f8 = Y, f9 = X
  //   Out: f8 = result
  //
  // Special operands (NaN, infinity, zero) are detected with fclass
  // and leave through predicated br.ret instructions. The case
  // y = 0 and x = 0 branches to ATAN2_ERROR, which prepares f10 and
  // the error tag and then runs off the end of this procedure into
  // the code that follows .endp.
  //
  // The coefficient loads and the main-path arithmetic are
  // interleaved with the special-case tests, and predicates p8-p15
  // are reused with different meanings along the way, so the order
  // of bundles and stops (;;) must not be changed casually.
  // ---------------------------------------------------------------
  263. .global atan2#
  264. ////////////////////////////////////////////////////////
  265. .section .text
  266. .proc atan2#
  267. .align 32
  268. atan2:
  269. { .mfi
  270. alloc r32 = ar.pfs,1,5,4,0 // 1 input, 5 locals, 4 outputs, 0 rotating
  271. frcpa.s1 atan2_u1_X,p6 = f1,atan2_X // u1_X ~ 1/X
  272. nop.i 999
  273. }
  274. { .mfi
  275. addl EXP_AD_P1 = @ltoff(atan2_tb1), gp // linkage-table slot for atan2_tb1
  276. fma.s1 atan2_two = f1,f1,f1 // two = 1*1 + 1
  277. nop.i 999
  278. ;;
  279. }
  280. { .mfi
  281. ld8 EXP_AD_P1 = [EXP_AD_P1] // EXP_AD_P1 -> atan2_tb1
  282. frcpa.s1 atan2_u1_Y,p7 = f1,atan2_Y // u1_Y ~ 1/Y
  283. nop.i 999
  284. }
  285. { .mfi
  286. nop.m 999
  287. fma.s1 atan2_xsq = atan2_X,atan2_X,f0 // xsq = X*X
  288. nop.i 999
  289. ;;
  290. }
  291. { .mfi
  292. nop.m 999
  293. fclass.m p10,p0 = atan2_Y, 0xc3 // Test for y=nan
  294. nop.i 999
  295. }
  296. { .mfi
  297. nop.m 999
  298. fma.s1 atan2_ysq = atan2_Y,atan2_Y,f0 // ysq = Y*Y
  299. nop.i 999
  300. }
  301. ;;
  302. { .mfi
  303. add EXP_AD_P2 = 0xd0,EXP_AD_P1 // EXP_AD_P2 -> atan2_tb2 (13 entries * 16 bytes past tb1)
  304. fclass.m p12,p0 = atan2_X, 0xc3 // Test for x nan
  305. nop.i 999
  306. }
  307. ;;
  308. // p10 Y NAN, quiet and return
  309. { .mfi
  310. ldfe atan2_P11 = [EXP_AD_P1],16
  311. fmerge.s atan2_sgnY = atan2_Y,f1 // sgnY = 1.0 carrying the sign of Y
  312. nop.i 999
  313. }
  314. { .mfb
  315. ldfe atan2_P21 = [EXP_AD_P2],16
  316. (p10) fma.d f8 = atan2_Y,atan2_X,f0 // If y=nan, result quietized y
  317. (p10) br.ret.spnt b0 // Exit if y=nan
  318. ;;
  319. }
  // First-stage quotient pieces. With u1_X ~ 1/X:
  //   z1_X = u1_X*Y, B1X = 2 - u1_X*X, and later z = z1_X*B1X ~ Y/X.
  // The *_Y versions are the same with X and Y exchanged (~ X/Y).
  320. { .mfi
  321. ldfe atan2_P10 = [EXP_AD_P1],16
  322. fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0
  323. nop.i 999
  324. }
  325. { .mfi
  326. ldfe atan2_P20 = [EXP_AD_P2],16
  327. fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two
  328. nop.i 999
  329. ;;
  330. }
  331. { .mfi
  332. ldfe atan2_P9 = [EXP_AD_P1],16
  333. fma.s1 atan2_z1_Y = atan2_u1_Y, atan2_X, f0
  334. nop.i 999
  335. }
  336. { .mfi
  337. ldfe atan2_P19 = [EXP_AD_P2],16
  338. fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two
  339. nop.i 999
  340. }
  341. ;;
  342. { .mfi
  343. ldfe atan2_P8 = [EXP_AD_P1],16
  344. fma.s1 atan2_z2_X = atan2_u1_X, atan2_ysq, f0 // z2_X = u1_X * Y^2
  345. nop.i 999
  346. }
  347. { .mfi
  348. ldfe atan2_P18 = [EXP_AD_P2],16
  349. fma.s1 atan2_z2_Y = atan2_u1_Y, atan2_xsq, f0 // z2_Y = u1_Y * X^2
  350. nop.i 999
  351. }
  352. ;;
  353. // p10 ==> x inf y ?
  354. // p11 ==> x !inf y ?
  355. { .mfi
  356. ldfe atan2_P7 = [EXP_AD_P1],16
  357. fclass.m p10,p11 = atan2_X, 0x23 // test for x inf
  358. nop.i 999
  359. }
  360. { .mfb
  361. ldfe atan2_P17 = [EXP_AD_P2],16
  362. (p12) fma.d f8 = atan2_X,atan2_Y,f0 // If x nan, result quiet x
  363. (p12) br.ret.spnt b0 // Exit for x nan
  364. ;;
  365. }
  366. // p6 true if swap, means |y| > |x| or ysq > xsq
  367. // p7 true if no swap, means |x| >= |y| or xsq >= ysq
  // (this overwrites the p6/p7 values written by the frcpa instructions above)
  368. { .mmf
  369. ldfe atan2_P6 = [EXP_AD_P1],16
  370. ldfe atan2_P16 = [EXP_AD_P2],16
  371. fcmp.ge.s1 p7,p6 = atan2_xsq, atan2_ysq
  372. ;;
  373. }
  374. { .mfi
  375. ldfe atan2_P5 = [EXP_AD_P1],16
  376. fma.s1 atan2_wp_X = atan2_z1_X, atan2_z1_X, f0
  377. nop.i 999
  378. }
  379. { .mfi
  380. ldfe atan2_P15 = [EXP_AD_P2],16
  381. fma.s1 atan2_B1sq_X = atan2_B1X, atan2_B1X, f0
  382. nop.i 999
  383. ;;
  384. }
  385. { .mfi
  386. ldfe atan2_P4 = [EXP_AD_P1],16
  387. (p6) fma.s1 atan2_wp_Y = atan2_z1_Y, atan2_z1_Y, f0
  388. nop.i 999
  389. }
  390. { .mfi
  391. ldfe atan2_P14 = [EXP_AD_P2],16
  392. (p6) fma.s1 atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0
  393. nop.i 999
  394. ;;
  395. }
  // E approximates U + V*z: swap gives Y + X*z, no swap gives X + Y*z.
  396. { .mfi
  397. ldfe atan2_P3 = [EXP_AD_P1],16
  398. (p6) fma.s1 atan2_E = atan2_z2_Y, atan2_B1Y, atan2_Y
  399. nop.i 999
  400. }
  401. { .mfi
  402. ldfe atan2_P13 = [EXP_AD_P2],16
  403. (p7) fma.s1 atan2_E = atan2_z2_X, atan2_B1X, atan2_X
  404. nop.i 999
  405. ;;
  406. }
  // z approximates V/U: swap gives ~X/Y, no swap gives ~Y/X.
  407. { .mfi
  408. ldfe atan2_P2 = [EXP_AD_P1],16
  409. (p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0
  410. nop.i 999
  411. }
  412. { .mfi
  413. ldfe atan2_P12 = [EXP_AD_P2],16
  414. (p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0
  415. nop.i 999
  416. ;;
  417. }
  418. { .mmf
  419. ldfe atan2_P1 = [EXP_AD_P1],16
  420. ldfe atan2_P22 = [EXP_AD_P2],16
  421. fcmp.eq.s0 p14,p15=atan2_X,atan2_Y // Dummy for denorm and invalid
  422. ;;
  423. }
  424. // p12 ==> x inf y inf
  425. // p13 ==> x inf y !inf
  426. { .mmf
  427. ldfe atan2_P0 = [EXP_AD_P1],16
  428. ldfe atan2_pi_by_2 = [EXP_AD_P2],16
  429. (p10) fclass.m.unc p12,p13 = atan2_Y, 0x23 // x inf, test if y inf
  430. ;;
  431. }
  // w = z^2, formed as (z1^2) * (B1^2) for the selected operand order.
  432. { .mfi
  433. ldfe atan2_pi = [EXP_AD_P1],16
  434. (p6) fma.s1 atan2_w = atan2_wp_Y, atan2_B1sq_Y,f0
  435. nop.i 999
  436. }
  437. { .mfi
  438. ldfe atan2_pi_by_4 = [EXP_AD_P2],16
  439. (p7) fma.s1 atan2_w = atan2_wp_X, atan2_B1sq_X,f0
  440. nop.i 999
  441. ;;
  442. }
  443. { .mfi
  444. ldfe atan2_3pi_by_4 = [EXP_AD_P2],16
  445. (p11) fclass.m.unc p9,p0 = atan2_Y, 0x23 // x not inf, test if y inf
  446. nop.i 999
  447. ;;
  448. }
  449. { .mfi
  450. nop.m 999
  451. (p12) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x inf, y inf, test if x +inf
  452. nop.i 999
  453. }
  // gV = V - U*z, the numerator of d.
  454. { .mfi
  455. nop.m 999
  456. (p6) fnma.s1 atan2_gV = atan2_Y, atan2_z, atan2_X
  457. nop.i 999
  458. ;;
  459. }
  460. { .mfi
  461. nop.m 999
  462. frcpa.s1 atan2_F,p0 = f1, atan2_E // F ~ 1/E
  463. nop.i 999
  464. }
  465. { .mfi
  466. nop.m 999
  467. (p7) fnma.s1 atan2_gV = atan2_X, atan2_z, atan2_Y
  468. nop.i 999
  469. ;;
  470. }
  471. // p13 ==> x inf y !inf
  472. { .mfi
  473. nop.m 999
  474. (p13) fcmp.gt.unc.s1 p14,p15 = atan2_X,f0 // x inf, y !inf, test if x +inf
  475. nop.i 999
  476. }
  477. { .mfb
  478. nop.m 999
  479. (p9) fma.d f8 = atan2_sgnY, atan2_pi_by_2, f0 // +-pi/2 if x !inf, y inf
  480. (p9) br.ret.spnt b0 // exit if x not inf, y inf, result is +-pi/2
  481. ;;
  482. }
  // Polynomial p(w) is evaluated as two halves combined at the end:
  //   V1 from P0..P11 and W1 from P12..P22, with Pp = V1 + w^12 * W1.
  // Each half is built from pairwise terms (w*P(k+1) + Pk) that are
  // merged using w^2 (V12) and w^4 (V9).
  483. { .mfi
  484. nop.m 999
  485. fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10
  486. nop.i 999
  487. }
  488. { .mfi
  489. nop.m 999
  490. fma.s1 atan2_W11 = atan2_w, atan2_P21, atan2_P20
  491. nop.i 999
  492. ;;
  493. }
  494. { .mfi
  495. nop.m 999
  496. fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8
  497. nop.i 999
  498. }
  499. { .mfi
  500. nop.m 999
  501. fma.s1 atan2_V12 = atan2_w, atan2_w, f0 // V12 = w^2
  502. nop.i 999
  503. ;;
  504. }
  505. { .mfi
  506. nop.m 999
  507. fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6
  508. nop.i 999
  509. }
  510. { .mfi
  511. nop.m 999
  512. fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18
  513. nop.i 999
  514. ;;
  515. }
  516. { .mfi
  517. nop.m 999
  518. fnma.s1 atan2_alpha = atan2_E, atan2_F, f1 // alpha = 1 - E*F
  519. nop.i 999
  520. }
  521. { .mfi
  522. nop.m 999
  523. fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two // alpha_1 = 2 - E*F = 1 + alpha
  524. nop.i 999
  525. ;;
  526. }
  527. { .mfi
  528. nop.m 999
  529. fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4
  530. nop.i 999
  531. }
  532. { .mfi
  533. nop.m 999
  534. fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16
  535. nop.i 999
  536. ;;
  537. }
  538. { .mfi
  539. nop.m 999
  540. fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2
  541. nop.i 999
  542. }
  543. { .mfi
  544. nop.m 999
  545. fma.s1 atan2_W4 = atan2_w, atan2_P15, atan2_P14
  546. nop.i 999
  547. ;;
  548. }
  549. { .mfi
  550. nop.m 999
  551. fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0
  552. nop.i 999
  553. }
  554. { .mfi
  555. nop.m 999
  556. fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12
  557. nop.i 999
  558. ;;
  559. }
  560. { .mfi
  561. nop.m 999
  562. fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11
  563. nop.i 999
  564. }
  565. { .mfi
  566. nop.m 999
  567. fma.s1 atan2_gVF = atan2_gV, atan2_F, f0 // gVF = (V - U*z) * F
  568. nop.i 999
  569. ;;
  570. }
  571. { .mfi
  572. nop.m 999
  573. fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0
  574. nop.i 999
  575. }
  576. { .mfi
  577. nop.m 999
  578. fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1 // Cp = 1 + alpha + alpha^2
  579. nop.i 999
  580. ;;
  581. }
  582. { .mfi
  583. nop.m 999
  584. fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0 // V9 = w^4
  585. nop.i 999
  586. }
  587. { .mfi
  588. nop.m 999
  589. fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11
  590. nop.i 999
  591. ;;
  592. }
  593. { .mfi
  594. nop.m 999
  595. fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7
  596. nop.i 999
  597. }
  598. { .mfi
  599. nop.m 999
  600. fma.s1 atan2_W6 = atan2_V12, atan2_W8 , atan2_W7
  601. nop.i 999
  602. ;;
  603. }
  604. { .mfi
  605. nop.m 999
  606. fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3
  607. nop.i 999
  608. }
  609. { .mfi
  610. nop.m 999
  611. fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3
  612. nop.i 999
  613. ;;
  614. }
  615. // p8 ==> y 0 x?
  616. // p9 ==> y !0 x?
  617. { .mfi
  618. nop.m 999
  619. fclass.m p8,p9 = atan2_Y, 0x07 // Test for y=0
  620. nop.i 999
  621. }
  622. { .mfi
  623. nop.m 999
  624. fma.s1 atan2_zcub = atan2_z, atan2_w, f0 // zcub = z * z^2
  625. nop.i 999
  626. ;;
  627. }
  628. { .mfi
  629. nop.m 999
  630. fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0
  631. nop.i 999
  632. }
  633. { .mfi
  634. nop.m 999
  635. fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0 // C = gVF * (1 + alpha + alpha^2)
  636. nop.i 999
  637. ;;
  638. }
  639. // p12 ==> y0 x0
  640. // p13 ==> y0 x!0
  641. { .mfi
  642. nop.m 999
  643. (p8) fclass.m.unc p12,p13 = atan2_X, 0x07 // y=0, test if x is 0
  644. nop.i 999
  645. }
  646. { .mfi
  647. nop.m 999
  648. fma.s1 atan2_W12 = atan2_V9, atan2_V9, f0 // W12 = w^8 (raised to w^12 below)
  649. nop.i 999
  650. ;;
  651. }
  652. { .mfi
  653. nop.m 999
  654. fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6
  655. nop.i 999
  656. }
  657. { .mfi
  658. nop.m 999
  659. fma.s1 atan2_W5 = atan2_V9, atan2_W10, atan2_W6
  660. nop.i 999
  661. ;;
  662. }
  663. // p9 ==> y!0 x0
  664. { .mfi
  665. nop.m 999
  666. (p9) fclass.m.unc p9,p0 = atan2_X, 0x07 // y not 0, test if x is 0
  667. nop.i 999
  668. }
  669. // p10 ==> X +INF, Y +-INF
  670. { .mfb
  671. nop.m 999
  672. (p10) fma.d f8 = atan2_sgnY, atan2_pi_by_4, f0 // x=+inf, y=inf
  673. (p10) br.ret.spnt b0 // Exit for x=+inf, y=inf, result is +-pi/4
  674. ;;
  675. }
  // Predicate relation hint for the assembler: p11 and p14 are declared
  // mutually exclusive, so the two writes to f8 below do not conflict.
  676. .pred.rel "mutex",p11,p14
  677. { .mfi
  678. nop.m 999
  679. (p14) fmerge.s f8 = atan2_sgnY, f0 // x=+inf, y !inf, result +-0
  680. nop.i 999
  681. }
  682. // p11 ==> X -INF, Y +-INF
  683. { .mfb
  684. nop.m 999
  685. (p11) fma.d f8 = atan2_sgnY, atan2_3pi_by_4, f0 // x=-inf, y=inf
  686. (p11) br.ret.spnt b0 // Exit for x=-inf, y=inf, result is +-3pi/4
  687. ;;
  688. }
  // From here p10/p11 are reused for the y=0, x!=0 case (x>0 / not x>0).
  689. { .mfi
  690. nop.m 999
  691. (p13) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x not 0, y=0, test if x>0
  692. nop.i 999
  693. }
  694. { .mfb
  695. nop.m 999
  696. fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C // d = C * (1 + alpha^3)
  697. (p14) br.ret.spnt b0 // Exit if x=+inf, y !inf, result +-0
  698. ;;
  699. }
  700. { .mfi
  701. nop.m 999
  702. fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0 // W12 = w^4 * w^8 = w^12
  703. nop.i 999
  704. }
  705. { .mfb
  706. nop.m 999
  707. (p9) fma.d f8 = atan2_sgnY, atan2_pi_by_2, f0 // x=0, y not 0
  708. (p9) br.ret.spnt b0 // Exit if x=0 and y not 0, result is +-pi/2
  709. ;;
  710. }
  711. { .mfi
  712. nop.m 999
  713. fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2
  714. nop.i 999
  715. }
  716. { .mfb
  717. nop.m 999
  718. fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2
  719. (p12) br.spnt ATAN2_ERROR // Branch if x=0 and y=0
  720. ;;
  721. }
  722. { .mfi
  723. nop.m 999
  724. (p10) fmerge.s f8 = atan2_sgnY, f0 // +-0 if x>0, y=0
  725. nop.i 999
  726. }
  727. { .mfb
  728. nop.m 999
  729. (p11) fma.d f8 = atan2_sgnY, atan2_pi, f0 // +-pi if x<0, y=0
  730. (p13) br.ret.spnt b0 // Exit if x!0 and y=0
  731. ;;
  732. }
  733. { .mfb
  734. nop.m 999
  735. (p15) fma.d f8 = atan2_sgnY, atan2_pi, f0
  736. (p15) br.ret.spnt b0 // Exit if x=-inf, y !inf, result +-pi
  737. ;;
  738. }
  // Main path only from here on: assemble atan(z) and atan(d).
  739. { .mfi
  740. nop.m 999
  741. fma.s1 atan2_pd = atan2_P0, atan2_d, f0 // pd = P0 * d
  742. nop.i 999
  743. }
  744. { .mfi
  745. nop.m 999
  746. fma.s1 atan2_dsq = atan2_d, atan2_d, f0 // dsq = d^2
  747. nop.i 999
  748. ;;
  749. }
  750. { .mfi
  751. nop.m 999
  752. fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1 // Pp = V1 + w^12 * W1
  753. nop.i 999
  754. ;;
  755. }
  756. // p8 true if no swap and X positive
  757. // p9 true if no swap and X negative
  758. // both are false if swap is true
  759. { .mfi
  760. nop.m 999
  761. (p7) fcmp.ge.unc.s1 p8,p9 = atan2_X,f0
  762. nop.i 999
  763. }
  764. { .mfi
  765. nop.m 999
  766. fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d // A_lo = d + P0*d^3
  767. nop.i 999
  768. ;;
  769. }
  770. { .mfi
  771. nop.m 999
  772. fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z // A_hi = z + z^3 * Pp
  773. nop.i 999
  774. ;;
  775. }
  776. { .mfi
  777. nop.m 999
  778. fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo // A = A_hi + A_lo
  779. nop.i 999
  780. }
  781. // For |Y| <= |X| and X > 0, result is A_hi + A_lo
  782. { .mfi
  783. nop.m 999
  784. (p8) fma.d f8 = atan2_A_hi, f1, atan2_A_lo
  785. nop.i 999
  786. ;;
  787. }
  788. // For |Y| > |X|, result is +- pi/2 - (A_hi + A_lo)
  789. { .mfi
  790. nop.m 999
  791. (p6) fms.d f8 = atan2_sgnY, atan2_pi_by_2, atan2_A
  792. nop.i 999
  793. }
  794. // For |Y| <= |X|, and X < 0, result is +- pi + (A_hi + A_lo)
  795. { .mfb
  796. nop.m 999
  797. (p9) fma.d f8 = atan2_sgnY, atan2_pi, atan2_A
  798. br.ret.sptk b0
  799. ;;
  800. }
  801. ATAN2_ERROR:
  802. // Here if x=0 and y=0
  // The default result is placed in f10 and the error tag in
  // atan2_GR_tag. There is no branch or return after the last bundle:
  // execution continues into the code that follows .endp.
  803. { .mfi
  804. nop.m 999
  805. fclass.m p10,p11 = atan2_X,0x05 // Test if x=+0
  806. nop.i 999
  807. }
  808. ;;
  809. { .mfi
  810. mov atan2_GR_tag = 37
  811. (p10) fmerge.s f10 = atan2_sgnY, f0 // x=+0, y=0
  812. nop.i 999
  813. }
  814. { .mfi
  815. nop.m 999
  816. (p11) fma.d f10 = atan2_sgnY, atan2_pi, f0 // x=-0, y=0
  817. nop.i 999
  818. ;;
  819. }
  820. .endp atan2#
  821. // Stack operations when calling error support.
  822. // (1) (2) (3) (call) (4)
  823. // sp -> + psp -> + psp -> + sp -> +
  824. // | | | |
  825. // | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
  826. // | | | |
  827. // | <-GR_Y Y2->| Y2 ->| <- GR_Y |
  828. // | | | |
  829. // | | <- GR_X X1 ->| |
  830. // | | | |
  831. // sp-64 -> + sp -> + sp -> + +
  832. // save ar.pfs save b0 restore gp
  833. // save gp restore ar.pfs
  // ---------------------------------------------------------------
  // __libm_error_region
  // Reached by fall-through from the ATAN2_ERROR path of atan2 (no
  // branch to this label appears in the code above).
  //   In:  f8, f9      = the two atan2 operands
  //        f10         = default result prepared by ATAN2_ERROR
  //        atan2_GR_tag = error tag
  //   Out: f8          = value read back from the result slot after
  //                      __libm_error_support returns
  // A 64-byte frame is created; relative to the new sp the slots are
  //   sp+16 : f9   (address in GR_Parameter_X)
  //   sp+32 : f8   (address in GR_Parameter_Y)
  //   sp+48 : f10  (address in GR_Parameter_RESULT)
  // ar.pfs, gp and b0 are saved in GR_SAVE_PFS / GR_SAVE_GP /
  // GR_SAVE_B0 and restored before returning to atan2's caller.
  // ---------------------------------------------------------------
  834. .proc __libm_error_region
  835. __libm_error_region:
  836. .prologue
  837. // (1)
  838. { .mfi
  839. add GR_Parameter_Y=-32,sp // Parameter 2 value
  840. nop.f 999
  841. .save ar.pfs,GR_SAVE_PFS
  842. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  843. }
  844. { .mfi
  845. .fframe 64
  846. add sp=-64,sp // Create new stack
  847. nop.f 0
  848. mov GR_SAVE_GP=gp // Save gp
  849. };;
  850. // (2)
  // The post-increment leaves GR_Parameter_Y pointing at the result slot (sp+48).
  851. { .mmi
  852. stfd [GR_Parameter_Y] = f8,16 // STORE Parameter 2 on stack
  853. add GR_Parameter_X = 16,sp // Parameter 1 address
  854. .save b0, GR_SAVE_B0
  855. mov GR_SAVE_B0=b0 // Save b0
  856. };;
  857. .body
  858. // (3)
  859. { .mib
  860. stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack
  861. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
  862. nop.b 0
  863. }
  864. { .mib
  865. stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack
  866. add GR_Parameter_Y = -16,GR_Parameter_Y
  867. br.call.sptk b0=__libm_error_support# // Call error handling function
  868. };;
  // GR_Parameter_RESULT is recomputed after the call rather than reused
  // (see the 8/15/00 history note: it was previously overwritten).
  869. { .mmi
  870. nop.m 0
  871. nop.m 0
  872. add GR_Parameter_RESULT = 48,sp
  873. };;
  874. // (4)
  875. { .mmi
  876. ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
  877. .restore
  878. add sp = 64,sp // Restore stack pointer
  879. mov b0 = GR_SAVE_B0 // Restore return address
  880. };;
  881. { .mib
  882. mov gp = GR_SAVE_GP // Restore gp
  883. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  884. br.ret.sptk b0 // Return
  885. };;
  886. .endp __libm_error_region
  // External error handler called above; declared here, defined elsewhere.
  887. .type __libm_error_support#,@function
  888. .global __libm_error_support#