Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1216 lines
31 KiB

  1. .file "libm_atan2_reg.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00: Initial version
  29. // 4/04/00 Unwind support added
  30. .data
  // Constants_atan: lookup data for __libm_atan2_reg, laid out as 16-byte rows.
  // Byte offsets from Constants_atan, as used by the loads in the code:
  //      0        double pi/2, single lo_pi/2, single 2**(-3)
  //     16..128   P_1..P_8 (one 16-byte row each, fetched with ldfe)
  //    144..192   Q_1..Q_4 (one 16-byte row each, fetched with ldfe)
  //    208        Tbl_hi (double) / Tbl_lo (single) for B = 1+1/32
  //    224..976   Tbl_hi/Tbl_lo, three groups of 16 rows (2^-1, 2^-2, 2^-3)
  //    992..1040  I, I_by_2, I_by_4, 3I_by_4, each as a pair of doubles
  // NOTE(review): the Tbl_hi/Tbl_lo values look like atan(B) split into a
  // double head and a single tail - confirm against the algorithm write-up.
  31. .align 64
  32. Constants_atan:
  33. data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
  34. // double pi/2, single lo_pi/2, two**(-3)
  35. data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
  36. data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
  37. data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
  38. data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
  39. data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
  40. data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
  41. data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
  42. data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
  43. data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
  44. data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
  45. data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
  46. data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
  47. // Entries Tbl_hi (double precision)
  48. // B = 1+Index/16+1/32 Index = 0
  49. // Entries Tbl_lo (single precision)
  50. // B = 1+Index/16+1/32 Index = 0
  51. data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
  52. // Entries Tbl_hi (double precision) Index = 0,1,...,15
  53. // B = 2^(-1)*(1+Index/16+1/32)
  54. // Entries Tbl_lo (single precision)
  55. // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
  56. data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
  57. data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
  58. data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
  59. data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
  60. data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
  61. data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
  62. data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
  63. data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
  64. data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
  65. data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
  66. data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
  67. data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
  68. data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
  69. data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
  70. data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
  71. data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
  72. //
  73. // Entries Tbl_hi (double precision) Index = 0,1,...,15
  74. // B = 2^(-2)*(1+Index/16+1/32)
  75. // Entries Tbl_lo (single precision)
  76. // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
  77. //
  78. data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
  79. data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
  80. data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
  81. data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
  82. data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
  83. data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
  84. data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
  85. data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
  86. data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
  87. data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
  88. data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
  89. data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
  90. data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
  91. data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
  92. data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
  93. data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
  94. //
  95. // Entries Tbl_hi (double precision) Index = 0,1,...,15
  96. // B = 2^(-3)*(1+Index/16+1/32)
  97. // Entries Tbl_lo (single precision)
  98. // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
  99. //
  100. data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
  101. data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
  102. data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
  103. data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
  104. data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
  105. data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
  106. data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
  107. data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
  108. data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
  109. data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
  110. data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
  111. data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
  112. data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
  113. data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
  114. data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
  115. data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
  116. data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // I two doubles
  117. data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // I_by_2 two dbls
  118. data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // I_by_4 two dbls
  119. data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3I_by_4 two dbls
  120. .section .text
  // __libm_atan2_reg: atan2 kernel that hands its result back in registers.
  //   In:   f8 = ArgY, f9 = ArgX (copied to f32 / f33 below).
  //   Out:  written at RETURN_ATAN: f8 = Z_hi (f56), f10 = Z_lo (f55),
  //         f11 = s_Y (f34).
  // This entry section screens NatVal / unsupported / NaN operands, forms
  // U = max(|X|,|Y|) and V = min(|X|,|Y|), records swap and sign data, and
  // issues E = frcpa(V,U).  The predicate written by frcpa selects
  // ATAN_STEP2 (set) or ATAN_SPECIAL_HANDLING (clear).
  121. .proc __libm_atan2_reg#
  122. .global __libm_atan2_reg#
  123. .align 64
  124. __libm_atan2_reg:
  125. { .mfi
  126. alloc r32 = ar.pfs,0,20,4,0
  127. mov f32 = f8 // f32 = working copy of ArgY
  128. nop.i 0
  129. }
  130. { .mmi
  131. nop.m 0
  132. addl r39 = @ltoff(Constants_atan#), gp // r39 = linkage-table slot for the table
  133. nop.i 999
  134. }
  135. ;;
  136. { .mmi
  137. ld8 r39 = [r39] // r39 = address of Constants_atan
  138. nop.m 999
  139. nop.i 999
  140. }
  141. ;;
  142. { .mfi
  143. nop 999 // EMbo added ...
  144. mov f33 = f9 // f33 = working copy of ArgX
  145. nop.i 0
  146. } { .mfi
  147. nop 999 // EMbo added ...
  148. fclass.nm.unc p9,p0 = f32 ,0x1FF // p9: ArgY matches none of the 0x1FF classes
  149. nop 999;; // EMbo added ...
  150. } { .mfi
  151. nop 999 // EMbo added ...
  152. fclass.nm.unc p8,p0 = f33 ,0x1FF // p8: same test for ArgX
  153. nop 999 // EMbo added ...
  154. } { .mfi
  155. nop 999 // EMbo added ...
  156. fclass.m.unc p6,p0 = f33 ,0x103 // p6: ArgX is a NatVal (see check list below)
  157. nop 999;; // EMbo added ...
  158. } { .mfi
  159. nop 999 // EMbo added ...
  160. fclass.m.unc p7,p0 = f32 ,0x103 // p7: ArgY is a NatVal
  161. nop 999 // EMbo added ...
  162. } { .mfi
  163. nop 999 // EMbo added ...
  164. fclass.m.unc p12,p0 = f33 ,0x0C3 // p12: ArgX is a NaN
  165. nop 999;; // EMbo added ...
  166. } { .mfb
  167. nop 999 // EMbo added ...
  168. //
  169. // Check for NatVals.
  170. // Check for EM Unsupporteds
  171. // Check for NaNs.
  172. //
  173. fclass.m.unc p13,p0 = f32 ,0x0C3 // p13: ArgY is a NaN
  174. (p6) br.cond.sptk ATAN_NATVAL;;
  175. } { .mbb
  176. nop 999 // EMbo added ...
  177. (p7) br.cond.sptk ATAN_NATVAL
  178. (p8) br.cond.sptk ATAN_UNSUPPORTED;;
  179. } { .mib
  180. add r40 = 96, r39 // NOTE(review): r40 is recomputed in ATAN_STEP2 before any visible use
  181. nop 999 // EMbo added ...
  182. (p9) br.cond.sptk ATAN_UNSUPPORTED;;
  183. } { .mib
  184. ldfd f50 = [r39],8 // f50 = pi/2 (table offset 0)
  185. nop 999 // EMbo added ...
  186. (p12) br.cond.sptk ATAN_NAN;;
  187. } { .mfb
  188. nop 999 // EMbo added ...
  189. fnorm.s1 f33 = f33
  190. (p13) br.cond.sptk ATAN_NAN;;
  191. } { .mfi
  192. ldfs f51 = [r39],4 // f51 = lo_pi/2 (table offset 8)
  193. //
  194. // Remove sign bits from exponents
  195. // Load 2**(-3)
  196. // Normalize the input argument.
  197. //
  198. fnorm.s1 f32 = f32
  199. nop 999 // EMbo added ...
  200. } { .mfi
  201. nop 999 // EMbo added ...
  202. mov f82 = f1 // M = 1.0
  203. nop 999;; // EMbo added ...
  204. } { .mmi
  205. nop 999;; // EMbo added ...
  206. ldfs f78 = [r39],180 // f78 = 2**(-3); r39 moves from offset 12 to 192 (Q_4)
  207. nop 999;; // EMbo added ...
  208. } { .mmi
  209. getf.exp r36 = f33;;
  210. //
  211. // Get exp and sign of ArgX
  212. // Get exp and sign of ArgY
  213. // Load 2**(-3) and increment ptr to Q_4.
  214. //
  215. getf.exp r37 = f32
  216. shr.u r36 = r36,17;; // r36 = sign_X (exponent field shifted out)
  217. } { .mfi
  218. nop 999 // EMbo added ...
  219. fmerge.s f84 = f1,f32 // f84 = ArgY_abs (sign taken from +1.0)
  220. shr.u r37 = r37,17;; // r37 = sign_Y
  221. } { .mfi
  222. nop 999 // EMbo added ...
  223. //
  224. // ArgX_abs = |ArgX|
  225. // ArgY_abs = |ArgY|
  226. // sign_X is sign bit of ArgX
  227. // sign_Y is sign bit of ArgY
  228. //
  229. fmerge.s f83 = f1,f33 // f83 = ArgX_abs
  230. cmp.eq.unc p8,p9 = 0x00000, r37;; // p8: sign_Y == 0, p9: sign_Y != 0
  231. } { .mfi
  232. nop 999 // EMbo added ...
  233. (p8) fadd.s1 f34 = f0, f1 // s_Y = +1.0
  234. nop 999;; // EMbo added ...
  235. } { .mfi
  236. nop 999 // EMbo added ...
  237. (p9) fsub.s1 f34 = f0, f1 // s_Y = -1.0
  238. nop 999;; // EMbo added ...
  239. } { .mfi
  240. nop 999 // EMbo added ...
  241. fmin.s1 f36 = f83, f84 // V
  242. nop 999 // EMbo added ...
  243. } { .mfi
  244. nop 999 // EMbo added ...
  245. fmax.s1 f35 = f83, f84 // U
  246. nop 999;; // EMbo added ...
  247. } { .mfi
  248. nop 999 // EMbo added ...
  249. //
  250. // Is ArgX_abs >= ArgY_abs
  251. // Is sign_Y == 0?
  252. //
  253. fcmp.ge.s1 p6,p7 = f83,f84
  254. nop 999;; // EMbo added ...
  255. } { .mii
  256. (p6) cmp.eq.unc p10, p11 = 0x00000, r36 // no swap: p10 if sign_X == 0, else p11
  257. (p6) add r38 = r0, r0;;
  258. //
  259. // U = max(ArgX_abs,ArgY_abs)
  260. // V = min(ArgX_abs,ArgY_abs)
  261. // if p6, swap = 0
  262. // if p7, swap = 1
  263. //
  264. //
  265. // Let M = 1.0
  266. // if p8, s_Y = 1.0
  267. // if p9, s_Y = -1.0
  268. //
  269. (p7) add r38 = 1,r0;;
  270. } { .mfi
  271. nop 999 // EMbo added ...
  272. frcpa.s1 f37, p6 = f36, f35 // E in f37; p6 decides the branch below
  273. nop 999;; // EMbo added ...
  274. } { .mfb
  275. nop 999 // EMbo added ...
  276. //
  277. // E = frcpa(V,U)
  278. //
  279. (p10) fsub.s1 f82 = f82, f1 // M = 0.0 when no swap and sign_X == 0
  280. (p6) br.cond.sptk ATAN_STEP2;;
  281. } { .mib
  282. nop 999 // EMbo added ...
  283. nop 999 // EMbo added ...
  284. // /**************************************************/
  285. // /********************* STEP2 **********************/
  286. // /**************************************************/
  287. br.cond.spnt ATAN_SPECIAL_HANDLING;; // taken only when frcpa left p6 clear
  288. }
  289. ATAN_STEP2:
  // Table-driven path.  Q = E*V is formed and Q < 2**(-3) diverts to ATAN_POLY.
  // Otherwise z_hi = 1.b_1 b_2 b_3 b_4 1 (with Q's sign and exponent) selects a
  // Tbl_hi/Tbl_lo row, w = V_prime / U_prime_hi is built with a three-step
  // refined reciprocal, and a Q_1..Q_4 polynomial in wsq corrects the table value.
  // Leaves Z_hi in f56 and Z_lo in f55, then branches to RETURN_ATAN.
  290. { .mlx
  291. nop 999 // EMbo added ...
  292. movl r47 = 0x8400000000000000 // significand template: leading 1 and bit 58 set
  293. } { .mlx
  294. nop 999 // EMbo added ...
  295. movl r48 = 0x0000000000000100;; // 256, multiplier for the table group offset
  296. } { .mfi
  297. nop 999 // EMbo added ...
  298. fmpy.s1 f38 = f37, f36
  299. nop 999 // EMbo added ...
  300. } { .mfi
  301. nop 999 // EMbo added ...
  302. fcmp.lt.unc.s0 p0,p9 = f9,f1
  303. nop 999;; // EMbo added ...
  304. } { .mfi
  305. nop 999 // EMbo added ...
  306. fcmp.lt.unc.s0 p0,p8 = f8,f1
  307. nop 999 // EMbo added ...
  308. } { .mfi
  309. nop 999 // EMbo added ...
  310. //
  311. // Q = E * V
  312. //
  313. (p11) fadd.s1 f82 = f82, f1 // M = 2.0 when no swap and sign_X != 0
  314. nop 999;; // EMbo added ...
  315. } { .mfi
  316. getf.sig r46 = f38
  317. fcmp.lt.unc p6,p7 = f38,f78
  318. nop 999;; // EMbo added ...
  319. } { .mfi
  320. nop 999 // EMbo added ...
  321. fmpy.s1 f38 = f37, f36
  322. extr.u r42 = r46, 59, 4;; // r42 = lookup = 4 significand bits starting at bit 59
  323. } { .mfi
  324. nop 999 // EMbo added ...
  325. fmpy.s1 f50 = f82, f50 // P_hi = M * pi/2
  326. dep r47 = r42, r47, 59, 4
  327. } { .mfi
  328. nop 999 // EMbo added ...
  329. fmpy.s1 f51 = f82, f51 // P_lo = M * lo_pi/2
  330. nop 999;; // EMbo added ...
  331. } { .mmi
  332. nop 999;; // EMbo added ...
  333. //
  334. // Is Q < 2**(-3)?
  335. //
  336. //
  337. // Do fcmp to raise any denormal operand
  338. // exceptions.
  339. //
  340. getf.exp r45 = f38
  341. nop 999;; // EMbo added ...
  342. } { .mib
  343. //
  344. // lookup = b_1 b_2 b_3 b_4
  345. //
  346. //
  347. // Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
  348. //
  349. andcm r41 = 0x0003, r45 // r41 = k = 0x3 & ~r45 (inverted low two exponent bits)
  350. nop 999 // EMbo added ...
  351. //
  352. // We waited a few extra cycles so P_lo and P_hi could be calculated.
  353. // Load the constant 256 for loading up table entries.
  354. //
  355. // /**************************************************/
  356. // /********************* STEP3 **********************/
  357. // /**************************************************/
  358. (p6) br.cond.spnt ATAN_POLY;;
  359. } { .mii
  360. setf.sig f39 = r47
  361. cmp.eq.unc p8, p9 = 0x0000, r41 // p8: k == 0 (use the single B = 1+1/32 row)
  362. //
  363. // z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
  364. // point to beginning of Tbl_hi entries - k = 0.
  365. //
  366. add r40 = 16, r39 // r39 is at Q_4 (offset 192), so r40 = offset 208
  367. } { .mmi
  368. ldfe f73 = [r39],-16;; // f73 = Q_4
  369. (p9) sub r41 = r41,r0,1 // r41 = k - 1
  370. (p9) add r40 = 16,r40 // skip the k = 0 row: r40 = offset 224
  371. } { .mfi
  372. (p8) ldfd f48 = [r40],8
  373. fmpy.s1 f50 = f34, f50
  374. xor r38 = r36,r38;;
  375. } { .mmi
  376. ldfe f71 = [r39],-16;; // f71 = Q_3
  377. (p8) ldfs f49 = [r40],8
  378. (p9) pmpy2.r r41 = r41,r48;; // r41 = (k - 1) * 256
  379. } { .mfi
  380. ldfe f69 = [r39],-16 // f69 = Q_2
  381. //
  382. // Let z_hi have exponent and sign of original Q
  383. // Load the Tbl_hi(0) else, increment pointer.
  384. //
  385. fmerge.se f39 = f38,f39
  386. (p9) shladd r42 = r42,0x0004,r41;; // r42 = lookup * 16 + (k - 1) * 256
  387. } { .mmi
  388. (p9) add r40 = r40, r42;;
  389. (p9) ldfd f48 = [r40],8
  390. nop 999;; // EMbo added ...
  391. } { .mmi
  392. ldfe f67 = [r39],-16;; // f67 = Q_1
  393. (p9) ldfs f49 = [r40],8
  394. nop 999 // EMbo added ...
  395. } { .mfi
  396. nop 999 // EMbo added ...
  397. //
  398. // U_prime_hi = U + V * z_hi
  399. // Load the Tbl_lo(0)
  400. //
  401. fma.s1 f40 = f36, f39, f35
  402. nop 999;; // EMbo added ...
  403. } { .mfi
  404. nop 999 // EMbo added ...
  405. fnma.s1 f42 = f35, f39, f36 // V_prime = V - U * z_hi
  406. nop 999 // EMbo added ...
  407. } { .mfi
  408. nop 999 // EMbo added ...
  409. mov f52 = f48 // A_hi = Tbl_hi
  410. nop 999;; // EMbo added ...
  411. } { .mfi
  412. nop 999 // EMbo added ...
  413. frcpa.s1 f43, p6 = f1, f40
  414. nop 999;; // EMbo added ...
  415. } { .mfi
  416. nop 999 // EMbo added ...
  417. //
  418. // U_prime_lo = U - U_prime_hi
  419. // k = k * 256 - result can be 0, 256, or 512.
  420. //
  421. fsub.s1 f41 = f35, f40
  422. cmp.eq.unc p7, p6 = 0x00000, r38 // r38 = swap xor sign_X; p7 if zero, p6 otherwise
  423. } { .mfi
  424. nop 999 // EMbo added ...
  425. fmpy.s1 f52 = f34, f52
  426. nop 999;; // EMbo added ...
  427. } { .mfi
  428. nop 999 // EMbo added ...
  429. (p7) fadd.s1 f54 = f0, f1 // sigma = +1.0
  430. nop 999;; // EMbo added ...
  431. } { .mfi
  432. nop 999 // EMbo added ...
  433. (p6) fsub.s1 f54 = f0, f1 // sigma = -1.0
  434. nop 999;; // EMbo added ...
  435. } { .mfi
  436. nop 999 // EMbo added ...
  437. fnma.s1 f80 = f43, f40, f1
  438. nop 999;; // EMbo added ...
  439. } { .mfi
  440. nop 999 // EMbo added ...
  441. fadd.s1 f79 = f41, f40
  442. nop 999 // EMbo added ...
  443. } { .mfi
  444. nop 999 // EMbo added ...
  445. fma.s1 f41 = f36, f39, f41
  446. nop 999;; // EMbo added ...
  447. } { .mfi
  448. nop 999 // EMbo added ...
  449. fma.s1 f56 = f54, f52, f50 // Z_hi = P_hi + sigma * A_hi
  450. nop 999;; // EMbo added ...
  451. } { .mfi
  452. nop 999 // EMbo added ...
  453. fma.s1 f43 = f80, f43, f43
  454. nop 999;; // EMbo added ...
  455. } { .mfi
  456. nop 999 // EMbo added ...
  457. //
  458. // U_prime_lo = U - U_hold
  459. // lookup -> lookup * 16 + k
  460. //
  461. //
  462. // V_prime = V - U * z_hi
  463. // U_prime_lo = V * z_hi + U_prime_lo
  464. //
  465. fsub.s1 f79 = f35, f79
  466. nop 999;; // EMbo added ...
  467. } { .mfi
  468. nop 999 // EMbo added ...
  469. fnma.s1 f80 = f43, f40, f1
  470. nop 999;; // EMbo added ...
  471. } { .mfi
  472. nop 999 // EMbo added ...
  473. //
  474. // C_hi = frcpa(1,U_prime_hi)
  475. // U_prime_lo = U_prime_lo + U_hold
  476. //
  477. //
  478. // C_hi_hold = 1 - C_hi * U_prime_hi (1)
  479. //
  480. //
  481. // C_hi = C_hi + C_hi * C_hi_hold (1)
  482. //
  483. //
  484. // C_hi_hold = 1 - C_hi * U_prime_hi (2)
  485. //
  486. fadd.s1 f41 = f41, f79
  487. nop 999;; // EMbo added ...
  488. } { .mfi
  489. nop 999 // EMbo added ...
  490. //
  491. // C_hi = C_hi + C_hi * C_hi_hold (2)
  492. //
  493. fma.s1 f43 = f80, f43, f43
  494. nop 999;; // EMbo added ...
  495. } { .mfi
  496. nop 999 // EMbo added ...
  497. //
  498. // C_hi_hold = 1 - C_hi * U_prime_hi (3)
  499. //
  500. fnma.s1 f80 = f43, f40, f1
  501. nop 999;; // EMbo added ...
  502. } { .mfi
  503. nop 999 // EMbo added ...
  504. //
  505. // C_hi = C_hi + C_hi * C_hi_hold (3)
  506. //
  507. fma.s1 f43 = f80, f43, f43
  508. nop 999;; // EMbo added ...
  509. } { .mfi
  510. nop 999 // EMbo added ...
  511. //
  512. // w_hi = V_prime * C_hi
  513. //
  514. fmpy.s1 f44 = f42, f43
  515. nop 999;; // EMbo added ...
  516. } { .mfi
  517. nop 999 // EMbo added ...
  518. fmpy.s1 f46 = f44, f44
  519. nop 999 // EMbo added ...
  520. } { .mfi
  521. nop 999 // EMbo added ...
  522. //
  523. // wsq = w_hi * w_hi
  524. // w_lo = V_prime - w_hi * U_prime_hi
  525. //
  526. fnma.s1 f45 = f44, f40, f42
  527. nop 999;; // EMbo added ...
  528. } { .mfi
  529. nop 999 // EMbo added ...
  530. fma.s1 f47 = f46, f73, f71
  531. nop 999 // EMbo added ...
  532. } { .mfi
  533. nop 999 // EMbo added ...
  534. //
  535. // poly = Q_3 + wsq * Q_4
  536. // w_lo = w_lo - w_hi * U_prime_lo
  537. //
  538. fnma.s1 f45 = f44, f41, f45
  539. nop 999;; // EMbo added ...
  540. } { .mfi
  541. nop 999 // EMbo added ...
  542. fma.s1 f47 = f46, f47, f69
  543. nop 999 // EMbo added ...
  544. } { .mfi
  545. nop 999 // EMbo added ...
  546. //
  547. // poly = Q_2 + wsq * poly
  548. // w_lo = w_lo * C_hi
  549. //
  550. fmpy.s1 f45 = f43, f45
  551. nop 999;; // EMbo added ...
  552. } { .mfi
  553. nop 999 // EMbo added ...
  554. fma.s1 f47 = f46, f47, f67
  555. nop 999 // EMbo added ...
  556. } { .mfi
  557. nop 999 // EMbo added ...
  558. //
  559. // poly = Q_1 + wsq * poly
  560. // A_lo = Tbl_lo + w_lo
  561. // swap = xor(swap,sign_X)
  562. //
  563. fadd.s1 f53 = f49, f45
  564. nop 999;; // EMbo added ...
  565. } { .mfi
  566. nop 999 // EMbo added ...
  567. //
  568. // Is (swap) != 0 ?
  569. // poly = wsq * poly
  570. // A_hi = Tbl_hi
  571. //
  572. fmpy.s1 f47 = f46, f47
  573. nop 999;; // EMbo added ...
  574. } { .mfi
  575. nop 999 // EMbo added ...
  576. //
  577. // poly = wsq * poly
  578. //
  579. //
  580. // if (p6) sigma = -1.0
  581. // if (p7) sigma = 1.0
  582. //
  583. fmpy.s1 f47 = f44, f47 // poly = w_hi * poly
  584. nop 999;; // EMbo added ...
  585. } { .mfi
  586. nop 999 // EMbo added ...
  587. //
  588. // P_hi = s_Y * P_hi
  589. // A_lo = A_lo + poly
  590. //
  591. fadd.s1 f53 = f53, f47
  592. nop 999;; // EMbo added ...
  593. } { .mfi
  594. nop 999 // EMbo added ...
  595. //
  596. // A_lo = A_lo + w_hi
  597. // A_hi = s_Y * A_hi
  598. //
  599. fadd.s1 f53 = f53, f44
  600. nop 999;; // EMbo added ...
  601. } { .mfb
  602. nop 999 // EMbo added ...
  603. //
  604. // result_hi = P_hi + sigma * A_hi
  605. // result_lo = P_lo + sigma * A_lo
  606. //
  607. fma.s1 f55 = f54, f53, f51 // Z_lo; s_Y is not applied here, f34 is returned separately
  608. br.cond.sptk RETURN_ATAN;;
  609. }
  610. //
  611. // result = result_hi + result_lo * s_Y (User Supplied Rounding Mode)
  612. //
  613. // fma.d.s0 f57 = f55, f34, f56
  614. //
  615. // /**************************************************/
  616. // /********************* STEP4 **********************/
  617. // /**************************************************/
  618. //
  619. ATAN_POLY:
  // Small-quotient path, entered from ATAN_STEP2 when Q < 2**(-3); no table lookup.
  // Reloads the table base, refines E with three E = E + E*(1 - E*U) steps,
  // forms z = V*E and evaluates the P_1..P_8 polynomial in zsq = z*z.
  // Leaves Z_hi in f56 and Z_lo in f55, and sets f34 (s_Y) to 1.0 before
  // branching to RETURN_ATAN because s_Y is already folded into both parts.
  620. { .mmi
  621. xor r38 = r36,r38 // r38 = swap xor sign_X
  622. addl r39 = @ltoff(Constants_atan#), gp
  623. nop.i 999
  624. }
  625. ;;
  626. { .mmi
  627. ld8 r39 = [r39] // r39 = address of Constants_atan
  628. nop.m 999
  629. nop.i 999
  630. }
  631. ;;
  632. { .mlx
  633. nop 999 // EMbo added ...
  634. movl r47 = 0x24005;; // sign+exponent pattern for the A_lo == 0 fixup (setf.exp below)
  635. } { .mfi
  636. add r39 = 128, r39 // r39 -> P_8 (table offset 128)
  637. fnma.s1 f81 = f37, f35, f1
  638. cmp.eq.unc p7, p6 = 0x00000, r38;;
  639. } { .mmf
  640. nop 999 // EMbo added ...
  641. ldfe f77 = [r39],-16 // f77 = P_8
  642. //
  643. // Iterate 3 times E = E + E*(1.0 - E*U)
  644. // Also load P_8, P_7, P_6, P_5, P_4
  645. // E_hold = 1.0 - E * U (1)
  646. // A_temp = Q
  647. //
  648. mov f85 = f38;;
  649. } { .mmf
  650. nop 999 // EMbo added ...
  651. ldfe f76 = [r39],-16 // f76 = P_7
  652. (p6) fsub.s1 f54 = f0, f1;; // sigma = -1.0
  653. } { .mmf
  654. nop 999 // EMbo added ...
  655. ldfe f75 = [r39],-16 // f75 = P_6
  656. //
  657. // E = E + E_hold*E (1)
  658. // Point to P_8.
  659. //
  660. fma.s1 f37 = f37, f81, f37;;
  661. } { .mmf
  662. nop 999 // EMbo added ...
  663. ldfe f74 = [r39],-16 // f74 = P_5
  664. fnma.s1 f64 = f85, f35, f36;; // z_lo = V - A_temp * U
  665. } { .mmf
  666. nop 999 // EMbo added ...
  667. ldfe f72 = [r39],-16 // f72 = P_4
  668. (p7) fadd.s1 f54 = f0, f1;; // sigma = +1.0
  669. } { .mmf
  670. nop 999 // EMbo added ...
  671. ldfe f70 = [r39],-16 // f70 = P_3
  672. //
  673. // E_hold = 1.0 - E * U (2)
  674. //
  675. fnma.s1 f81 = f37, f35, f1;;
  676. } { .mmf
  677. nop 999 // EMbo added ...
  678. ldfe f68 = [r39],-16 // f68 = P_2
  679. fmpy.s1 f50 = f34, f50;; // P_hi = s_Y * P_hi
  680. } { .mmf
  681. nop 999 // EMbo added ...
  682. ldfe f66 = [r39],-16 // f66 = P_1
  // NOTE(review): f67 is not written on this path before the multiply below and
  // its product is not read afterwards in this block - purpose not evident; confirm.
  683. fmpy.d.s0 f67 = f67, f67
  684. } { .mfi
  685. nop 999 // EMbo added ...
  686. //
  687. // E = E + E_hold*E (2)
  688. //
  689. fma.s1 f37 = f37, f81, f37
  690. nop 999;; // EMbo added ...
  691. } { .mfi
  692. nop 999 // EMbo added ...
  693. //
  694. // E_hold = 1.0 - E * U (3)
  695. //
  696. fnma.s1 f81 = f37, f35, f1
  697. nop 999;; // EMbo added ...
  698. } { .mfi
  699. nop 999 // EMbo added ...
  700. //
  701. // E = E + E_hold*E (3)
  702. // At this point E approximates 1/U to roughly working precision
  703. // z = V*E approximates V/U
  704. //
  705. fma.s1 f37 = f37, f81, f37
  706. nop 999;; // EMbo added ...
  707. } { .mfi
  708. nop 999 // EMbo added ...
  709. //
  710. // z = V * E
  711. //
  712. fmpy.s1 f59 = f36, f37
  713. nop 999 // EMbo added ...
  714. } { .mfi
  715. nop 999 // EMbo added ...
  716. fmpy.s1 f64 = f64, f37 // z_lo = z_lo * E
  717. nop 999;; // EMbo added ...
  718. } { .mfi
  719. nop 999 // EMbo added ...
  720. //
  721. // zsq = z * z
  722. // Also load P_3
  723. //
  724. fmpy.s1 f60 = f59, f59
  725. nop 999 // EMbo added ...
  726. } { .mfi
  727. nop 999 // EMbo added ...
  728. fadd.s1 f52 = f85, f64 // A_hi = A_temp + z_lo
  729. nop 999;; // EMbo added ...
  730. } { .mfi
  731. nop 999 // EMbo added ...
  732. fma.s1 f62 = f60, f77, f76 // poly1 = P_7 + zsq * P_8
  733. nop 999 // EMbo added ...
  734. } { .mfi
  735. nop 999 // EMbo added ...
  736. fma.s1 f63 = f60, f70, f68 // poly2 = P_2 + zsq * P_3
  737. nop 999;; // EMbo added ...
  738. } { .mfi
  739. nop 999 // EMbo added ...
  740. //
  741. // z8 = zsq * zsq
  742. // Also load P_2
  743. //
  744. fmpy.s1 f61 = f60, f60
  745. nop 999 // EMbo added ...
  746. } { .mfi
  747. nop 999 // EMbo added ...
  748. fsub.s1 f85 = f85, f52 // A_temp - A_hi
  749. nop 999;; // EMbo added ...
  750. } { .mfi
  751. nop 999 // EMbo added ...
  752. fmerge.s f65 = f52,f52 // f65 = copy of A_hi
  753. nop 999;; // EMbo added ...
  754. } { .mfi
  755. nop 999 // EMbo added ...
  756. fma.s1 f62 = f60, f62, f75 // poly1 = P_6 + zsq * poly1
  757. nop 999 // EMbo added ...
  758. } { .mfi
  759. nop 999 // EMbo added ...
  760. fma.s1 f63 = f60, f63, f66 // poly2 = P_1 + zsq * poly2
  761. nop 999;; // EMbo added ...
  762. } { .mfi
  763. nop 999 // EMbo added ...
  764. //
  765. // z8 = z8 * z8
  766. // Also load P_1
  767. // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
  768. // poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
  769. //
  770. //
  771. // poly1 = P_7 + zsq * P_8
  772. // poly2 = P_2 + zsq * P_3
  773. // poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*poly1))
  774. // poly2 = zsq*(P_1 + zsq*poly2)
  775. //
  776. //
  777. // poly1 = P_6 + zsq * poly1
  778. // poly2 = P_1 + zsq * poly2
  779. // poly1 = P_4 + zsq*(P_5 + zsq*poly1)
  780. // poly2 = zsq*poly2
  781. //
  782. fmpy.s1 f61 = f61, f61
  783. nop 999 // EMbo added ...
  784. } { .mfi
  785. nop 999 // EMbo added ...
  786. fadd.s1 f64 = f85, f64 // z_lo = (A_temp - A_hi) + z_lo
  787. nop 999;; // EMbo added ...
  788. } { .mfi
  789. nop 999 // EMbo added ...
  790. fma.s1 f62 = f60, f62, f74
  791. nop 999 // EMbo added ...
  792. } { .mfi
  793. nop 999 // EMbo added ...
  794. //
  795. // poly1 = P_5 + zsq * poly1
  796. // poly2 = zsq * poly2
  797. // poly1 = P_4 + zsq*poly1
  798. //
  799. fmpy.s1 f63 = f63, f60
  800. nop 999;; // EMbo added ...
  801. } { .mfi
  802. nop 999 // EMbo added ...
  803. //
  804. // poly1 = P_4 + zsq * poly1
  805. // swap = xor(swap,sign_X)
  806. //
  807. fma.s1 f62 = f60, f62, f72
  808. nop 999;; // EMbo added ...
  809. } { .mfi
  810. nop 999 // EMbo added ...
  811. //
  812. // poly = z8*poly1 + poly2 (Typo in writeup)
  813. // Is (swap) != 0 ?
  814. //
  815. //
  816. // z_lo = V - A_temp * U
  817. // if (p7) sigma = 1.0
  818. // Writeup shows A_temp as A_hi
  819. //
  820. //
  821. // z_lo = z_lo * E
  822. // if (p6) sigma = -1.0
  823. // z_lo = (V - A_temp * U) *E
  824. //
  825. //
  826. // Fixup added to force inexact later -
  827. // A_hi = A_temp + z_lo
  828. // z_lo = (A_temp - A_hi) + z_lo
  829. // z_lo = A_hi - z_lo -A_hi + z_lo = about 0
  830. //
  831. fma.s1 f47 = f61, f62, f63
  832. nop 999;; // EMbo added ...
  833. } { .mfi
  834. nop 999 // EMbo added ...
  835. //
  836. // A_lo = z * poly + z_lo
  837. //
  838. fma.s1 f53 = f59, f47, f64
  839. nop 999;; // EMbo added ...
  840. } { .mfi
  841. nop 999 // EMbo added ...
  842. fadd.s1 f52 = f65, f53
  843. nop 999;; // EMbo added ...
  844. } { .mfi
  845. nop 999 // EMbo added ...
  846. fsub.s1 f65 = f65, f52
  847. nop 999 // EMbo added ...
  848. } { .mfi
  849. nop 999 // EMbo added ...
  850. fmpy.s1 f52 = f34, f52 // A_hi = s_Y * A_hi
  851. nop 999;; // EMbo added ...
  852. } { .mfi
  853. nop 999 // EMbo added ...
  854. fadd.s1 f53 = f65, f53
  855. nop 999 // EMbo added ...
  856. } { .mfi
  857. setf.exp f65 = r47 // f65 = fixup value built from 0x24005
  858. fma.s1 f56 = f54, f52, f50 // result_hi = P_hi + sigma * A_hi
  859. nop 999;; // EMbo added ...
  860. } { .mfi
  861. nop 999 // EMbo added ...
  862. fclass.m.unc p6,p0 = f53,0x007 // p6: A_lo is zero
  863. nop 999;; // EMbo added ...
  864. } { .mfi
  865. nop 999 // EMbo added ...
  866. //
  867. // P_hi = s_Y * P_hi
  868. // A_hi = s_Y * A_hi
  869. //
  870. //
  871. // result_hi = P_hi + sigma * A_hi
  872. //
  873. (p6) mov f53 = f65 // replace a zero A_lo with the fixup value
  874. nop 999 // EMbo added ...
  875. } { .mfi
  876. nop 999 // EMbo added ...
  877. //
  878. // tmp = P_hi - result_hi
  879. //
  880. fsub.s1 f65 = f50, f56
  881. nop 999;; // EMbo added ...
  882. } { .mfi
  883. nop 999 // EMbo added ...
  884. fma.s1 f65 = f52, f54, f65
  885. nop 999 // EMbo added ...
  886. } { .mfi
  887. nop 999 // EMbo added ...
  888. //
  889. // tmp = sigma * A_hi + tmp
  890. // sigma = A_lo * sigma + P_lo
  891. //
  892. fma.s1 f54 = f53, f54, f51
  893. nop 999;; // EMbo added ...
  894. } { .mfi
  895. nop 999 // EMbo added ...
  896. //
  897. // result_lo = s_Y * sigma + tmp
  898. //
  899. fma.s1 f55 = f34, f54, f65
  900. nop 999;; // EMbo added ...
  901. } { .mfb
  902. nop.m 0
  903. mov f34 = f1 // s_Y already applied above, so return 1.0 as s_Y
  904. br.cond.sptk RETURN_ATAN;;
  905. }
  906. //
  907. // result = result_hi + result_lo (User Supplied Rounding Mode)
  908. //
  909. // fadd.d.s0 f57 = f55, f56
  910. ATAN_UNSUPPORTED:
  911. ATAN_NATVAL:
  // Shared exit for NatVal and unsupported operands: multiply the raw inputs
  // so the hardware produces the result and flags, then return.
  // NOTE(review): only f57 is written here, while RETURN_ATAN copies
  // f56/f55/f34 into the output registers - confirm the caller's expectation.
  912. { .mfb
  913. nop 999 // EMbo added ...
  914. //
  915. // Deal with the NatVal and unsupported cases.
  916. // Raise invalid if warranted.
  917. //
  918. fmpy.d.s0 f57 = f8, f9
  919. br.cond.sptk RETURN_ATAN;;
  920. }
  921. ATAN_NAN:
  // Exit for a NaN in either operand: the product of the raw inputs yields
  // the NaN result in f57.
  // NOTE(review): only f57 is written here, while RETURN_ATAN copies
  // f56/f55/f34 into the output registers - confirm the caller's expectation.
  922. { .mfb
  923. nop 999 // EMbo added ...
  924. //
  925. // If only one NaN, then generate the resulting
  926. // NaN and return - may raise invalid.
  927. //
  928. fmpy.d.s0 f57 = f8, f9
  929. br.cond.sptk RETURN_ATAN;;
  930. }
  931. ATAN_SPECIAL_HANDLING:
  // Reached when the entry frcpa left its predicate clear.  Points r39 at the
  // "I" row (table offset 992) and handles ArgY = +/-0 here; any other ArgY
  // continues at ATAN_ArgY_Not_ZERO.  Results are built as a pair f56/f55.
  932. { .mmf
  933. addl r39 = @ltoff(Constants_atan#), gp
  934. nop.m 999
  935. fcmp.lt.s0 p0,p7 = f8,f1
  936. }
  937. ;;
  938. //
  939. // Raise denormal operand faults if necessary
  940. //
  941. { .mfi
  942. ld8 r39 = [r39] // r39 = address of Constants_atan
  943. fcmp.lt.s0 p0,p6 = f9,f1
  944. nop 999;; // EMbo added ...
  945. }
  946. ;;
  947. { .mfi
  948. nop 999 // EMbo added ...
  949. fclass.m.unc p6,p7 = f32,0x007 // p6: ArgY is zero, p7: ArgY is not zero
  950. nop 999;; // EMbo added ...
  951. } { .mlx
  952. nop 999 // EMbo added ...
  953. movl r47 = 992;; // byte offset of the "I" row
  954. } { .mib
  955. add r39 = r39, r47
  956. nop 999 // EMbo added ...
  957. (p7) br.cond.sptk ATAN_ArgY_Not_ZERO;;
  958. } { .mfi
  959. nop 999 // EMbo added ...
  960. (p6) fclass.m.unc p14,p0 = f33,0x035 // p14: ArgX in class mask 0x035 (sign_Y * 0 case)
  961. nop 999 // EMbo added ...
  962. } { .mfi
  963. nop 999 // EMbo added ...
  964. (p6) fclass.m.unc p15,p0 = f33,0x036 // p15: ArgX in class mask 0x036 (sign_Y * I case)
  965. nop 999;; // EMbo added ...
  966. } { .mfi
  967. nop 999 // EMbo added ...
  968. (p6) fclass.m.unc p13,p0 = f33,0x007 // p13: ArgX is zero; not read again in this block
  969. nop 999 // EMbo added ...
  970. } { .mfi
  971. ldfd f56 = [r39],8 // f56 = first double of I
  972. nop 999 // EMbo added ...
  973. nop 999;; // EMbo added ...
  974. } { .mfi
  975. ldfd f55 = [r39],-8 // f55 = second double of I; r39 back at offset 992
  976. (p14) fmerge.s f56 = f32,f0 // f56 = zero carrying ArgY's sign
  977. nop 999;; // EMbo added ...
  978. } { .mfi
  979. nop 999 // EMbo added ...
  980. //
  981. // Return sign_Y * 0 when Y = +/-0 and X > 0
  982. //
  983. (p14) fmerge.s f55 = f32,f0
  984. nop 999;; // EMbo added ...
  985. } { .mfi
  986. nop 999 // EMbo added ...
  987. (p15) fmerge.s f56 = f32,f56 // give the loaded constant ArgY's sign
  988. nop 999;; // EMbo added ...
  989. } { .mfi
  990. nop 999 // EMbo added ...
  991. //
  992. // Return sign_Y * PI when X < -0
  993. //
  994. //
  995. (p15) fmerge.s f55 = f32,f55
  996. nop 999;; // EMbo added ...
  997. } { .mfi
  998. nop 999 // EMbo added ...
  999. fadd.d.s0 f57 = f56,f55
  1000. nop.i 0
  1001. } { .bbb
  1002. //
  1003. // Call error support function for atan(0,0)
  1004. // - expected value already computed.
  1005. //
  // NOTE(review): no call is made here; the code only branches to RETURN_ATAN.
  1006. nop.b 0
  1007. nop.b 0
  1008. br.cond.sptk RETURN_ATAN
  1009. }
  1010. ATAN_ArgY_Not_ZERO:
  // ArgY is non-zero.  If ArgY is +/-Inf (p9) pick I/2, I/4 or 3I/4 according
  // to ArgX, give it ArgY's sign and return; otherwise (p10) continue at
  // ATAN_ArgY_Not_INF.  On entry r39 points at the "I" row (offset 992).
  1011. { .mfi
  1012. nop 999 // EMbo added ...
  1013. fclass.m.unc p9,p10 = f32,0x023 // p9: ArgY is +/-Inf
  1014. nop 999;; // EMbo added ...
  1015. } { .mfb
  1016. nop 999 // EMbo added ...
  1017. (p9) fclass.m.unc p6,p0 = f33,0x017 // p6: ArgX in class mask 0x017 (+/-0, normal per comment below)
  1018. (p10) br.cond.sptk ATAN_ArgY_Not_INF;;
  1019. } { .mfi
  1020. (p6) add r39 = 16,r39 // -> I_by_2 row
  1021. (p9) fclass.m.unc p7,p0 = f33,0x021 // p7: ArgX is +Inf
  1022. nop 999;; // EMbo added ...
  1023. } { .mmf
  1024. nop 999 // EMbo added ...
  1025. ldfd f56 = [r39],8
  1026. (p9) fclass.m.unc p8,p0 = f33,0x022;; // p8: ArgX is -Inf
  1027. } { .mbb
  1028. ldfd f55 = [r39],-8
  1029. nop 999 // EMbo added ...
  1030. nop 999;; // EMbo added ...
  1031. } { .mfi
  1032. nop 999 // EMbo added ...
  1033. (p6) fmerge.s f56 = f32,f56
  1034. nop 999;; // EMbo added ...
  1035. } { .mfi
  1036. nop 999 // EMbo added ...
  1037. (p6) fmerge.s f55 = f32,f55
  1038. nop 999;; // EMbo added ...
  1039. } { .mfb
  1040. nop 999 // EMbo added ...
  1041. //
  1042. // Load I/2 and adjust its sign.
  1043. // Return +I/2 when ArgY = +Inf and ArgX = +/-0,normal
  1044. // Return -I/2 when ArgY = -Inf and ArgX = +/-0,normal
  1045. //
  1046. (p6) fadd.d.s0 f57 = f56, f55
  1047. (p6) br.cond.sptk RETURN_ATAN;;
  1048. } { .mmi
  1049. (p7) add r39 = 32,r39;; // p6 was clear, so r39 moves from I to the I_by_4 row
  1050. (p7) ldfd f56 = [r39],8
  1051. nop 999;; // EMbo added ...
  1052. } { .mmi
  1053. nop 999;; // EMbo added ...
  1054. (p7) ldfd f55 = [r39],-8
  1055. nop 999;; // EMbo added ...
  1056. } { .mfi
  1057. nop 999 // EMbo added ...
  1058. (p7) fmerge.s f56 = f32,f56
  1059. nop 999;; // EMbo added ...
  1060. } { .mfi
  1061. nop 999 // EMbo added ...
  1062. (p7) fmerge.s f55 = f32,f55
  1063. nop 999;; // EMbo added ...
  1064. } { .mfb
  1065. nop 999 // EMbo added ...
  1066. //
  1067. // Load PI/4 and adjust its sign.
  1068. // Return +PI/4 when ArgY = +Inf and ArgX = +Inf
  1069. // Return -PI/4 when ArgY = -Inf and ArgX = +Inf
  1070. //
  1071. (p7) fadd.d.s0 f57 = f56, f55
  1072. (p7) br.cond.sptk RETURN_ATAN;;
  1073. } { .mmi
  1074. (p8) add r39 = 48,r39;; // r39 moves from I to the 3I_by_4 row
  1075. (p8) ldfd f56 =[r39],8
  1076. nop 999;; // EMbo added ...
  1077. } { .mmi
  1078. nop 999;; // EMbo added ...
  1079. (p8) ldfd f55 =[r39],-8
  1080. nop 999;; // EMbo added ...
  1081. } { .mfi
  1082. nop 999 // EMbo added ...
  1083. (p8) fmerge.s f56 = f32,f56
  1084. nop 999;; // EMbo added ...
  1085. } { .mfi
  1086. nop 999 // EMbo added ...
  1087. (p8) fmerge.s f55 = f32,f55
  1088. nop 999;; // EMbo added ...
  1089. } { .mfb
  1090. nop 999 // EMbo added ...
  1091. //
  1092. // Load 3I/4 and adjust its sign.
  1093. // Return +3I/4 when ArgY = +Inf and ArgX = -Inf
  1094. // Return -3I/4 when ArgY = -Inf and ArgX = -Inf
  1095. //
  1096. (p8) fadd.d.s0 f57 = f56, f55
  1097. (p8) br.cond.sptk RETURN_ATAN;;
  1098. }
  1099. ATAN_ArgY_Not_INF:
  // ArgY is neither zero nor infinite.  Handles ArgX = +/-0 (sign_Y * I/2),
  // ArgX = +Inf (sign_Y * 0) and ArgX = -Inf (sign_Y * I), then falls through
  // to RETURN_ATAN.  On entry r39 points at the "I" row (offset 992).
  1100. { .mfi
  1101. nop 999 // EMbo added ...
  1102. fclass.m.unc p6,p0 = f33,0x007 // p6: ArgX is +/-0
  1103. nop 999 // EMbo added ...
  1104. } { .mfi
  1105. nop 999 // EMbo added ...
  1106. fclass.m.unc p7,p0 = f33,0x021 // p7: ArgX is +Inf
  1107. nop 999;; // EMbo added ...
  1108. } { .mfi
  1109. nop 999 // EMbo added ...
  1110. fclass.m.unc p8,p0 = f33,0x022 // p8: ArgX is -Inf
  1111. (p6) add r39 = 16,r39;; // -> I_by_2 row
  1112. } { .mfi
  1113. (p6) ldfd f56 =[r39],8
  1114. nop 999 // EMbo added ...
  1115. nop 999;; // EMbo added ...
  1116. } { .mmi
  1117. nop 999;; // EMbo added ...
  1118. (p6) ldfd f55 =[r39],-8
  1119. nop 999;; // EMbo added ...
  1120. } { .mfi
  1121. nop 999 // EMbo added ...
  1122. (p6) fmerge.s f56 = f32,f56
  1123. nop 999;; // EMbo added ...
  1124. } { .mfi
  1125. nop 999 // EMbo added ...
  1126. (p6) fmerge.s f55 = f32,f55
  1127. nop 999;; // EMbo added ...
  1128. } { .mfb
  1129. nop 999 // EMbo added ...
  1130. //
  1131. // return = sign_Y * I/2 when ArgX = +/-0
  1132. //
  1133. (p6) fadd.d.s0 f57 = f56, f55
  1134. (p6) br.cond.sptk RETURN_ATAN;;
  1135. } { .mfi
  1136. nop 999 // EMbo added ...
  1137. (p7) fmerge.s f56 = f32,f0 // zero carrying ArgY's sign
  1138. nop 999 // EMbo added ...
  1139. } { .mfi
  1140. nop 999 // EMbo added ...
  1141. (p7) fmerge.s f55 = f32,f0
  1142. nop 999;; // EMbo added ...
  1143. } { .mfb
  1144. nop 999 // EMbo added ...
  1145. //
  1146. // return = sign_Y * 0 when ArgX = Inf
  1147. //
  1148. (p7) fadd.d.s0 f57 = f56, f55
  1149. (p7) br.cond.sptk RETURN_ATAN;;
  1150. } { .mfi
  1151. (p8) ldfd f56 = [r39],8 // r39 still at the "I" row here (p6 was clear)
  1152. nop 999 // EMbo added ...
  1153. nop 999;; // EMbo added ...
  1154. } { .mmi
  1155. nop 999;; // EMbo added ...
  1156. (p8) ldfd f55 = [r39],-8
  1157. nop 999;; // EMbo added ...
  1158. } { .mfi
  1159. nop 999 // EMbo added ...
  1160. (p8) fmerge.s f56 = f32,f56
  1161. nop 999;; // EMbo added ...
  1162. } { .mfi
  1163. nop 999 // EMbo added ...
  1164. (p8) fmerge.s f55 = f32,f55
  1165. nop 999;; // EMbo added ...
  1166. } { .mfi
  1167. nop 999 // EMbo added ...
  1168. //
  1169. // return = sign_Y * I when ArgX = -Inf
  1170. //
  1171. (p8) fadd.d.s0 f57 = f56, f55
  1172. nop 999 // EMbo added ...
  1173. };;
  1174. RETURN_ATAN:
  // Common exit: every path above branches or falls through to here.
  // Copies the result pieces into the output registers and returns via b0:
  //   f8 = Z_hi (f56), f10 = Z_lo (f55), f11 = s_Y (f34); f9 is left as passed in.
  // f57 is not copied anywhere by this block.
  1175. // mov f8 = f57 ;;
  1176. // The answer is in f57.
  1177. // But Z_hi is f56
  1178. // Z_lo is f55
  1179. // s_Y is f34
  1180. // W is in f9 and untouched
  1181. { .mfi
  1182. nop 999
  1183. mov f8 = f56
  1184. nop.i 0
  1185. };;
  1186. { .mfi
  1187. nop 999
  1188. mov f10 = f55
  1189. nop.i 999
  1190. }
  1191. { .mfb
  1192. nop 999
  1193. mov f11 = f34
  1194. br.ret.sptk b0
  1195. };;
  1196. .endp __libm_atan2_reg