Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

533 lines
14 KiB

  1. .file "atanf.s"
  2. // THIS IS NOT OPTIMIZED AND NOT OFFICIAL
  3. // Copyright (c) 2000, Intel Corporation
  4. // All rights reserved.
  5. //
  6. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  7. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  8. //
  9. // WARRANTY DISCLAIMER
  10. //
  11. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  12. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  13. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  14. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  15. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  16. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  17. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  18. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  19. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  20. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  21. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  22. //
  23. // Intel Corporation is the author of this code, and requests that all
  24. // problem reports or change requests be submitted to it directly at
  25. // http://developer.intel.com/opensource.
  26. // History
  27. //==============================================================
  28. // ?/??/00 Initial revision
  29. // 8/17/00 Changed predicate register macro-usage to direct predicate
  30. // names due to an assembler bug.
  31. //
  32. // Assembly macros
  33. //==============================================================
  34. // integer registers used
  35. EXP_Addr1 = r33 // pointer used to walk atanf_coeff_1_table
  36. EXP_Addr2 = r34 // pointer used to walk atanf_coeff_2_table
  37. // floating point registers used
  // Every alias below is bound exactly once, so the name -> register map can be
  // read top to bottom without tracking rebinding.
  // Coefficient registers (filled pairwise by ldfpd from the two tables).
  38. atanf_coeff_R4 = f32
  39. atanf_coeff_R5 = f33
  40. atanf_coeff_R1 = f34
  41. atanf_coeff_R2 = f35
  42. atanf_coeff_R3 = f36
  43. atanf_coeff_P1 = f37
  44. atanf_coeff_Q6 = f38
  45. atanf_coeff_Q7 = f39
  46. atanf_coeff_Q8 = f40
  47. atanf_coeff_Q9 = f41
  48. atanf_coeff_Q4 = f42
  49. atanf_coeff_Q5 = f43
  50. atanf_coeff_Q2 = f44
  51. atanf_coeff_Q3 = f45
  52. atanf_coeff_P5 = f46
  53. atanf_coeff_P6 = f47
  54. atanf_coeff_Q0 = f48
  55. atanf_coeff_Q1 = f49
  56. atanf_coeff_P7 = f50
  57. atanf_coeff_P8 = f51
  58. atanf_coeff_P3 = f52
  59. atanf_coeff_P4 = f53
  60. atanf_coeff_P9 = f54
  61. atanf_coeff_P10 = f55
  62. atanf_coeff_P2 = f56
  63. atanf_piby2 = f57
  // Working values derived from the argument in f8.
  64. atanf_z = f58
  65. atanf_b = f59
  66. atanf_zsq = f60
  67. atanf_sgn_x = f61
  68. atanf_sgnx_piby2 = f62
  69. atanf_abs_x = f63
  70. atanf_t = f64
  71. atanf_xcub = f65
  72. atanf_tsq = f66
  73. atanf_t4 = f67
  74. atanf_x5 = f68
  75. atanf_x6 = f69
  76. atanf_x11 = f70
  // Partial polynomial sums.
  77. atanf_poly_p1 = f71
  78. atanf_poly_p2 = f72
  79. atanf_poly_p3 = f73
  80. atanf_poly_p4 = f74
  81. atanf_poly_p5 = f75
  82. atanf_poly_q1 = f76
  83. atanf_poly_q2 = f77
  84. atanf_poly_q3 = f78
  85. atanf_poly_q4 = f79
  86. atanf_poly_q5 = f80
  87. atanf_poly_q = f81 // sole owner of f81
  90. atanf_poly_r3 = f83
  91. atanf_bsq = f84
  92. atanf_z4 = f85
  93. atanf_z5 = f86
  94. atanf_z8 = f87
  95. atanf_z13 = f88
  96. atanf_poly_r2 = f89 // only binding of atanf_poly_r2 (f82 is left unused)
  97. atanf_poly_r1 = f90 // only binding of atanf_poly_r1
  98. atanf_z8_bsq = f91
  99. atanf_poly_r = f92
  100. atanf_z21_poly_r = f93
  101. atanf_answer = f8 // result is produced in the same register the argument arrives in
  102. // predicate registers used
  103. //atanf_pred_LE1 = p6
  104. //atanf_pred_GT1 = p7
  105. .data
  106. .align 16
  // Table 1: fourteen 8-byte entries. atanf reads them through EXP_Addr1 as
  // seven consecutive ldfpd pairs, in this order:
  //   (R4,R5) (R3,P1) (Q8,Q9) (Q2,Q3) (Q0,Q1) (P3,P4) (P2,pi/2)
  // so the entry order here must match that load sequence exactly.
  // In the trailing comments, "pN = qM" means the entry is loaded into the
  // qM register and carries the same bit pattern as the pN entry.
  107. atanf_coeff_1_table:
  108. data8 0x40c4c241be751ff2 // r4
  109. data8 0x40e9f300c2f3070b // r5
  110. data8 0x409babffef772075 // r3
  111. data8 0xbfd5555512191621 // p1
  112. data8 0x3fc9997e7afbff4e // p2 = q8
  113. data8 0xbfd5555512191621 // p1 = q9
  114. data8 0x3f97105b4160f86b // p8 = q2
  115. data8 0xbfa6e10ba401393f // p7 = q3
  116. data8 0x3f522e5d33bc9baa // p10 = q0
  117. data8 0xbf7deaadaa336451 // p9 = q1
  118. data8 0xbfc2473c5145ee38 // p3
  119. data8 0x3fbc4f512b1865f5 // p4
  120. data8 0x3fc9997e7afbff4e // p2
  121. data8 0x3ff921fb54442d18 // pi/2
  // Table 2: twelve 8-byte entries. atanf reads them through EXP_Addr2 as six
  // consecutive ldfpd pairs, in this order:
  //   (R1,R2) (Q6,Q7) (Q4,Q5) (P5,P6) (P7,P8) (P9,P10)
  122. atanf_coeff_2_table:
  123. data8 0x4035000000004284 // r1
  124. data8 0x406cdffff336a59b // r2
  125. data8 0x3fbc4f512b1865f5 // p4 = q6
  126. data8 0xbfc2473c5145ee38 // p3 = q7
  127. data8 0x3fb142a73d7c54e3 // p6 = q4
  128. data8 0xbfb68eed6a8cfa32 // p5 = q5
  129. data8 0xbfb68eed6a8cfa32 // p5
  130. data8 0x3fb142a73d7c54e3 // p6
  131. data8 0xbfa6e10ba401393f // p7
  132. data8 0x3f97105b4160f86b // p8
  133. data8 0xbf7deaadaa336451 // p9
  134. data8 0x3f522e5d33bc9baa // p10
  135. .global atanf
  136. .text
  137. .proc atanf
  138. .align 32
  // atanf entry point.
  //   In:   f8 = x
  //   Out:  f8 = result (atanf_answer is f8; the final operations use the .s completer)
  //   Integer registers: r32 receives ar.pfs from alloc; r33/r34 (EXP_Addr1/EXP_Addr2)
  //   walk the two coefficient tables. Predicates written: p6, p7, p8, p9, p10.
  //
  //   With t = x*x, z = frcpa approximation of 1/x, and b = 1 - x*z:
  //     p6 (|x| <= 1):  f8 = x + P1*x^3 + P2*x^5 + P3*x^7 + P4*x^9
  //                          + x^11*(P5 + P6*t + P7*t^2 + P8*t^3 + P9*t^4 + P10*t^5)
  //     p7 (otherwise): f8 = sgn(x)*pi/2 - Q(t) * z^21 * R(b), where
  //                          Q(t) = Q0 + Q1*t + ... + Q9*t^9 + t^10
  //                          R(b) = 1 + R1*b + R2*b^2 + R3*b^3 + R4*b^4 + R5*b^5
  //     zero, infinity and NaN inputs branch to ATANF_X_INF_NAN_ZERO.
  //   Both polynomial paths are evaluated unconditionally; only the two final
  //   instructions are predicated.
  //   NOTE(review): R(b) presumably corrects z^21 toward x^-21 — confirm against
  //   the original derivation.
  139. atanf:
  140. { .mfi
  141. alloc r32 = ar.pfs,1,2,0,0 // 1 input (r32), 2 locals (r33, r34); r32 receives ar.pfs
  142. frcpa.s1 atanf_z,p0 = f1,f8 // z ~ 1/x; the predicate result is discarded (p0)
  143. addl EXP_Addr2 = @ltoff(atanf_coeff_2_table),gp // address of the linkage-table slot for table 2
  144. }
  145. { .mfi
  146. addl EXP_Addr1 = @ltoff(atanf_coeff_1_table),gp // address of the linkage-table slot for table 1
  147. fma.s1 atanf_t = f8,f8,f0 // t = x^2
  148. nop.i 999;;
  149. }
  150. { .mfi
  151. nop.m 999
  152. fmerge.s atanf_sgn_x = f8,f1 // sign of x on the magnitude of 1.0 -> +/-1.0
  153. nop.i 999;;
  154. }
  155. { .mfi
  156. ld8 EXP_Addr1 = [EXP_Addr1] // EXP_Addr1 now points at atanf_coeff_1_table
  157. fmerge.s atanf_abs_x = f1,f8 // sign of 1.0 on the magnitude of x -> |x|
  158. nop.i 999
  159. }
  160. { .mfi
  161. ld8 EXP_Addr2 = [EXP_Addr2] // EXP_Addr2 now points at atanf_coeff_2_table
  162. nop.f 999
  163. nop.i 999;;
  164. }
  // NOTE(review): p8 is rewritten by the later fclass.m (mask 0xe7) before anything
  // reads it, so the result of this fclass.m looks unused.
  165. { .mfi
  166. nop.m 999
  167. fclass.m p8,p0 = f8,0x7 // @zero
  168. nop.i 999;;
  169. }
  // NOTE(review): p9 and p10 are not read on the main path (p9 is rewritten in the
  // special-case tail). This s0 compare is presumably kept for its status-flag side
  // effects — confirm before removing.
  170. { .mfi
  171. nop.m 999
  172. fcmp.eq.unc.s0 p9,p10 = f8,f1
  173. nop.i 999;;
  174. }
  // Coefficient loads (post-increment by 16 after each pair) interleaved with the
  // first power computations.
  175. { .mfi
  176. ldfpd atanf_coeff_R4,atanf_coeff_R5 = [EXP_Addr1],16
  177. fnma.s1 atanf_b = f8,atanf_z,f1 // b = 1 - x*z
  178. nop.i 999
  179. }
  180. { .mfi
  181. ldfpd atanf_coeff_R1,atanf_coeff_R2 = [EXP_Addr2],16
  182. fma.s1 atanf_zsq = atanf_z,atanf_z,f0 // z^2
  183. nop.i 999;;
  184. }
  185. { .mfi
  186. ldfpd atanf_coeff_R3,atanf_coeff_P1 = [EXP_Addr1],16
  187. fma.s1 atanf_xcub = f8,atanf_t,f0 // x^3
  188. nop.i 999
  189. }
  190. { .mfi
  191. ldfpd atanf_coeff_Q6,atanf_coeff_Q7 = [EXP_Addr2],16
  192. fma.s1 atanf_tsq = atanf_t,atanf_t,f0 // t^2 = x^4
  193. nop.i 999;;
  194. }
  195. { .mfi
  196. ldfpd atanf_coeff_Q8,atanf_coeff_Q9 = [EXP_Addr1],16
  197. // fcmp.le.s1 atanf_pred_LE1,atanf_pred_GT1 = atanf_abs_x,f1
  198. fcmp.le.s1 p6,p7 = atanf_abs_x,f1 // p6: |x| <= 1, p7: complement; selects the final result
  199. nop.i 999
  200. }
  201. { .mfi
  202. ldfpd atanf_coeff_Q4,atanf_coeff_Q5 = [EXP_Addr2],16
  203. nop.f 999
  204. nop.i 999;;
  205. }
  206. { .mfi
  207. ldfpd atanf_coeff_Q2,atanf_coeff_Q3 = [EXP_Addr1],16
  208. fclass.m p8,p0 = f8,0xe7 // @inf|@qnan|@snan|@zero -> p8 guards the special-case branch
  209. nop.i 999
  210. }
  211. { .mfi
  212. ldfpd atanf_coeff_P5,atanf_coeff_P6 = [EXP_Addr2],16
  213. nop.f 999
  214. nop.i 999;;
  215. }
  216. { .mfi
  217. ldfpd atanf_coeff_Q0,atanf_coeff_Q1 = [EXP_Addr1],16
  218. nop.f 999
  219. nop.i 999
  220. }
  221. { .mfi
  222. ldfpd atanf_coeff_P7,atanf_coeff_P8 = [EXP_Addr2],16
  223. nop.f 999
  224. nop.i 999;;
  225. }
  226. { .mfi
  227. ldfpd atanf_coeff_P3,atanf_coeff_P4 = [EXP_Addr1],16
  228. fma.s1 atanf_bsq = atanf_b,atanf_b,f0 // b^2
  229. nop.i 999
  230. }
  231. { .mfi
  232. ldfpd atanf_coeff_P9,atanf_coeff_P10 = [EXP_Addr2] // last pair of table 2 (no post-increment)
  233. fma.s1 atanf_z4 = atanf_zsq,atanf_zsq,f0 // z^4
  234. nop.i 999;;
  235. }
  236. { .mfi
  237. ldfpd atanf_coeff_P2,atanf_piby2 = [EXP_Addr1] // last pair of table 1; pi/2 is also needed by the special-case tail
  238. fma.s1 atanf_x6 = atanf_t,atanf_tsq,f0 // x^6
  239. nop.i 999
  240. }
  241. { .mfi
  242. nop.m 999
  243. fma.s1 atanf_t4 = atanf_tsq,atanf_tsq,f0 // t^4 = x^8
  244. nop.i 999;;
  245. }
  246. { .mfb
  247. nop.m 999
  248. fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0 // x^5
  249. (p8) br.cond.spnt ATANF_X_INF_NAN_ZERO // zero, infinity or NaN: skip the polynomial evaluation
  250. }
  251. ;;
  // Main path: build the partial sums of R(b), Q(t) and the P polynomial.
  252. { .mfi
  253. nop.m 999
  254. fma.s1 atanf_poly_r1 = atanf_b,atanf_coeff_R1,f1 // 1 + R1*b
  255. nop.i 999
  256. }
  257. { .mfi
  258. nop.m 999
  259. fma.s1 atanf_poly_r3 = atanf_b,atanf_coeff_R5,atanf_coeff_R4 // R4 + R5*b
  260. nop.i 999;;
  261. }
  262. { .mfi
  263. nop.m 999
  264. fma.s1 atanf_poly_r2 = atanf_b,atanf_coeff_R3,atanf_coeff_R2 // R2 + R3*b
  265. nop.i 999
  266. }
  267. { .mfi
  268. nop.m 999
  269. fma.s1 atanf_z8 = atanf_z4,atanf_z4,f0 // z^8
  270. nop.i 999;;
  271. }
  272. { .mfi
  273. nop.m 999
  274. fma.s1 atanf_poly_q2 = atanf_t,atanf_coeff_Q5,atanf_coeff_Q4 // Q4 + Q5*t
  275. nop.i 999
  276. }
  277. { .mfi
  278. nop.m 999
  279. fma.s1 atanf_poly_q3 = atanf_t,atanf_coeff_Q7,atanf_coeff_Q6 // Q6 + Q7*t
  280. nop.i 999;;
  281. }
  282. { .mfi
  283. nop.m 999
  284. fma.s1 atanf_z5 = atanf_z,atanf_z4,f0 // z^5
  285. nop.i 999
  286. }
  287. { .mfi
  288. nop.m 999
  289. fma.s1 atanf_poly_q1 = atanf_t,atanf_coeff_Q9,atanf_coeff_Q8 // Q8 + Q9*t
  290. nop.i 999;;
  291. }
  292. { .mfi
  293. nop.m 999
  294. fma.s1 atanf_poly_q4 = atanf_t,atanf_coeff_Q1,atanf_coeff_Q0 // Q0 + Q1*t
  295. nop.i 999
  296. }
  297. { .mfi
  298. nop.m 999
  299. fma.s1 atanf_poly_q5 = atanf_t,atanf_coeff_Q3,atanf_coeff_Q2 // Q2 + Q3*t
  300. nop.i 999;;
  301. }
  302. { .mfi
  303. nop.m 999
  304. fma.s1 atanf_poly_p4 = f8,atanf_coeff_P1,f0 // P1*x
  305. nop.i 999
  306. }
  307. { .mfi
  308. nop.m 999
  309. fma.s1 atanf_poly_p5 = atanf_t,atanf_coeff_P4,atanf_coeff_P3 // P3 + P4*t
  310. nop.i 999;;
  311. }
  312. { .mfi
  313. nop.m 999
  314. fma.s1 atanf_poly_r1 = atanf_z8,atanf_poly_r1,f0 // z^8*(1 + R1*b)
  315. nop.i 999
  316. }
  317. { .mfi
  318. nop.m 999
  319. fma.s1 atanf_z8_bsq = atanf_z8,atanf_bsq,f0 // z^8 * b^2
  320. nop.i 999;;
  321. }
  322. { .mfi
  323. nop.m 999
  324. fma.s1 atanf_poly_q2 = atanf_tsq,atanf_poly_q3,atanf_poly_q2 // Q4 + Q5*t + Q6*t^2 + Q7*t^3
  325. nop.i 999
  326. }
  327. { .mfi
  328. nop.m 999
  329. fma.s1 atanf_poly_r2 = atanf_bsq,atanf_poly_r3,atanf_poly_r2 // R2 + R3*b + R4*b^2 + R5*b^3
  330. nop.i 999;;
  331. }
  332. { .mfi
  333. nop.m 999
  334. fma.s1 atanf_poly_p2 = atanf_t,atanf_coeff_P8,atanf_coeff_P7 // P7 + P8*t
  335. nop.i 999
  336. }
  337. { .mfi
  338. nop.m 999
  339. fma.s1 atanf_poly_q1 = atanf_poly_q1,f1,atanf_tsq // Q8 + Q9*t + t^2 (multiply by 1.0, add t^2)
  340. nop.i 999;;
  341. }
  342. { .mfi
  343. nop.m 999
  344. fma.s1 atanf_z13 = atanf_z5,atanf_z8,f0 // z^13
  345. nop.i 999
  346. }
  347. { .mfi
  348. nop.m 999
  349. fma.s1 atanf_poly_p1 = atanf_t,atanf_coeff_P10,atanf_coeff_P9 // P9 + P10*t
  350. nop.i 999;;
  351. }
  352. { .mfi
  353. nop.m 999
  354. fma.s1 atanf_poly_p4 = atanf_t,atanf_poly_p4,f8 // x + P1*x^3
  355. nop.i 999
  356. }
  357. { .mfi
  358. nop.m 999
  359. fma.s1 atanf_poly_q4 = atanf_tsq,atanf_poly_q5,atanf_poly_q4 // Q0 + Q1*t + Q2*t^2 + Q3*t^3
  360. nop.i 999;;
  361. }
  362. { .mfi
  363. nop.m 999
  364. fma.s1 atanf_poly_p3 = atanf_t,atanf_coeff_P6,atanf_coeff_P5 // P5 + P6*t
  365. nop.i 999
  366. }
  367. { .mfi
  368. nop.m 999
  369. fma.s1 atanf_poly_p5 = atanf_t,atanf_poly_p5,atanf_coeff_P2 // P2 + P3*t + P4*t^2
  370. nop.i 999;;
  371. }
  372. { .mfi
  373. nop.m 999
  374. fma.s1 atanf_x11 = atanf_x5,atanf_x6,f0 // x^11
  375. nop.i 999
  376. }
  377. { .mfi
  378. nop.m 999
  379. fma.s1 atanf_poly_r = atanf_z8_bsq,atanf_poly_r2,atanf_poly_r1 // z^8 * R(b)
  380. nop.i 999;;
  381. }
  382. { .mfi
  383. nop.m 999
  384. fma atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0 // sgn(x)*pi/2 (written without the .s1 completer used elsewhere)
  385. nop.i 999
  386. }
  387. { .mfi
  388. nop.m 999
  389. fma.s1 atanf_poly_q2 = atanf_t4,atanf_poly_q1,atanf_poly_q2 // Q4 + ... + Q9*t^5 + t^6
  390. nop.i 999;;
  391. }
  392. { .mfi
  393. nop.m 999
  394. fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p2 // P7 + P8*t + P9*t^2 + P10*t^3
  395. nop.i 999;;
  396. }
  397. { .mfi
  398. nop.m 999
  399. fma.s1 atanf_poly_p4 = atanf_x5,atanf_poly_p5,atanf_poly_p4 // x + P1*x^3 + P2*x^5 + P3*x^7 + P4*x^9
  400. nop.i 999;;
  401. }
  402. { .mfi
  403. nop.m 999
  404. fma.s1 atanf_z21_poly_r = atanf_z13,atanf_poly_r,f0 // z^21 * R(b)
  405. nop.i 999;;
  406. }
  407. { .mfi
  408. nop.m 999
  409. fma.s1 atanf_poly_q = atanf_t4,atanf_poly_q2,atanf_poly_q4 // Q(t) = Q0 + ... + Q9*t^9 + t^10
  410. nop.i 999;;
  411. }
  412. { .mfi
  413. nop.m 999
  414. fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p3 // P5 + P6*t + ... + P10*t^5
  415. nop.i 999;;
  416. }
  // Final selection: exactly one of the two predicated results is written to f8.
  417. { .mfi
  418. nop.m 999
  419. //(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
  420. (p7) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 // f8 = sgn(x)*pi/2 - Q(t)*z^21*R(b)
  421. nop.i 999;;
  422. }
  423. { .mfb
  424. nop.m 999
  425. //(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
  426. (p6) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 // f8 = low-order P terms + x^11 * high-order P terms
  427. br.ret.sptk b0
  428. }
  // Special-case tail, reached when f8 is zero, infinity or NaN.
  // For infinity, f8 becomes pi/2 carrying the sign of x; zero and NaN inputs are
  // passed straight to fnorm.s.
  429. ATANF_X_INF_NAN_ZERO:
  430. fclass.m p8,p9 = f8,0x23 // @inf
  431. ;;
  432. (p8) fmerge.s f8 = f8, atanf_piby2 // sign of x on the magnitude of pi/2
  433. ;;
  434. fnorm.s f8 = f8 // final result through the single-precision normalize
  435. br.ret.sptk b0
  436. .endp atanf