Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

531 lines
14 KiB

  1. .file "atanf.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. // History
  26. //==============================================================
  27. // 2/20/00 Initial version
  28. // 8/17/00 Changed predicate register macro-usage to direct predicate
  29. // names due to an assembler bug.
  30. //
  31. // Assembly macros
  32. //==============================================================
  33. // integer registers used
  34. EXP_Addr1 = r33 // pointer that walks atanf_coeff_1_table
  35. EXP_Addr2 = r34 // pointer that walks atanf_coeff_2_table
  36. // floating point registers used
  // Coefficient registers. They are filled two at a time by the ldfpd
  // loads in atanf, which is why the numbering follows table order
  // rather than coefficient order.
  37. atanf_coeff_R4 = f32
  38. atanf_coeff_R5 = f33
  39. atanf_coeff_R1 = f34
  40. atanf_coeff_R2 = f35
  41. atanf_coeff_R3 = f36
  42. atanf_coeff_P1 = f37
  43. atanf_coeff_Q6 = f38
  44. atanf_coeff_Q7 = f39
  45. atanf_coeff_Q8 = f40
  46. atanf_coeff_Q9 = f41
  47. atanf_coeff_Q4 = f42
  48. atanf_coeff_Q5 = f43
  49. atanf_coeff_Q2 = f44
  50. atanf_coeff_Q3 = f45
  51. atanf_coeff_P5 = f46
  52. atanf_coeff_P6 = f47
  53. atanf_coeff_Q0 = f48
  54. atanf_coeff_Q1 = f49
  55. atanf_coeff_P7 = f50
  56. atanf_coeff_P8 = f51
  57. atanf_coeff_P3 = f52
  58. atanf_coeff_P4 = f53
  59. atanf_coeff_P9 = f54
  60. atanf_coeff_P10 = f55
  61. atanf_coeff_P2 = f56
  62. atanf_piby2 = f57 // pi/2, the last entry of atanf_coeff_1_table
  // Working values; x is the argument held in f8.
  63. atanf_z = f58 // frcpa approximation of 1/x
  64. atanf_b = f59 // 1 - x*z
  65. atanf_zsq = f60 // z^2
  66. atanf_sgn_x = f61 // 1.0 carrying the sign of x
  67. atanf_sgnx_piby2 = f62 // atanf_sgn_x * pi/2
  68. atanf_abs_x = f63 // |x|
  69. atanf_t = f64 // x^2
  70. atanf_xcub = f65 // x^3
  71. atanf_tsq = f66 // t^2 = x^4
  72. atanf_t4 = f67 // t^4 = x^8
  73. atanf_x5 = f68 // x^5
  74. atanf_x6 = f69 // x^6
  75. atanf_x11 = f70 // x^11
  // Partial sums of the polynomial used when |x| <= 1.
  76. atanf_poly_p1 = f71
  77. atanf_poly_p2 = f72
  78. atanf_poly_p3 = f73
  79. atanf_poly_p4 = f74
  80. atanf_poly_p5 = f75
  // Partial sums of the polynomial in t used when |x| > 1.
  81. atanf_poly_q1 = f76
  82. atanf_poly_q2 = f77
  83. atanf_poly_q3 = f78
  84. atanf_poly_q4 = f79
  85. atanf_poly_q5 = f80
  86. atanf_poly_q = f81 // complete q polynomial
  // Partial sums of the polynomial in b; atanf_poly_r1 and atanf_poly_r2
  // are assigned exactly once, further down (f90 and f89).
  89. atanf_poly_r3 = f83
  90. atanf_bsq = f84 // b^2
  91. atanf_z4 = f85 // z^4
  92. atanf_z5 = f86 // z^5
  93. atanf_z8 = f87 // z^8
  94. atanf_z13 = f88 // z^13
  95. atanf_poly_r2 = f89
  96. atanf_poly_r1 = f90
  97. atanf_z8_bsq = f91 // z^8 * b^2
  98. atanf_poly_r = f92 // z^8 * (polynomial in b)
  99. atanf_z21_poly_r = f93 // z^13 * atanf_poly_r
  100. atanf_answer = f8 // result register; f8 also holds the argument on entry
  101. // predicate registers used
  // The two names below stay commented out: atanf writes p6 (|x| <= 1) and
  // p7 (|x| > 1) directly, per the 8/17/00 history entry.
  102. //atanf_pred_LE1 = p6
  103. //atanf_pred_GT1 = p7
  104. .data
  105. .align 16
  // Coefficient tables for atanf. Every entry is the bit pattern of a
  // double. atanf reads them strictly in order, two entries (16 bytes)
  // per ldfpd with a post-increment of 16, so the entry order below is
  // the load order and must not be rearranged. Table 1 holds 14 entries
  // (112 bytes), so table 2 starts on a 16-byte boundary as well.
  //
  // The q coefficients reuse the p values in reverse: q(k) = p(10-k).
  // That is why several p values are stored twice, once next to the
  // neighbour they are loaded with as a q pair and once as a p pair.
  //
  // r1..r5 are close to 21, 231, 1771, 10626 and 53130, the leading
  // series coefficients of 1/(1-b)^21.
  //
  // NOTE(review): nothing in the code visible here writes to these
  // tables; confirm whether a read-only section suits the target toolchain.
  106. atanf_coeff_1_table:
  107. data8 0x40c4c241be751ff2 // r4 (pair 1: R4,R5)
  108. data8 0x40e9f300c2f3070b // r5
  109. data8 0x409babffef772075 // r3 (pair 2: R3,P1)
  110. data8 0xbfd5555512191621 // p1
  111. data8 0x3fc9997e7afbff4e // p2 = q8 (pair 3: Q8,Q9)
  112. data8 0xbfd5555512191621 // p1 = q9
  113. data8 0x3f97105b4160f86b // p8 = q2 (pair 4: Q2,Q3)
  114. data8 0xbfa6e10ba401393f // p7 = q3
  115. data8 0x3f522e5d33bc9baa // p10 = q0 (pair 5: Q0,Q1)
  116. data8 0xbf7deaadaa336451 // p9 = q1
  117. data8 0xbfc2473c5145ee38 // p3 (pair 6: P3,P4)
  118. data8 0x3fbc4f512b1865f5 // p4
  119. data8 0x3fc9997e7afbff4e // p2 (pair 7: P2,pi/2)
  120. data8 0x3ff921fb54442d18 // pi/2
  121. atanf_coeff_2_table:
  122. data8 0x4035000000004284 // r1 (pair 1: R1,R2)
  123. data8 0x406cdffff336a59b // r2
  124. data8 0x3fbc4f512b1865f5 // p4 = q6 (pair 2: Q6,Q7)
  125. data8 0xbfc2473c5145ee38 // p3 = q7
  126. data8 0x3fb142a73d7c54e3 // p6 = q4 (pair 3: Q4,Q5)
  127. data8 0xbfb68eed6a8cfa32 // p5 = q5
  128. data8 0xbfb68eed6a8cfa32 // p5 (pair 4: P5,P6)
  129. data8 0x3fb142a73d7c54e3 // p6
  130. data8 0xbfa6e10ba401393f // p7 (pair 5: P7,P8)
  131. data8 0x3f97105b4160f86b // p8
  132. data8 0xbf7deaadaa336451 // p9 (pair 6: P9,P10)
  133. data8 0x3f522e5d33bc9baa // p10
  134. .global atanf
  135. .text
  136. .proc atanf
  137. .align 32
  //==============================================================
  // atanf: single-precision arctangent.
  //   In : f8 = x
  //   Out: f8 = result, produced by a .s (single) rounded operation
  //        on every exit path
  //   Leaf routine: makes no calls and performs no stores; the only
  //   memory reads are the linkage-table slots and the two coefficient
  //   tables. Intermediate arithmetic uses status field s1 except
  //   where a comment says otherwise.
  //
  // With t = x^2, z = frcpa approximation of 1/x and b = 1 - x*z:
  //   |x| <= 1 (p6): x + P1*x^3 + P2*x^5 + ... + P10*x^21
  //   |x| >  1 (p7): sgn(x)*pi/2 - q(t) * z^21 * r(b), where
  //       q(t) = Q0 + Q1*t + ... + Q9*t^9 + t^10
  //       r(b) = 1 + R1*b + ... + R5*b^5, which corrects z^21
  //              towards 1/x^21
  //   zero, NaN, infinity (p8): branch to ATANF_X_INF_NAN_ZERO
  // Both polynomials are evaluated unconditionally; p6/p7 only select
  // which final operation writes f8.
  //==============================================================
  138. atanf:
  139. { .mfi
  140. alloc r32 = ar.pfs,1,2,0,0 // 1 input, 2 locals (r33/r34 = table pointers); r32 gets the old ar.pfs
  141. frcpa.s1 atanf_z,p0 = f1,f8 // z = approximation of 1/x; predicate result discarded
  142. addl EXP_Addr2 = @ltoff(atanf_coeff_2_table),gp // address of the linkage-table slot for table 2
  143. }
  144. { .mfi
  145. addl EXP_Addr1 = @ltoff(atanf_coeff_1_table),gp // address of the linkage-table slot for table 1
  146. fma.s1 atanf_t = f8,f8,f0 // t = x^2
  147. nop.i 999;;
  148. }
  149. { .mfi
  150. nop.m 999
  151. fmerge.s atanf_sgn_x = f8,f1 // sign of x on the magnitude 1.0
  152. nop.i 999;;
  153. }
  154. { .mfi
  155. ld8 EXP_Addr1 = [EXP_Addr1] // EXP_Addr1 = &atanf_coeff_1_table
  156. fmerge.s atanf_abs_x = f1,f8 // sign of 1.0 on the magnitude of x: |x|
  157. nop.i 999
  158. }
  159. { .mfi
  160. ld8 EXP_Addr2 = [EXP_Addr2] // EXP_Addr2 = &atanf_coeff_2_table
  161. nop.f 999
  162. nop.i 999;;
  163. }
  169. { .mfi
  170. nop.m 999
  // p9 and p10 are never read after this compare.
  // NOTE(review): presumably kept for its status-flag side effect in s0 - confirm.
  171. fcmp.eq.unc.s0 p9,p10 = f8,f1
  172. nop.i 999;;
  173. }
  // Coefficient loads: the two pointers alternate, each ldfpd taking
  // the next 16-byte pair from its table.
  174. { .mfi
  175. ldfpd atanf_coeff_R4,atanf_coeff_R5 = [EXP_Addr1],16
  176. fnma.s1 atanf_b = f8,atanf_z,f1 // b = 1 - x*z
  177. nop.i 999
  178. }
  179. { .mfi
  180. ldfpd atanf_coeff_R1,atanf_coeff_R2 = [EXP_Addr2],16
  181. fma.s1 atanf_zsq = atanf_z,atanf_z,f0 // z^2
  182. nop.i 999;;
  183. }
  184. { .mfi
  185. ldfpd atanf_coeff_R3,atanf_coeff_P1 = [EXP_Addr1],16
  186. fma.s1 atanf_xcub = f8,atanf_t,f0 // x^3
  187. nop.i 999
  188. }
  189. { .mfi
  190. ldfpd atanf_coeff_Q6,atanf_coeff_Q7 = [EXP_Addr2],16
  191. fma.s1 atanf_tsq = atanf_t,atanf_t,f0 // t^2 = x^4
  192. nop.i 999;;
  193. }
  194. { .mfi
  195. ldfpd atanf_coeff_Q8,atanf_coeff_Q9 = [EXP_Addr1],16
  196. // fcmp.le.s1 atanf_pred_LE1,atanf_pred_GT1 = atanf_abs_x,f1
  197. fcmp.le.s1 p6,p7 = atanf_abs_x,f1 // p6: |x| <= 1, p7: |x| > 1
  198. nop.i 999
  199. }
  200. { .mfi
  201. ldfpd atanf_coeff_Q4,atanf_coeff_Q5 = [EXP_Addr2],16
  202. nop.f 999
  203. nop.i 999;;
  204. }
  205. { .mfi
  206. ldfpd atanf_coeff_Q2,atanf_coeff_Q3 = [EXP_Addr1],16
  207. fclass.m p8,p0 = f8,0xe7 // @inf|@qnan|@snan|@zero -> p8 selects the special-value exit
  208. nop.i 999
  209. }
  210. { .mfi
  211. ldfpd atanf_coeff_P5,atanf_coeff_P6 = [EXP_Addr2],16
  212. nop.f 999
  213. nop.i 999;;
  214. }
  215. { .mfi
  216. ldfpd atanf_coeff_Q0,atanf_coeff_Q1 = [EXP_Addr1],16
  217. nop.f 999
  218. nop.i 999
  219. }
  220. { .mfi
  221. ldfpd atanf_coeff_P7,atanf_coeff_P8 = [EXP_Addr2],16
  222. nop.f 999
  223. nop.i 999;;
  224. }
  225. { .mfi
  226. ldfpd atanf_coeff_P3,atanf_coeff_P4 = [EXP_Addr1],16
  227. fma.s1 atanf_bsq = atanf_b,atanf_b,f0 // b^2
  228. nop.i 999
  229. }
  230. { .mfi
  231. ldfpd atanf_coeff_P9,atanf_coeff_P10 = [EXP_Addr2] // last pair of table 2, no post-increment
  232. fma.s1 atanf_z4 = atanf_zsq,atanf_zsq,f0 // z^4
  233. nop.i 999;;
  234. }
  235. { .mfi
  236. ldfpd atanf_coeff_P2,atanf_piby2 = [EXP_Addr1] // last pair of table 1; pi/2 is needed on the special path too
  237. fma.s1 atanf_x6 = atanf_t,atanf_tsq,f0 // x^6
  238. nop.i 999
  239. }
  240. { .mfi
  241. nop.m 999
  242. fma.s1 atanf_t4 = atanf_tsq,atanf_tsq,f0 // t^4 = x^8
  243. nop.i 999;;
  244. }
  245. { .mfb
  246. nop.m 999
  247. fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0 // x^5
  248. (p8) br.cond.spnt ATANF_X_INF_NAN_ZERO // zero, NaN or infinity: leave the polynomial path
  249. }
  250. ;;
  // From here on x is finite and non-zero.
  251. { .mfi
  252. nop.m 999
  253. fma.s1 atanf_poly_r1 = atanf_b,atanf_coeff_R1,f1 // r1 = 1 + R1*b
  254. nop.i 999
  255. }
  256. { .mfi
  257. nop.m 999
  258. fma.s1 atanf_poly_r3 = atanf_b,atanf_coeff_R5,atanf_coeff_R4 // r3 = R4 + R5*b
  259. nop.i 999;;
  260. }
  261. { .mfi
  262. nop.m 999
  263. fma.s1 atanf_poly_r2 = atanf_b,atanf_coeff_R3,atanf_coeff_R2 // r2 = R2 + R3*b
  264. nop.i 999
  265. }
  266. { .mfi
  267. nop.m 999
  268. fma.s1 atanf_z8 = atanf_z4,atanf_z4,f0 // z^8
  269. nop.i 999;;
  270. }
  271. { .mfi
  272. nop.m 999
  273. fma.s1 atanf_poly_q2 = atanf_t,atanf_coeff_Q5,atanf_coeff_Q4 // q2 = Q4 + Q5*t
  274. nop.i 999
  275. }
  276. { .mfi
  277. nop.m 999
  278. fma.s1 atanf_poly_q3 = atanf_t,atanf_coeff_Q7,atanf_coeff_Q6 // q3 = Q6 + Q7*t
  279. nop.i 999;;
  280. }
  281. { .mfi
  282. nop.m 999
  283. fma.s1 atanf_z5 = atanf_z,atanf_z4,f0 // z^5
  284. nop.i 999
  285. }
  286. { .mfi
  287. nop.m 999
  288. fma.s1 atanf_poly_q1 = atanf_t,atanf_coeff_Q9,atanf_coeff_Q8 // q1 = Q8 + Q9*t
  289. nop.i 999;;
  290. }
  291. { .mfi
  292. nop.m 999
  293. fma.s1 atanf_poly_q4 = atanf_t,atanf_coeff_Q1,atanf_coeff_Q0 // q4 = Q0 + Q1*t
  294. nop.i 999
  295. }
  296. { .mfi
  297. nop.m 999
  298. fma.s1 atanf_poly_q5 = atanf_t,atanf_coeff_Q3,atanf_coeff_Q2 // q5 = Q2 + Q3*t
  299. nop.i 999;;
  300. }
  301. { .mfi
  302. nop.m 999
  303. fma.s1 atanf_poly_p4 = f8,atanf_coeff_P1,f0 // p4 = P1*x
  304. nop.i 999
  305. }
  306. { .mfi
  307. nop.m 999
  308. fma.s1 atanf_poly_p5 = atanf_t,atanf_coeff_P4,atanf_coeff_P3 // p5 = P3 + P4*t
  309. nop.i 999;;
  310. }
  311. { .mfi
  312. nop.m 999
  313. fma.s1 atanf_poly_r1 = atanf_z8,atanf_poly_r1,f0 // r1 = z^8 * (1 + R1*b)
  314. nop.i 999
  315. }
  316. { .mfi
  317. nop.m 999
  318. fma.s1 atanf_z8_bsq = atanf_z8,atanf_bsq,f0 // z^8 * b^2
  319. nop.i 999;;
  320. }
  321. { .mfi
  322. nop.m 999
  323. fma.s1 atanf_poly_q2 = atanf_tsq,atanf_poly_q3,atanf_poly_q2 // q2 = Q4 + Q5*t + Q6*t^2 + Q7*t^3
  324. nop.i 999
  325. }
  326. { .mfi
  327. nop.m 999
  328. fma.s1 atanf_poly_r2 = atanf_bsq,atanf_poly_r3,atanf_poly_r2 // r2 = R2 + R3*b + R4*b^2 + R5*b^3
  329. nop.i 999;;
  330. }
  331. { .mfi
  332. nop.m 999
  333. fma.s1 atanf_poly_p2 = atanf_t,atanf_coeff_P8,atanf_coeff_P7 // p2 = P7 + P8*t
  334. nop.i 999
  335. }
  336. { .mfi
  337. nop.m 999
  338. fma.s1 atanf_poly_q1 = atanf_poly_q1,f1,atanf_tsq // q1 = Q8 + Q9*t + t^2 (leading coefficient 1)
  339. nop.i 999;;
  340. }
  341. { .mfi
  342. nop.m 999
  343. fma.s1 atanf_z13 = atanf_z5,atanf_z8,f0 // z^13
  344. nop.i 999
  345. }
  346. { .mfi
  347. nop.m 999
  348. fma.s1 atanf_poly_p1 = atanf_t,atanf_coeff_P10,atanf_coeff_P9 // p1 = P9 + P10*t
  349. nop.i 999;;
  350. }
  351. { .mfi
  352. nop.m 999
  353. fma.s1 atanf_poly_p4 = atanf_t,atanf_poly_p4,f8 // p4 = x + P1*x^3
  354. nop.i 999
  355. }
  356. { .mfi
  357. nop.m 999
  358. fma.s1 atanf_poly_q4 = atanf_tsq,atanf_poly_q5,atanf_poly_q4 // q4 = Q0 + Q1*t + Q2*t^2 + Q3*t^3
  359. nop.i 999;;
  360. }
  361. { .mfi
  362. nop.m 999
  363. fma.s1 atanf_poly_p3 = atanf_t,atanf_coeff_P6,atanf_coeff_P5 // p3 = P5 + P6*t
  364. nop.i 999
  365. }
  366. { .mfi
  367. nop.m 999
  368. fma.s1 atanf_poly_p5 = atanf_t,atanf_poly_p5,atanf_coeff_P2 // p5 = P2 + P3*t + P4*t^2
  369. nop.i 999;;
  370. }
  371. { .mfi
  372. nop.m 999
  373. fma.s1 atanf_x11 = atanf_x5,atanf_x6,f0 // x^11
  374. nop.i 999
  375. }
  376. { .mfi
  377. nop.m 999
  378. fma.s1 atanf_poly_r = atanf_z8_bsq,atanf_poly_r2,atanf_poly_r1 // z^8 * (1 + R1*b + ... + R5*b^5)
  379. nop.i 999;;
  380. }
  381. { .mfi
  382. nop.m 999
  // sgn(x) * pi/2.
  // NOTE(review): unlike the surrounding operations this fma carries no
  // .s1 completer - confirm that is intended.
  383. fma atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0
  384. nop.i 999
  385. }
  386. { .mfi
  387. nop.m 999
  388. fma.s1 atanf_poly_q2 = atanf_t4,atanf_poly_q1,atanf_poly_q2 // q2 = Q4 + Q5*t + ... + Q9*t^5 + t^6
  389. nop.i 999;;
  390. }
  391. { .mfi
  392. nop.m 999
  393. fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p2 // p1 = P7 + P8*t + P9*t^2 + P10*t^3
  394. nop.i 999;;
  395. }
  396. { .mfi
  397. nop.m 999
  398. fma.s1 atanf_poly_p4 = atanf_x5,atanf_poly_p5,atanf_poly_p4 // p4 = x + P1*x^3 + P2*x^5 + P3*x^7 + P4*x^9
  399. nop.i 999;;
  400. }
  401. { .mfi
  402. nop.m 999
  403. fma.s1 atanf_z21_poly_r = atanf_z13,atanf_poly_r,f0 // z^21 * (1 + R1*b + ... + R5*b^5)
  404. nop.i 999;;
  405. }
  406. { .mfi
  407. nop.m 999
  408. fma.s1 atanf_poly_q = atanf_t4,atanf_poly_q2,atanf_poly_q4 // q = Q0 + Q1*t + ... + Q9*t^9 + t^10
  409. nop.i 999;;
  410. }
  411. { .mfi
  412. nop.m 999
  413. fma.s1 atanf_poly_p1 = atanf_tsq,atanf_poly_p1,atanf_poly_p3 // p1 = P5 + P6*t + ... + P10*t^5
  414. nop.i 999;;
  415. }
  // Final selection: exactly one of the two predicated operations
  // writes f8, rounding to single precision.
  416. { .mfi
  417. nop.m 999
  418. //(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
  419. (p7) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2 // |x| > 1: sgn(x)*pi/2 - q * z21_poly_r
  420. nop.i 999;;
  421. }
  422. { .mfb
  423. nop.m 999
  424. //(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
  425. (p6) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4 // |x| <= 1: x^11 * p1 + p4
  426. br.ret.sptk b0
  427. }
  // Special-value exit. f8 still holds the unmodified argument here,
  // and atanf_piby2 was loaded before the branch that leads to this label.
  428. ATANF_X_INF_NAN_ZERO:
  429. fclass.m p8,p9 = f8,0x23 // @inf
  430. ;;
  431. (p8) fmerge.s f8 = f8, atanf_piby2 // infinity: magnitude pi/2 with the sign of x
  432. ;;
  433. fnorm.s f8 = f8 // round f8 to single; zero and NaN arguments pass through this step
  434. br.ret.sptk b0
  435. .endp atanf