Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

745 lines
15 KiB

  1. .file "tan.s"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00: Initial version
  29. // 4/04/00: Unwind support added
  30. //
  31. // API
  32. //==============================================================
  33. // double tan( double x);
  34. //
  35. // Overview of operation
  36. //==============================================================
  37. // If the input value in radians is |x| >= 1.xxxxx 2^10 call the
  38. // older slower version.
  39. //
  40. // The new algorithm is used when |x| <= 1.xxxxx 2^9.
  41. //
  42. // Represent the input X as Nfloat * pi/2 + r
  43. // where r can be negative and |r| <= pi/4
  44. //
  45. // tan_W = x * 2/pi
  46. // Nfloat = round_int(tan_W)
  47. //
  48. // tan_r = x - Nfloat * (pi/2)_hi
  49. // tan_r = tan_r - Nfloat * (pi/2)_lo
  50. //
  51. // We have two paths: p8, when Nfloat is even, and p9, when Nfloat is odd.
  52. // p8: tan(X) = tan(r)
  53. // p9: tan(X) = -cot(r)
  54. //
  55. // Each is evaluated as a series. The p9 path requires 1/r.
  56. //
  57. // The coefficients used in the series are stored in a table as
  58. // are the pi constants.
  59. //
  60. // Registers used
  61. //==============================================================
  62. //
  63. // predicate registers used:
  64. // p6, p7, p8, p9, p10
  65. //
  66. // floating-point registers used:
  67. // f32 -> f93
  68. // f8, input
  69. //
  70. // general registers used
  71. // r32 -> r43
  72. //
  73. // Assembly macros
  74. //==============================================================
  75. tan_Inv_Pi_by_2 = f32 // 2/pi, loaded from double_tan_constants
  76. tan_Pi_by_2_hi = f33 // high part of pi/2
  77. tan_Pi_by_2_lo = f34 // low part of pi/2, loaded from double_Q_tan_constants
  78. tan_P0 = f35 // P0..P15: coefficients used on the p8 (even N) path
  79. tan_P1 = f36
  80. tan_P2 = f37
  81. tan_P3 = f38
  82. tan_P4 = f39
  83. tan_P5 = f40
  84. tan_P6 = f41
  85. tan_P7 = f42
  86. tan_P8 = f43
  87. tan_P9 = f44
  88. tan_P10 = f45
  89. tan_P11 = f46
  90. tan_P12 = f47
  91. tan_P13 = f48
  92. tan_P14 = f49
  93. tan_P15 = f50
  94. tan_Q0 = f51 // Q0..Q10: coefficients used on the p9 (odd N) path
  95. tan_Q1 = f52
  96. tan_Q2 = f53
  97. tan_Q3 = f54
  98. tan_Q4 = f55
  99. tan_Q5 = f56
  100. tan_Q6 = f57
  101. tan_Q7 = f58
  102. tan_Q8 = f59
  103. tan_Q9 = f60
  104. tan_Q10 = f61
  105. tan_r = f62 // reduced argument r = x - N*pi/2
  106. tan_rsq = f63 // r^2
  107. tan_rcube = f64 // r^3 (p8 path only)
  108. tan_v18 = f65 // tan_v1..tan_v18: polynomial partial sums and powers of r^2;
  109. tan_v16 = f66 // several are reused with different contents on the p8 and p9 paths
  110. tan_v17 = f67
  111. tan_v12 = f68
  112. tan_v13 = f69
  113. tan_v7 = f70
  114. tan_v8 = f71
  115. tan_v4 = f72
  116. tan_v5 = f73
  117. tan_v15 = f74
  118. tan_v11 = f75
  119. tan_v14 = f76
  120. tan_v3 = f77
  121. tan_v6 = f78
  122. tan_v10 = f79
  123. tan_v2 = f80
  124. tan_v9 = f81
  125. tan_v1 = f82 // final polynomial value on either path
  126. tan_int_Nfloat = f83 // N as an integer held in a floating-point register (fcvt.fx result)
  127. tan_Nfloat = f84 // N converted back to floating point (fcvt.xf result)
  128. tan_NORM_f8 = f85 // normalized copy of the input f8
  129. tan_W = f86 // x * 2/pi
  130. tan_y0 = f87 // frcpa approximation of 1/r (p9 path)
  131. tan_d = f88 // 1 - r*y0
  132. tan_y1 = f89 // y0 + y0*d
  133. tan_dsq = f90 // d^2
  134. tan_y2 = f91 // y0 + y1*d
  135. tan_d4 = f92 // d + d^4
  136. tan_inv_r = f93 // refined 1/r = y0 + d4*y2
  137. /////////////////////////////////////////////////////////////
  138. tan_AD = r33 // running pointer into double_tan_constants
  139. tan_GR_10009 = r34 // holds the exponent threshold 0x10009
  140. tan_GR_17_ones = r35 // holds the mask 0x1ffff
  141. tan_GR_N_odd_even = r36 // low bit of N
  142. tan_GR_N = r37 // integer N
  143. tan_signexp = r38 // getf.exp result for the normalized input
  144. tan_exp = r39 // tan_signexp masked with 0x1ffff
  145. tan_ADQ = r40 // running pointer into double_Q_tan_constants
  146. GR_SAVE_PFS = r41 // saved ar.pfs across the call in __libm_callout
  147. GR_SAVE_B0 = r42 // saved b0 across the call in __libm_callout
  148. GR_SAVE_GP = r43 // saved gp across the call in __libm_callout
  149. .data
  150. .align 16
  // Layout contract: tan walks both tables with post-incrementing loads
  // (ldfe ...,16 for the two-word entries, ldfd ...,8 for the rest), so the
  // entry order below must stay exactly in step with the load order in tan.
  151. double_tan_constants:
  152. data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi (16-byte entry, read with ldfe)
  153. data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi (16-byte entry, read with ldfe)
  154. data8 0xBEEA54580DDEA0E1 // P14
  155. data8 0x3ED3021ACE749A59 // P15
  156. data8 0xBEF312BD91DC8DA1 // P12
  157. data8 0x3EFAE9AFC14C5119 // P13
  158. data8 0x3F2F342BF411E769 // P8
  159. data8 0x3F1A60FC9F3B0227 // P9
  160. data8 0x3EFF246E78E5E45B // P10
  161. data8 0x3F01D9D2E782875C // P11
  162. data8 0x3F8226E34C4499B6 // P4
  163. data8 0x3F6D6D3F12C236AC // P5
  164. data8 0x3F57DA1146DCFD8B // P6
  165. data8 0x3F43576410FE3D75 // P7
  166. data8 0x3FD5555555555555 // P0
  167. data8 0x3FC11111111111C2 // P1
  168. data8 0x3FABA1BA1BA0E850 // P2
  169. data8 0x3F9664F4886725A7 // P3
  // tan reaches this second table as
  // tan_AD + (double_Q_tan_constants - double_tan_constants).
  170. double_Q_tan_constants:
  171. data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo (16-byte entry, read with ldfe)
  172. data8 0x3E223A73BA576E48 // Q8
  173. data8 0x3DF54AD8D1F2CA43 // Q9
  174. data8 0x3EF66A8EE529A6AA // Q4
  175. data8 0x3EC2281050410EE6 // Q5
  176. data8 0x3E8D6BB992CC3CF5 // Q6
  177. data8 0x3E57F88DE34832E4 // Q7
  178. data8 0x3FD5555555555555 // Q0
  179. data8 0x3F96C16C16C16DB8 // Q1
  180. data8 0x3F61566ABBFFB489 // Q2
  181. data8 0x3F2BBD77945C1733 // Q3
  182. data8 0x3D927FB33E2B0E04 // Q10
  183. .align 32
  184. .global tan#
  185. ////////////////////////////////////////////////////////
  186. .section .text
  187. .proc tan#
  188. .align 32
  // tan: double tan(double x); argument and result are both in f8.
  //  - x == +/-0 is returned unchanged (p6).
  //  - If the biased exponent of x is >= 0x10009 the work is handed to TAN_DBX (p7).
  //  - Otherwise N = round(x * 2/pi), r = x - N*pi/2, and the result is
  //      N even (p8): r + r^3 * P(r^2)
  //      N odd  (p9): r * Q(r^2) - 1/r
  //    with the table loads interleaved with the computation.
  189. tan:
  190. // The initial fnorm will take any unmasked faults and
  191. // normalize any single/double unorms
  192. { .mmi
  193. alloc r32=ar.pfs,1,11,0,0 // frame of 1 input + 11 locals (r32-r43), no outputs; ar.pfs copied to r32
  194. (p0) addl tan_AD = @ltoff(double_tan_constants), gp // address of the linkage-table entry for the table
  195. nop.i 999
  196. }
  197. ;;
  198. { .mmi
  199. ld8 tan_AD = [tan_AD] // tan_AD = address of double_tan_constants
  200. nop.m 999
  201. nop.i 999
  202. }
  203. ;;
  204. { .mfi
  205. nop.m 999
  206. (p0) fnorm tan_NORM_f8 = f8
  207. (p0) mov tan_GR_17_ones = 0x1ffff ;; // mask applied to the getf.exp result below
  208. }
  209. { .mfi
  210. nop.m 999
  211. nop.f 999
  212. (p0) mov tan_GR_10009 = 0x10009 ;; // exponent threshold for the slow path (0xffff + 10)
  213. }
  214. ;;
  215. { .mmi
  216. adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD // tan_ADQ = address of double_Q_tan_constants
  217. (p0) ldfe tan_Inv_Pi_by_2 = [tan_AD],16
  218. nop.i 999
  219. }
  220. ;;
  221. { .mfi
  222. (p0) ldfe tan_Pi_by_2_hi = [tan_AD],16
  223. (p0) fclass.m.unc p6,p0 = f8, 0x07 // p6 = x is +0 or -0 (class mask 0x07)
  224. }
  225. { .mfi
  226. (p0) ldfe tan_Pi_by_2_lo = [tan_ADQ],16
  227. nop.f 999
  228. nop.i 999 ;;
  229. }
  230. { .mmb
  231. (p0) ldfd tan_P14 = [tan_AD],8
  232. (p0) ldfd tan_Q8 = [tan_ADQ],8
  233. nop.b 999 ;;
  234. }
  235. { .mmb
  236. (p0) ldfd tan_P15 = [tan_AD],8
  237. (p0) ldfd tan_Q9 = [tan_ADQ],8
  238. nop.b 999 ;;
  239. }
  240. { .mmb
  241. (p0) ldfd tan_P12 = [tan_AD],8
  242. (p0) ldfd tan_Q4 = [tan_ADQ],8
  243. nop.b 999 ;;
  244. }
  245. { .mmb
  246. (p0) ldfd tan_P13 = [tan_AD],8
  247. (p0) ldfd tan_Q5 = [tan_ADQ],8
  248. nop.b 999 ;;
  249. }
  250. { .mmb
  251. (p0) ldfd tan_P8 = [tan_AD],8
  252. (p0) getf.exp tan_signexp = tan_NORM_f8 // sign and exponent field of the normalized input
  253. nop.b 999 ;;
  254. }
  255. { .mmb
  256. (p0) ldfd tan_P9 = [tan_AD],8
  257. (p0) ldfd tan_Q6 = [tan_ADQ],8
  258. (p6) br.ret.spnt b0 ;; // zero input: return with f8 untouched
  259. }
  260. { .mmi
  261. (p0) ldfd tan_P10 = [tan_AD],8
  262. (p0) ldfd tan_Q7 = [tan_ADQ],8
  263. (p0) and tan_exp = tan_GR_17_ones, tan_signexp ;; // keep the low 17 bits (exponent without sign)
  264. }
  265. // p7 is true if we must call DBX TAN
  266. // p7 is true if f8 exp is >= 0x10009 (which includes all ones
  267. // NAN or inf)
  268. { .mfi
  269. (p0) ldfd tan_P11 = [tan_AD],8
  270. (p0) fma.s1 tan_W = tan_NORM_f8, tan_Inv_Pi_by_2, f0 // tan_W = x * 2/pi
  271. (p0) cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009
  272. }
  273. { .mfi
  274. (p0) ldfd tan_Q0 = [tan_ADQ],8
  275. nop.f 999
  276. nop.i 999 ;;
  277. }
  278. { .mmb
  279. (p0) ldfd tan_P4 = [tan_AD],8
  280. (p0) ldfd tan_Q1 = [tan_ADQ],8
  281. (p7) br.cond.spnt TAN_DBX ;; // exponent too large: leave for the slow path
  282. }
  283. { .mmb
  284. (p0) ldfd tan_P5 = [tan_AD],8
  285. (p0) ldfd tan_Q2 = [tan_ADQ],8
  286. nop.b 999 ;;
  287. }
  288. { .mmb
  289. (p0) ldfd tan_P6 = [tan_AD],8
  290. (p0) ldfd tan_Q3 = [tan_ADQ],8
  291. nop.b 999 ;;
  292. }
  293. { .mmi
  294. (p0) ldfd tan_P7 = [tan_AD],8
  295. (p0) ldfd tan_Q10 = [tan_ADQ],8
  296. nop.i 999 ;;
  297. }
  298. // tan_int_Nfloat = Round_Int_Nearest(tan_W)
  299. { .mfi
  300. (p0) ldfd tan_P0 = [tan_AD],8
  301. (p0) fcvt.fx.s1 tan_int_Nfloat = tan_W
  302. nop.i 999 ;;
  303. }
  304. { .mmi
  305. (p0) ldfd tan_P1 = [tan_AD],8
  306. nop.m 999
  307. nop.i 999 ;;
  308. }
  309. { .mfi
  310. (p0) ldfd tan_P2 = [tan_AD],8
  311. nop.f 999
  312. nop.i 999 ;;
  313. }
  314. { .mmi
  315. (p0) ldfd tan_P3 = [tan_AD],8
  316. nop.m 999
  317. nop.i 999 ;;
  318. }
  319. { .mfi
  320. nop.m 999
  321. (p0) fcvt.xf tan_Nfloat = tan_int_Nfloat // N back to floating point for the reduction
  322. nop.i 999 ;;
  323. }
  324. { .mfi
  325. (p0) getf.sig tan_GR_N = tan_int_Nfloat // integer N into a general register
  326. nop.f 999
  327. nop.i 999 ;;
  328. }
  329. { .mmi
  330. nop.m 999
  331. nop.m 999
  332. (p0) and tan_GR_N_odd_even = 0x1, tan_GR_N ;; // low bit of N selects the path
  333. }
  334. // p8 ==> even
  335. // p9 ==> odd
  336. { .mmi
  337. nop.m 999
  338. nop.m 999
  339. (p0) cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;;
  340. }
  341. // tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x
  342. { .mfi
  343. nop.m 999
  344. (p0) fnma.s1 tan_r = tan_Nfloat, tan_Pi_by_2_hi, tan_NORM_f8
  345. nop.i 999 ;;
  346. }
  347. // tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo
  348. { .mfi
  349. nop.m 999
  350. (p0) fnma.s1 tan_r = tan_Nfloat, tan_Pi_by_2_lo, tan_r
  351. nop.i 999 ;;
  352. }
  353. { .mfi
  354. nop.m 999
  355. (p0) fma.s1 tan_rsq = tan_r, tan_r, f0 // rsq = r^2
  356. nop.i 999 ;;
  357. }
  358. { .mfi
  359. nop.m 999
  360. (p9) frcpa.s1 tan_y0, p10 = f1,tan_r // y0 ~ 1/r; p10 is written but not read anywhere in this file section
  361. nop.i 999 ;;
  362. }
  363. { .mfi
  364. nop.m 999
  365. (p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 // v18 = P14 + P15*r^2
  366. nop.i 999
  367. }
  368. { .mfi
  369. nop.m 999
  370. (p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 // v4 = P0 + P1*r^2
  371. nop.i 999 ;;
  372. }
  373. { .mfi
  374. nop.m 999
  375. (p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 // v16 = P12 + P13*r^2
  376. nop.i 999
  377. }
  378. { .mfi
  379. nop.m 999
  380. (p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 // v17 = r^4
  381. nop.i 999 ;;
  382. }
  383. { .mfi
  384. nop.m 999
  385. (p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 // v12 = P8 + P9*r^2
  386. nop.i 999
  387. }
  388. { .mfi
  389. nop.m 999
  390. (p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 // v13 = P10 + P11*r^2
  391. nop.i 999 ;;
  392. }
  393. { .mfi
  394. nop.m 999
  395. (p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 // v7 = P4 + P5*r^2
  396. nop.i 999
  397. }
  398. { .mfi
  399. nop.m 999
  400. (p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 // v8 = P6 + P7*r^2
  401. nop.i 999 ;;
  402. }
  403. { .mfi
  404. nop.m 999
  405. (p9) fnma.s1 tan_d = tan_r, tan_y0, f1 // d = 1 - r*y0 (error of the reciprocal estimate)
  406. nop.i 999
  407. }
  408. { .mfi
  409. nop.m 999
  410. (p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 // v5 = P2 + P3*r^2
  411. nop.i 999 ;;
  412. }
  413. { .mfi
  414. nop.m 999
  415. (p9) fma.s1 tan_v11 = tan_rsq, tan_Q9, tan_Q8 // v11 = Q8 + Q9*r^2
  416. nop.i 999
  417. }
  418. { .mfi
  419. nop.m 999
  420. (p9) fma.s1 tan_v12 = tan_rsq, tan_rsq, f0 // odd path reuses v12 as r^4
  421. nop.i 999 ;;
  422. }
  423. { .mfi
  424. nop.m 999
  425. (p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 // v15 = v16 + r^4*v18 (P12..P15)
  426. nop.i 999
  427. }
  428. { .mfi
  429. nop.m 999
  430. (p9) fma.s1 tan_v7 = tan_rsq, tan_Q5, tan_Q4 // v7 = Q4 + Q5*r^2
  431. nop.i 999 ;;
  432. }
  433. { .mfi
  434. nop.m 999
  435. (p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 // v11 = v12 + r^4*v13 (P8..P11)
  436. nop.i 999
  437. }
  438. { .mfi
  439. nop.m 999
  440. (p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 // v14 = r^8
  441. nop.i 999 ;;
  442. }
  443. { .mfi
  444. nop.m 999
  445. (p9) fma.s1 tan_v8 = tan_rsq, tan_Q7, tan_Q6 // v8 = Q6 + Q7*r^2
  446. nop.i 999
  447. }
  448. { .mfi
  449. nop.m 999
  450. (p9) fma.s1 tan_v3 = tan_rsq, tan_Q1, tan_Q0 // v3 = Q0 + Q1*r^2
  451. nop.i 999 ;;
  452. }
  453. { .mfi
  454. nop.m 999
  455. (p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 // v3 = v4 + r^4*v5 (P0..P3)
  456. nop.i 999
  457. }
  458. { .mfi
  459. nop.m 999
  460. (p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 // v6 = v7 + r^4*v8 (P4..P7)
  461. nop.i 999 ;;
  462. }
  463. { .mfi
  464. nop.m 999
  465. (p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 // y1 = y0 + y0*d
  466. nop.i 999
  467. }
  468. { .mfi
  469. nop.m 999
  470. (p9) fma.s1 tan_v10 = tan_v12, tan_Q10, tan_v11 // v10 = v11 + r^4*Q10 (Q8..Q10)
  471. nop.i 999 ;;
  472. }
  473. { .mfi
  474. nop.m 999
  475. (p9) fma.s1 tan_dsq = tan_d, tan_d, f0 // dsq = d^2
  476. nop.i 999
  477. }
  478. { .mfi
  479. nop.m 999
  480. (p9) fma.s1 tan_v9 = tan_v12, tan_v12,f0 // odd path: v9 = r^8
  481. nop.i 999 ;;
  482. }
  483. { .mfi
  484. nop.m 999
  485. (p9) fma.s1 tan_v4 = tan_rsq, tan_Q3, tan_Q2 // v4 = Q2 + Q3*r^2
  486. nop.i 999
  487. }
  488. { .mfi
  489. nop.m 999
  490. (p9) fma.s1 tan_v6 = tan_v12, tan_v8, tan_v7 // v6 = v7 + r^4*v8 (Q4..Q7)
  491. nop.i 999 ;;
  492. }
  493. { .mfi
  494. nop.m 999
  495. (p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 // v10 = v11 + r^8*v15 (P8..P15)
  496. nop.i 999 ;;
  497. }
  498. { .mfi
  499. nop.m 999
  500. (p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 // y2 = y0 + y1*d
  501. nop.i 999
  502. }
  503. { .mfi
  504. nop.m 999
  505. (p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d // d4 = d + d^4
  506. nop.i 999 ;;
  507. }
  508. { .mfi
  509. nop.m 999
  510. (p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 // v2 = v3 + r^8*v6 (P0..P7)
  511. nop.i 999
  512. }
  513. { .mfi
  514. nop.m 999
  515. (p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 // even path: v9 = r^16
  516. nop.i 999 ;;
  517. }
  518. { .mfi
  519. nop.m 999
  520. (p9) fma.s1 tan_v2 = tan_v12, tan_v4, tan_v3 // v2 = v3 + r^4*v4 (Q0..Q3)
  521. nop.i 999
  522. }
  523. { .mfi
  524. nop.m 999
  525. (p9) fma.s1 tan_v5 = tan_v9, tan_v10, tan_v6 // v5 = v6 + r^8*v10 (Q4..Q10)
  526. nop.i 999 ;;
  527. }
  528. { .mfi
  529. nop.m 999
  530. (p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 // inv_r = y0 + d4*y2, the refined 1/r
  531. nop.i 999
  532. }
  533. { .mfi
  534. nop.m 999
  535. (p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 // rcube = r^3
  536. nop.i 999 ;;
  537. }
  538. { .mfi
  539. nop.m 999
  540. (p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 // v1 = v2 + r^16*v10 = P(r^2)
  541. nop.i 999
  542. }
  543. { .mfi
  544. nop.m 999
  545. (p9) fma.s1 tan_v1 = tan_v9, tan_v5, tan_v2 // v1 = v2 + r^8*v5 = Q(r^2)
  546. nop.i 999 ;;
  547. }
  548. { .mfb
  549. nop.m 999
  550. (p8) fma.d f8 = tan_v1, tan_rcube, tan_r // even N: result = r + r^3*P(r^2), rounded to double
  551. (p0) nop.b 999
  552. }
  553. { .mfb
  554. nop.m 999
  555. (p9) fms.d.s0 f8 = tan_r, tan_v1, tan_inv_r // odd N: result = r*Q(r^2) - 1/r, rounded to double
  556. (p0) br.ret.sptk b0 ;;
  557. }
  558. .endp tan#
  559. .proc __libm_callout
  // Slow path, entered from tan by the branch to TAN_DBX. It saves ar.pfs,
  // gp and b0 in GR_SAVE_PFS/GR_SAVE_GP/GR_SAVE_B0, calls __libm_tan, rounds
  // the value left in f8 to double, restores the saved state and returns
  // through the restored b0. The .prologue/.save/.body directives describe
  // the saves for unwinding.
  560. __libm_callout:
  561. TAN_DBX:
  562. .prologue
  563. { .mfi
  564. nop.m 0
  565. fmerge.s f9 = f0,f0 // f9 = f0; NOTE(review): presumably a second argument/flag expected by __libm_tan - confirm against its definition
  566. .save ar.pfs,GR_SAVE_PFS
  567. mov GR_SAVE_PFS=ar.pfs
  568. }
  569. ;;
  570. { .mfi
  571. mov GR_SAVE_GP=gp
  572. nop.f 0
  573. .save b0, GR_SAVE_B0
  574. mov GR_SAVE_B0=b0 // the br.call below overwrites b0
  575. }
  576. .body
  577. { .mfb
  578. nop.m 999
  579. nop.f 999
  580. (p0) br.call.sptk.many b0=__libm_tan# ;;
  581. }
  582. { .mfi
  583. (p0) mov gp = GR_SAVE_GP
  584. (p0) fnorm.d f8 = f8 // round the value returned in f8 to double precision
  585. (p0) mov b0 = GR_SAVE_B0
  586. }
  587. ;;
  588. { .mib
  589. nop.m 999
  590. (p0) mov ar.pfs = GR_SAVE_PFS
  591. (p0) br.ret.sptk b0 // return through the b0 value saved on entry
  592. ;;
  593. }
  594. .endp __libm_callout
  595. .type __libm_tan#,@function
  596. .global __libm_tan#