Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

737 lines
17 KiB

  1. .file "tanf.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00: Initial version
  29. // 4/04/00 Unwind support added
  30. // 12/27/00 Improved speed
  31. // 02/21/01 Updated to call tanl
  32. //
  33. // API
  34. //==============================================================
  35. // float tanf( float x);
  36. //
  37. // Overview of operation
  38. //==============================================================
  39. // If the input value in radians is |x| >= 1.xxxxx 2^10 call the
  40. // older slower version.
  41. //
  42. // The new algorithm is used when |x| <= 1.xxxxx 2^9.
  43. //
  44. // Represent the input X as Nfloat * pi/2 + r
  45. // where r can be negative and |r| <= pi/4
  46. //
  47. // tan_W = x * 2/pi
  48. // Nfloat = round_int(tan_W)
  49. //
  50. // tan_r = x - Nfloat * (pi/2)_hi
  51. // tan_r = tan_r - Nfloat * (pi/2)_lo
  52. //
  53. // We have two paths: p8, when Nfloat is even, and p9, when Nfloat is odd.
  54. // p8: tan(X) = tan(r)
  55. // p9: tan(X) = -cot(r)
  56. //
  57. // Each is evaluated as a series. The p9 path requires 1/r.
  58. //
  59. // The coefficients used in the series are stored in a table as
  60. // are the pi constants.
  61. //
  62. // Registers used
  63. //==============================================================
  64. //
  65. // predicate registers used:
  66. // p6-10
  67. //
  68. // floating-point registers used:
  69. // f10-15, f32-105
  70. // f8, input
  71. //
  72. // general registers used
  73. // r14-18, r32-43
  74. //
  75. // Assembly macros
  76. //==============================================================
  77. TAN_INV_PI_BY_2_2TO64 = f10 // significand of 2/pi placed with setf.sig (i.e. 2/pi scaled up)
  78. TAN_RSHF_2TO64 = f11 // 1.1000 * 2^(63+63+1), right-shift constant for the scaled product
  79. TAN_2TOM64 = f12 // 2^-64, undoes the scaling when forming Nfloat
  80. TAN_RSHF = f13 // 1.1000 * 2^63, right-shift constant
  81. TAN_W_2TO64_RSH = f14 // x * (scaled 2/pi) + TAN_RSHF_2TO64; integer N sits in the low significand bits
  82. TAN_NFLOAT = f15 // N as a floating-point value
  83. tan_Inv_Pi_by_2 = f32 // not referenced in the visible code (the 2/pi table entry is commented out)
  84. tan_Pi_by_2_hi = f33 // high part of pi/2 (loaded with ldfe)
  85. tan_Pi_by_2_lo = f34 // low part of pi/2 (loaded with ldfe)
  86. tan_P0 = f35 // P0..P15: coefficients of the even-N (tan) series
  87. tan_P1 = f36
  88. tan_P2 = f37
  89. tan_P3 = f38
  90. tan_P4 = f39
  91. tan_P5 = f40
  92. tan_P6 = f41
  93. tan_P7 = f42
  94. tan_P8 = f43
  95. tan_P9 = f44
  96. tan_P10 = f45
  97. tan_P11 = f46
  98. tan_P12 = f47
  99. tan_P13 = f48
  100. tan_P14 = f49
  101. tan_P15 = f50
  102. tan_Q0 = f51 // Q0..Q10: coefficients of the odd-N (-cot) series
  103. tan_Q1 = f52
  104. tan_Q2 = f53
  105. tan_Q3 = f54
  106. tan_Q4 = f55
  107. tan_Q5 = f56
  108. tan_Q6 = f57
  109. tan_Q7 = f58
  110. tan_Q8 = f59
  111. tan_Q9 = f60
  112. tan_Q10 = f61
  113. tan_r = f62 // reduced argument r = x - N*pi/2
  114. tan_rsq = f63 // r^2
  115. tan_rcube = f64 // r^3
  116. tan_v18 = f65 // tan_v1..tan_v18: intermediates of the P (even-N) polynomial
  117. tan_v16 = f66
  118. tan_v17 = f67
  119. tan_v12 = f68
  120. tan_v13 = f69
  121. tan_v7 = f70
  122. tan_v8 = f71
  123. tan_v4 = f72
  124. tan_v5 = f73
  125. tan_v15 = f74
  126. tan_v11 = f75
  127. tan_v14 = f76
  128. tan_v3 = f77
  129. tan_v6 = f78
  130. tan_v10 = f79
  131. tan_v2 = f80
  132. tan_v9 = f81
  133. tan_v1 = f82
  134. tan_int_Nfloat = f83 // not referenced in the visible code
  135. tan_Nfloat = f84 // not referenced in the visible code (TAN_NFLOAT = f15 is used instead)
  136. tan_NORM_f8 = f85 // normalized copy of the input x
  137. tan_W = f86 // not referenced in the visible code
  138. tan_y0 = f87 // frcpa approximation of 1/r (odd-N path)
  139. tan_d = f88 // d = 1 - r*y0, error of the reciprocal approximation
  140. tan_y1 = f89
  141. tan_dsq = f90 // d^2
  142. tan_y2 = f91
  143. tan_d4 = f92 // d + d^4
  144. tan_inv_r = f93 // refined 1/r
  145. tan_z1 = f94 // tan_z1..tan_z12: intermediates of the Q (odd-N) polynomial
  146. tan_z2 = f95
  147. tan_z3 = f96
  148. tan_z4 = f97
  149. tan_z5 = f98
  150. tan_z6 = f99
  151. tan_z7 = f100
  152. tan_z8 = f101
  153. tan_z9 = f102
  154. tan_z10 = f103
  155. tan_z11 = f104
  156. tan_z12 = f105
  157. ///////////////////////////////////////////////////////////// general register aliases
  158. tan_GR_sig_inv_pi_by_2 = r14 // 64-bit significand of 2/pi
  159. tan_GR_rshf_2to64 = r15 // bit pattern of the double 1.1000 * 2^(63+63+1)
  160. tan_GR_exp_2tom64 = r16 // biased exponent of 2^-64
  161. tan_GR_n = r17 // significand bits of TAN_W_2TO64_RSH (integer N)
  162. tan_GR_rshf = r18 // bit pattern of the double 1.1000 * 2^63
  163. tan_AD = r33 // walking pointer into double_tan_constants
  164. tan_GR_10009 = r34 // exponent threshold 0x10009 for the large-argument path
  165. tan_GR_17_ones = r35 // 0x1ffff, masks the 17-bit exponent field
  166. tan_GR_N_odd_even = r36 // low bit of N: 0 = even, 1 = odd
  167. tan_GR_N = r37 // not referenced in the visible code
  168. tan_signexp = r38 // sign+exponent field of the normalized input
  169. tan_exp = r39 // exponent field with the sign masked off
  170. tan_ADQ = r40 // walking pointer into double_Q_tan_constants
  171. GR_SAVE_PFS = r41 // ar.pfs saved across the call to __libm_tan
  172. GR_SAVE_B0 = r42 // b0 (return address) saved across the call to __libm_tan
  173. GR_SAVE_GP = r43 // gp saved across the call to __libm_tan
  174. .data // constant tables; entry order matches the post-incrementing loads in tanf
  175. .align 16
  176. double_tan_constants: // pi/2 high part, then the P coefficient pairs (16 bytes per load)
  177. // data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi (disabled: tanf loads this significand as a movl immediate)
  178. data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi (significand, then sign/exponent word; read with ldfe)
  179. data8 0xBEEA54580DDEA0E1 // P14
  180. data8 0x3ED3021ACE749A59 // P15
  181. data8 0xBEF312BD91DC8DA1 // P12
  182. data8 0x3EFAE9AFC14C5119 // P13
  183. data8 0x3F2F342BF411E769 // P8
  184. data8 0x3F1A60FC9F3B0227 // P9
  185. data8 0x3EFF246E78E5E45B // P10
  186. data8 0x3F01D9D2E782875C // P11
  187. data8 0x3F8226E34C4499B6 // P4
  188. data8 0x3F6D6D3F12C236AC // P5
  189. data8 0x3F57DA1146DCFD8B // P6
  190. data8 0x3F43576410FE3D75 // P7
  191. data8 0x3FD5555555555555 // P0
  192. data8 0x3FC11111111111C2 // P1
  193. data8 0x3FABA1BA1BA0E850 // P2
  194. data8 0x3F9664F4886725A7 // P3
  195. double_Q_tan_constants: // pi/2 low part, then the Q coefficients; addressed as an offset from double_tan_constants
  196. data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo (significand, then sign/exponent word; read with ldfe)
  197. data8 0x3E223A73BA576E48 // Q8
  198. data8 0x3DF54AD8D1F2CA43 // Q9
  199. data8 0x3EF66A8EE529A6AA // Q4
  200. data8 0x3EC2281050410EE6 // Q5
  201. data8 0x3E8D6BB992CC3CF5 // Q6
  202. data8 0x3E57F88DE34832E4 // Q7
  203. data8 0x3FD5555555555555 // Q0
  204. data8 0x3F96C16C16C16DB8 // Q1
  205. data8 0x3F61566ABBFFB489 // Q2
  206. data8 0x3F2BBD77945C1733 // Q3
  207. data8 0x3D927FB33E2B0E04 // Q10 (last entry; read alone with ldfd)
  208. .align 32
  209. .global tanf#
  210. //////////////////////////////////////////////////////// tanf: single-precision tangent, input and result in f8
  211. .section .text
  212. .global tanf
  213. .proc tanf
  214. .align 32
  215. tanf:
  216. // The initial fnorm will take any unmasked faults and
  217. // normalize any single/double unorms
  218. { .mlx
  219. alloc r32=ar.pfs,1,11,0,0 // 1 input + 11 locals = r32-r43, no outputs, no rotating registers
  220. movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi
  221. }
  222. { .mlx
  223. addl tan_AD = @ltoff(double_tan_constants), gp // address of the linkage-table entry for the P table
  224. movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1)
  225. }
  226. ;;
  227. { .mfi
  228. ld8 tan_AD = [tan_AD] // tan_AD = address of double_tan_constants
  229. fnorm tan_NORM_f8 = f8
  230. mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64
  231. }
  232. { .mlx
  233. nop.m 999
  234. movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
  235. }
  236. ;;
  237. // Form two constants we need
  238. // 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand
  239. // 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand
  240. { .mmi
  241. setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2
  242. setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64
  243. mov tan_GR_17_ones = 0x1ffff ;;
  244. }
  245. // Form another constant
  246. // 2^-64 for scaling Nfloat
  247. // (also compute the Q table address and test for x=0)
  248. { .mmf
  249. setf.exp TAN_2TOM64 = tan_GR_exp_2tom64
  250. adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD
  251. fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
  252. }
  253. ;;
  254. // Form another constant
  255. // 1.1000...000 * 2^63, the right shift constant
  256. // (also start loading the tables and test for x=inf)
  257. { .mmf
  258. setf.d TAN_RSHF = tan_GR_rshf
  259. ldfe tan_Pi_by_2_hi = [tan_AD],16
  260. fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf
  261. }
  262. ;;
  263. { .mfb
  264. ldfe tan_Pi_by_2_lo = [tan_ADQ],16
  265. fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan
  266. (p6) br.ret.spnt b0 ;; // Exit for x=0
  267. }
  268. { .mfi
  269. ldfpd tan_P14,tan_P15 = [tan_AD],16
  270. (p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf
  271. mov tan_GR_10009 = 0x10009 // exponent threshold: 0xffff bias + 10
  272. }
  273. { .mib
  274. ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16
  275. nop.i 999
  276. (p7) br.ret.spnt b0 ;; // Exit for x=inf
  277. }
  278. { .mfi
  279. ldfpd tan_P12,tan_P13 = [tan_AD],16
  280. (p8) fma.s f8=f8,f1,f8 // Set qnan if x=nan
  281. nop.i 999
  282. }
  283. { .mib
  284. ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16
  285. nop.i 999
  286. (p8) br.ret.spnt b0 ;; // Exit for x=nan
  287. }
  288. { .mmi
  289. getf.exp tan_signexp = tan_NORM_f8 // sign and exponent field of normalized x
  290. ldfpd tan_P8,tan_P9 = [tan_AD],16
  291. nop.i 999 ;;
  292. }
  293. // Multiply x by scaled 2/pi and add large const to shift integer part of W to
  294. // rightmost bits of significand
  295. { .mfi
  296. ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16
  297. fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64
  298. nop.i 999 ;;
  299. }
  300. { .mmi
  301. ldfpd tan_P10,tan_P11 = [tan_AD],16
  302. nop.m 999
  303. and tan_exp = tan_GR_17_ones, tan_signexp ;; // drop the sign bit, keep the 17-bit exponent
  304. }
  305. // p7 is true if we must branch to TAN_DBX (which calls __libm_tan)
  306. // p7 is true if the biased exponent of x is >= 0x10009, i.e. |x| >= 2^10
  307. // (NaN and inf have already returned above)
  308. { .mmi
  309. ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16
  310. cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009
  311. nop.i 999 ;;
  312. }
  313. { .mmb
  314. ldfpd tan_P4,tan_P5 = [tan_AD],16
  315. nop.m 999
  316. (p7) br.cond.spnt TAN_DBX ;; // large argument: take the slower path
  317. }
  318. { .mmi
  319. ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16
  320. nop.m 999
  321. nop.i 999 ;;
  322. }
  323. // TAN_NFLOAT = Round_Int_Nearest(tan_W)
  324. { .mfi
  325. ldfpd tan_P6,tan_P7 = [tan_AD],16
  326. fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF // undo the 2^64 scaling, subtract the shifter
  327. nop.i 999 ;;
  328. }
  329. { .mfi
  330. ldfd tan_Q10 = [tan_ADQ]
  331. nop.f 999
  332. nop.i 999 ;;
  333. }
  334. { .mfi
  335. ldfpd tan_P0,tan_P1 = [tan_AD],16
  336. nop.f 999
  337. nop.i 999 ;;
  338. }
  339. { .mfi
  340. getf.sig tan_GR_n = TAN_W_2TO64_RSH // integer N sits in the low bits of the significand
  341. nop.f 999
  342. nop.i 999 ;;
  343. }
  344. // tan_r = -TAN_NFLOAT * tan_Pi_by_2_hi + x
  345. { .mfi
  346. ldfpd tan_P2,tan_P3 = [tan_AD]
  347. fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8
  348. nop.i 999 ;;
  349. }
  350. // p8 ==> even
  351. // p9 ==> odd
  352. { .mmi
  353. and tan_GR_N_odd_even = 0x1, tan_GR_n ;; // low bit of N
  354. nop.m 999
  355. cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;;
  356. }
  357. // tan_r = tan_r - TAN_NFLOAT * tan_Pi_by_2_lo
  358. { .mfi
  359. nop.m 999
  360. fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
  361. nop.i 999 ;;
  362. }
  363. { .mfi
  364. nop.m 999
  365. fma.s1 tan_rsq = tan_r, tan_r, f0 // rsq = r^2
  366. nop.i 999 ;;
  367. }
  368. { .mfi
  369. nop.m 999
  370. (p9) frcpa.s1 tan_y0, p10 = f1,tan_r // odd N: y0 ~ 1/r; p10 is not tested again in this routine
  371. nop.i 999 ;;
  372. }
  373. { .mfi
  374. nop.m 999
  375. (p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 // v18 = P14 + P15*rsq
  376. nop.i 999
  377. }
  378. { .mfi
  379. nop.m 999
  380. (p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 // v4 = P0 + P1*rsq
  381. nop.i 999 ;;
  382. }
  383. { .mfi
  384. nop.m 999
  385. (p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 // v16 = P12 + P13*rsq
  386. nop.i 999
  387. }
  388. { .mfi
  389. nop.m 999
  390. (p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 // v17 = rsq^2
  391. nop.i 999 ;;
  392. }
  393. { .mfi
  394. nop.m 999
  395. (p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 // v12 = P8 + P9*rsq
  396. nop.i 999
  397. }
  398. { .mfi
  399. nop.m 999
  400. (p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 // v13 = P10 + P11*rsq
  401. nop.i 999 ;;
  402. }
  403. { .mfi
  404. nop.m 999
  405. (p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 // v7 = P4 + P5*rsq
  406. nop.i 999
  407. }
  408. { .mfi
  409. nop.m 999
  410. (p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 // v8 = P6 + P7*rsq
  411. nop.i 999 ;;
  412. }
  413. { .mfi
  414. nop.m 999
  415. (p9) fnma.s1 tan_d = tan_r, tan_y0, f1 // d = 1 - r*y0
  416. nop.i 999
  417. }
  418. { .mfi
  419. nop.m 999
  420. (p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 // v5 = P2 + P3*rsq
  421. nop.i 999 ;;
  422. }
  423. { .mfi
  424. nop.m 999
  425. (p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 // z11 = Q8 + Q9*rsq
  426. nop.i 999
  427. }
  428. { .mfi
  429. nop.m 999
  430. (p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 // z12 = rsq^2
  431. nop.i 999 ;;
  432. }
  433. { .mfi
  434. nop.m 999
  435. (p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 // v15 = v16 + rsq^2*v18
  436. nop.i 999
  437. }
  438. { .mfi
  439. nop.m 999
  440. (p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 // z7 = Q4 + Q5*rsq
  441. nop.i 999 ;;
  442. }
  443. { .mfi
  444. nop.m 999
  445. (p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 // v11 = v12 + rsq^2*v13
  446. nop.i 999
  447. }
  448. { .mfi
  449. nop.m 999
  450. (p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 // z8 = Q6 + Q7*rsq
  451. nop.i 999 ;;
  452. }
  453. { .mfi
  454. nop.m 999
  455. (p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 // v14 = rsq^4
  456. nop.i 999
  457. }
  458. { .mfi
  459. nop.m 999
  460. (p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 // z3 = Q0 + Q1*rsq
  461. nop.i 999 ;;
  462. }
  463. { .mfi
  464. nop.m 999
  465. (p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 // v3 = v4 + rsq^2*v5
  466. nop.i 999
  467. }
  468. { .mfi
  469. nop.m 999
  470. (p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 // v6 = v7 + rsq^2*v8
  471. nop.i 999 ;;
  472. }
  473. { .mfi
  474. nop.m 999
  475. (p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 // y1 = y0 + y0*d
  476. nop.i 999
  477. }
  478. { .mfi
  479. nop.m 999
  480. (p9) fma.s1 tan_dsq = tan_d, tan_d, f0 // dsq = d^2
  481. nop.i 999 ;;
  482. }
  483. { .mfi
  484. nop.m 999
  485. (p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 // z10 = z11 + Q10*rsq^2
  486. nop.i 999
  487. }
  488. { .mfi
  489. nop.m 999
  490. (p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 // z9 = rsq^4
  491. nop.i 999 ;;
  492. }
  493. { .mfi
  494. nop.m 999
  495. (p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 // z4 = Q2 + Q3*rsq
  496. nop.i 999
  497. }
  498. { .mfi
  499. nop.m 999
  500. (p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 // z6 = z7 + rsq^2*z8
  501. nop.i 999 ;;
  502. }
  503. { .mfi
  504. nop.m 999
  505. (p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 // v10 = v11 + rsq^4*v15
  506. nop.i 999 ;;
  507. }
  508. { .mfi
  509. nop.m 999
  510. (p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 // y2 = y0 + y1*d
  511. nop.i 999
  512. }
  513. { .mfi
  514. nop.m 999
  515. (p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d // d4 = d + d^4
  516. nop.i 999 ;;
  517. }
  518. { .mfi
  519. nop.m 999
  520. (p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 // v2 = v3 + rsq^4*v6
  521. nop.i 999
  522. }
  523. { .mfi
  524. nop.m 999
  525. (p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 // v9 = rsq^8
  526. nop.i 999 ;;
  527. }
  528. { .mfi
  529. nop.m 999
  530. (p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 // z2 = z3 + rsq^2*z4
  531. nop.i 999
  532. }
  533. { .mfi
  534. nop.m 999
  535. (p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 // z5 = z6 + rsq^4*z10
  536. nop.i 999 ;;
  537. }
  538. { .mfi
  539. nop.m 999
  540. (p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 // inv_r = y0 + d4*y2, refined 1/r
  541. nop.i 999
  542. }
  543. { .mfi
  544. nop.m 999
  545. (p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 // rcube = r^3
  546. nop.i 999 ;;
  547. }
  548. { .mfi
  549. nop.m 999
  550. (p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 // v1 = v2 + rsq^8*v10, full P polynomial in rsq
  551. nop.i 999
  552. }
  553. { .mfi
  554. nop.m 999
  555. (p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 // z1 = z2 + rsq^4*z5, full Q polynomial in rsq
  556. nop.i 999 ;;
  557. }
  558. { .mfi
  559. nop.m 999
  560. (p8) fma.s.s0 f8 = tan_v1, tan_rcube, tan_r // even N: result = r + r^3*v1, rounded to single
  561. nop.i 999
  562. }
  563. { .mfb
  564. nop.m 999
  565. (p9) fms.s.s0 f8 = tan_r, tan_z1, tan_inv_r // odd N: result = r*z1 - 1/r, rounded to single
  566. br.ret.sptk b0 ;;
  567. }
  568. .endp tanf#
  569. .proc __libm_callout // large-argument path: call __libm_tan, then round its result to single
  570. __libm_callout:
  571. TAN_DBX: // branch target used by tanf when the exponent of x is >= 0x10009
  572. .prologue
  573. { .mfi
  574. nop.m 0
  575. fmerge.s f9 = f0,f0 // f9 = +0.0; presumably an extra input expected by __libm_tan - TODO confirm
  576. .save ar.pfs,GR_SAVE_PFS
  577. mov GR_SAVE_PFS=ar.pfs // keep ar.pfs in a local across the call
  578. }
  579. ;;
  580. { .mfi
  581. mov GR_SAVE_GP=gp // keep gp in a local across the call
  582. nop.f 0
  583. .save b0, GR_SAVE_B0
  584. mov GR_SAVE_B0=b0 // keep the return address; br.call below overwrites b0
  585. }
  586. .body
  587. { .mfb
  588. nop.m 999
  589. nop.f 999
  590. br.call.sptk.many b0=__libm_tan# ;; // input x is still in f8
  591. }
  592. { .mfi
  593. mov gp = GR_SAVE_GP // restore gp
  594. fnorm.s f8 = f8 // round the returned value in f8 to single precision
  595. mov b0 = GR_SAVE_B0 // restore the return address of tanf's caller
  596. }
  597. ;;
  598. { .mib
  599. nop.m 999
  600. mov ar.pfs = GR_SAVE_PFS // restore ar.pfs
  601. br.ret.sptk b0 // return to tanf's caller
  602. ;;
  603. }
  604. .endp __libm_callout
  605. .type __libm_tan#,@function
  606. .global __libm_tan#