Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

736 lines
17 KiB

  1. .file "tan.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00: Initial version
  29. // 4/04/00 Unwind support added
  30. // 12/27/00 Improved speed
  31. // 02/21/01 Updated to call tanl
  32. //
  33. // API
  34. //==============================================================
  35. // double tan( double x);
  36. //
  37. // Overview of operation
  38. //==============================================================
  39. // If the input value in radians is |x| >= 1.xxxxx 2^10 call the
  40. // older slower version.
  41. //
  42. // The new algorithm is used when |x| <= 1.xxxxx 2^9.
  43. //
  44. // Represent the input X as Nfloat * pi/2 + r
  45. // where r can be negative and |r| <= pi/4
  46. //
  47. // tan_W = x * 2/pi
  48. // Nfloat = round_int(tan_W)
  49. //
  50. // tan_r = x - Nfloat * (pi/2)_hi
  51. // tan_r = tan_r - Nfloat * (pi/2)_lo
  52. //
  53. // We have two paths: p8, when Nfloat is even, and p9, when Nfloat is odd.
  54. // p8: tan(X) = tan(r)
  55. // p9: tan(X) = -cot(r)
  56. //
  57. // Each is evaluated as a series. The p9 path requires 1/r.
  58. //
  59. // The coefficients used in the series are stored in a table as
  60. // are the pi constants.
  61. //
  62. // Registers used
  63. //==============================================================
  64. //
  65. // predicate registers used:
  66. // p6-10
  67. //
  68. // floating-point registers used:
  69. // f10-15, f32-105
  70. // f8, input
  71. //
  72. // general registers used
  73. // r14-18, r32-43
  74. //
  75. // Assembly macros: symbolic names for the FP and general registers used by tan
  76. //==============================================================
  77. TAN_INV_PI_BY_2_2TO64 = f10 // significand of 2/pi written with setf.sig, i.e. 2/pi scaled by 2^64
  78. TAN_RSHF_2TO64 = f11 // 1.1000 * 2^(63+63+1): right-shift constant in the 2^64-scaled domain
  79. TAN_2TOM64 = f12 // 2^-64, removes the 2^64 scaling
  80. TAN_RSHF = f13 // 1.1000 * 2^63, the right-shift constant
  81. TAN_W_2TO64_RSH = f14 // x * (scaled 2/pi) + TAN_RSHF_2TO64
  82. TAN_NFLOAT = f15 // TAN_W_2TO64_RSH * 2^-64 - TAN_RSHF: x*2/pi rounded to an integer, as a float
  83. tan_Inv_Pi_by_2 = f32 // not referenced in the visible code
  84. tan_Pi_by_2_hi = f33 // high part of pi/2 (first entry of double_tan_constants)
  85. tan_Pi_by_2_lo = f34 // low part of pi/2 (first entry of double_Q_tan_constants)
  86. tan_P0 = f35 // P0..P15: series coefficients used on the even-N (p8) path
  87. tan_P1 = f36
  88. tan_P2 = f37
  89. tan_P3 = f38
  90. tan_P4 = f39
  91. tan_P5 = f40
  92. tan_P6 = f41
  93. tan_P7 = f42
  94. tan_P8 = f43
  95. tan_P9 = f44
  96. tan_P10 = f45
  97. tan_P11 = f46
  98. tan_P12 = f47
  99. tan_P13 = f48
  100. tan_P14 = f49
  101. tan_P15 = f50
  102. tan_Q0 = f51 // Q0..Q10: series coefficients used on the odd-N (p9) path
  103. tan_Q1 = f52
  104. tan_Q2 = f53
  105. tan_Q3 = f54
  106. tan_Q4 = f55
  107. tan_Q5 = f56
  108. tan_Q6 = f57
  109. tan_Q7 = f58
  110. tan_Q8 = f59
  111. tan_Q9 = f60
  112. tan_Q10 = f61
  113. tan_r = f62 // reduced argument r = x - N * pi/2
  114. tan_rsq = f63 // r^2
  115. tan_rcube = f64 // r^3
  116. tan_v18 = f65 // tan_v1..tan_v18: partial results of the P polynomial (p8 path)
  117. tan_v16 = f66
  118. tan_v17 = f67
  119. tan_v12 = f68
  120. tan_v13 = f69
  121. tan_v7 = f70
  122. tan_v8 = f71
  123. tan_v4 = f72
  124. tan_v5 = f73
  125. tan_v15 = f74
  126. tan_v11 = f75
  127. tan_v14 = f76
  128. tan_v3 = f77
  129. tan_v6 = f78
  130. tan_v10 = f79
  131. tan_v2 = f80
  132. tan_v9 = f81
  133. tan_v1 = f82
  134. tan_int_Nfloat = f83 // not referenced in the visible code
  135. tan_Nfloat = f84 // appears only in comments in the visible code
  136. tan_NORM_f8 = f85 // fnorm of the input x
  137. tan_W = f86 // appears only in comments in the visible code
  138. tan_y0 = f87 // frcpa approximation of 1/r (p9 path)
  139. tan_d = f88 // d = 1 - r*y0, error of the approximation
  140. tan_y1 = f89 // y0 + y0*d
  141. tan_dsq = f90 // d^2
  142. tan_y2 = f91 // y0 + y1*d
  143. tan_d4 = f92 // d + d^4
  144. tan_inv_r = f93 // refined 1/r = y0 + d4*y2
  145. tan_z1 = f94 // tan_z1..tan_z12: partial results of the Q polynomial (p9 path)
  146. tan_z2 = f95
  147. tan_z3 = f96
  148. tan_z4 = f97
  149. tan_z5 = f98
  150. tan_z6 = f99
  151. tan_z7 = f100
  152. tan_z8 = f101
  153. tan_z9 = f102
  154. tan_z10 = f103
  155. tan_z11 = f104
  156. tan_z12 = f105
  157. ///////////////////// general-register aliases /////////////////////
  158. tan_GR_sig_inv_pi_by_2 = r14 // 64-bit significand of 2/pi
  159. tan_GR_rshf_2to64 = r15 // bit pattern of the double 1.1000 * 2^(63+63+1)
  160. tan_GR_exp_2tom64 = r16 // biased exponent of 2^-64
  161. tan_GR_n = r17 // significand of TAN_W_2TO64_RSH; its low bits hold the integer N
  162. tan_GR_rshf = r18 // bit pattern of the double 1.1000 * 2^63
  163. tan_AD = r33 // walking pointer into double_tan_constants
  164. tan_GR_10009 = r34 // exponent threshold 0x10009 for the slow-path test
  165. tan_GR_17_ones = r35 // mask 0x1ffff
  166. tan_GR_N_odd_even = r36 // low bit of N
  167. tan_GR_N = r37 // not referenced in the visible code
  168. tan_signexp = r38 // sign+exponent field of the normalized input
  169. tan_exp = r39 // tan_signexp masked with tan_GR_17_ones
  170. tan_ADQ = r40 // walking pointer into double_Q_tan_constants
  171. GR_SAVE_PFS = r41 // holds ar.pfs across the __libm_tan call
  172. GR_SAVE_B0 = r42 // holds b0 across the __libm_tan call
  173. GR_SAVE_GP = r43 // holds gp across the __libm_tan call
  174. .data
  175. .align 16
  176. double_tan_constants: // P table; entries are laid out in the order tan loads them (tan_AD advances by 16 per load)
  177. // data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi (disabled: tan builds this significand with movl instead)
  178. data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi (significand, then sign/exponent; read with ldfe)
  179. data8 0xBEEA54580DDEA0E1 // P14
  180. data8 0x3ED3021ACE749A59 // P15
  181. data8 0xBEF312BD91DC8DA1 // P12
  182. data8 0x3EFAE9AFC14C5119 // P13
  183. data8 0x3F2F342BF411E769 // P8
  184. data8 0x3F1A60FC9F3B0227 // P9
  185. data8 0x3EFF246E78E5E45B // P10
  186. data8 0x3F01D9D2E782875C // P11
  187. data8 0x3F8226E34C4499B6 // P4
  188. data8 0x3F6D6D3F12C236AC // P5
  189. data8 0x3F57DA1146DCFD8B // P6
  190. data8 0x3F43576410FE3D75 // P7
  191. data8 0x3FD5555555555555 // P0
  192. data8 0x3FC11111111111C2 // P1
  193. data8 0x3FABA1BA1BA0E850 // P2
  194. data8 0x3F9664F4886725A7 // P3
  195. double_Q_tan_constants: // Q table; tan reaches it as a fixed offset from double_tan_constants
  196. data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo (significand, then sign/exponent; read with ldfe)
  197. data8 0x3E223A73BA576E48 // Q8
  198. data8 0x3DF54AD8D1F2CA43 // Q9
  199. data8 0x3EF66A8EE529A6AA // Q4
  200. data8 0x3EC2281050410EE6 // Q5
  201. data8 0x3E8D6BB992CC3CF5 // Q6
  202. data8 0x3E57F88DE34832E4 // Q7
  203. data8 0x3FD5555555555555 // Q0
  204. data8 0x3F96C16C16C16DB8 // Q1
  205. data8 0x3F61566ABBFFB489 // Q2
  206. data8 0x3F2BBD77945C1733 // Q3
  207. data8 0x3D927FB33E2B0E04 // Q10 (last entry; read with a single ldfd)
  208. .align 32
  209. .global tan# // make the tan entry point visible to other modules
  210. // tan(double x): the argument arrives in f8 and the result is returned in f8.
  211. .section .text
  212. .proc tan#
  213. .align 32
  214. tan:
  215. // The initial fnorm will take any unmasked faults and
  216. // normalize any single/double unorms
  217. { .mlx
  218. alloc r32=ar.pfs,1,11,0,0 // 1 input + 11 locals (r32-r43), no outputs
  219. movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi
  220. }
  221. { .mlx
  222. addl tan_AD = @ltoff(double_tan_constants), gp // gp-relative linkage-table slot for the P table
  223. movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1)
  224. }
  225. ;;
  226. { .mfi
  227. ld8 tan_AD = [tan_AD] // tan_AD -> double_tan_constants
  228. fnorm tan_NORM_f8 = f8 // normalized copy of x; f8 itself is not modified here
  229. mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64
  230. }
  231. { .mlx
  232. nop.m 999
  233. movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
  234. }
  235. ;;
  236. // Form two constants we need
  237. // 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand
  238. // 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand
  239. { .mmi
  240. setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2
  241. setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64
  242. mov tan_GR_17_ones = 0x1ffff ;; // mask used to isolate the exponent field below
  243. }
  244. // Form another constant
  245. // 2^-64 for scaling Nfloat
  246. // (the same bundle points tan_ADQ at the Q table, a fixed offset from the P table)
  247. { .mmf
  248. setf.exp TAN_2TOM64 = tan_GR_exp_2tom64
  249. adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD
  250. fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
  251. }
  252. ;;
  253. // Form the last constant
  254. // 1.1000...000 * 2^63, the right shift constant
  255. // (the same bundle starts the table loads: pi/2 hi, then tan_AD advances by 16)
  256. { .mmf
  257. setf.d TAN_RSHF = tan_GR_rshf
  258. ldfe tan_Pi_by_2_hi = [tan_AD],16
  259. fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf
  260. }
  261. ;;
  262. { .mfb
  263. ldfe tan_Pi_by_2_lo = [tan_ADQ],16
  264. fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan
  265. (p6) br.ret.spnt b0 ;; // Exit for x=0 (f8 still holds x, so tan(+-0) returns x)
  266. }
  267. { .mfi
  268. ldfpd tan_P14,tan_P15 = [tan_AD],16
  269. (p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf
  270. mov tan_GR_10009 = 0x10009 // biased exponent of 2^10 (0xffff + 10)
  271. }
  272. { .mib
  273. ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16
  274. nop.i 999
  275. (p7) br.ret.spnt b0 ;; // Exit for x=inf
  276. }
  277. { .mfi
  278. ldfpd tan_P12,tan_P13 = [tan_AD],16
  279. (p8) fma.d f8=f8,f1,f8 // Set qnan if x=nan
  280. nop.i 999
  281. }
  282. { .mib
  283. ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16
  284. nop.i 999
  285. (p8) br.ret.spnt b0 ;; // Exit for x=nan
  286. }
  287. { .mmi
  288. getf.exp tan_signexp = tan_NORM_f8 // sign and exponent field of normalized x
  289. ldfpd tan_P8,tan_P9 = [tan_AD],16
  290. nop.i 999 ;;
  291. }
  292. // Multiply x by scaled 2/pi and add large const to shift integer part of W to
  293. // rightmost bits of significand
  294. { .mfi
  295. ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16
  296. fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64
  297. nop.i 999 ;;
  298. }
  299. { .mmi
  300. ldfpd tan_P10,tan_P11 = [tan_AD],16
  301. nop.m 999
  302. and tan_exp = tan_GR_17_ones, tan_signexp ;; // keep only the bits selected by the 0x1ffff mask (drops the sign)
  303. }
  304. // p7 is true if we must call DBX TAN
  305. // p7 is true if f8 exp is >= 0x10009, i.e. |x| >= 2^10
  306. // (NAN and inf would also satisfy this, but they have already returned above)
  307. { .mmi
  308. ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16
  309. cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009 // p7 = (exponent >= 0x10009)
  310. nop.i 999 ;;
  311. }
  312. { .mmb
  313. ldfpd tan_P4,tan_P5 = [tan_AD],16
  314. nop.m 999
  315. (p7) br.cond.spnt TAN_DBX ;; // large |x|: take the __libm_tan call-out path
  316. }
  317. { .mmi
  318. ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16
  319. nop.m 999
  320. nop.i 999 ;;
  321. }
  322. // TAN_NFLOAT = Round_Int_Nearest(tan_W)
  323. { .mfi
  324. ldfpd tan_P6,tan_P7 = [tan_AD],16
  325. fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF // undo the 2^64 scaling, subtract the shifter
  326. nop.i 999 ;;
  327. }
  328. { .mfi
  329. ldfd tan_Q10 = [tan_ADQ] // last Q entry, no post-increment
  330. nop.f 999
  331. nop.i 999 ;;
  332. }
  333. { .mfi
  334. ldfpd tan_P0,tan_P1 = [tan_AD],16
  335. nop.f 999
  336. nop.i 999 ;;
  337. }
  338. { .mfi
  339. getf.sig tan_GR_n = TAN_W_2TO64_RSH // integer N sits in the low bits of this significand
  340. nop.f 999
  341. nop.i 999 ;;
  342. }
  343. // tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x
  344. { .mfi
  345. ldfpd tan_P2,tan_P3 = [tan_AD] // last P pair, no post-increment
  346. fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8
  347. nop.i 999 ;;
  348. }
  349. // p8 ==> even: tan(x) = tan(r), evaluated with the P series
  350. // p9 ==> odd: tan(x) = -cot(r), evaluated with the Q series and 1/r
  351. { .mmi
  352. and tan_GR_N_odd_even = 0x1, tan_GR_n ;; // low bit of N
  353. nop.m 999
  354. cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;;
  355. }
  356. // tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo
  357. { .mfi
  358. nop.m 999
  359. fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
  360. nop.i 999 ;;
  361. }
  362. { .mfi
  363. nop.m 999
  364. fma.s1 tan_rsq = tan_r, tan_r, f0 // rsq = r^2
  365. nop.i 999 ;;
  366. }
  367. { .mfi
  368. nop.m 999
  369. (p9) frcpa.s1 tan_y0, p10 = f1,tan_r // y0 ~ 1/r, starting point for the refinement below
  370. nop.i 999 ;;
  371. }
  372. { .mfi
  373. nop.m 999
  374. (p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14 // v18 = P14 + P15*rsq
  375. nop.i 999
  376. }
  377. { .mfi
  378. nop.m 999
  379. (p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0 // v4 = P0 + P1*rsq
  380. nop.i 999 ;;
  381. }
  382. { .mfi
  383. nop.m 999
  384. (p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12 // v16 = P12 + P13*rsq
  385. nop.i 999
  386. }
  387. { .mfi
  388. nop.m 999
  389. (p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0 // v17 = rsq^2
  390. nop.i 999 ;;
  391. }
  392. { .mfi
  393. nop.m 999
  394. (p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8 // v12 = P8 + P9*rsq
  395. nop.i 999
  396. }
  397. { .mfi
  398. nop.m 999
  399. (p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10 // v13 = P10 + P11*rsq
  400. nop.i 999 ;;
  401. }
  402. { .mfi
  403. nop.m 999
  404. (p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4 // v7 = P4 + P5*rsq
  405. nop.i 999
  406. }
  407. { .mfi
  408. nop.m 999
  409. (p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6 // v8 = P6 + P7*rsq
  410. nop.i 999 ;;
  411. }
  412. { .mfi
  413. nop.m 999
  414. (p9) fnma.s1 tan_d = tan_r, tan_y0, f1 // d = 1 - r*y0
  415. nop.i 999
  416. }
  417. { .mfi
  418. nop.m 999
  419. (p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2 // v5 = P2 + P3*rsq
  420. nop.i 999 ;;
  421. }
  422. { .mfi
  423. nop.m 999
  424. (p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8 // z11 = Q8 + Q9*rsq
  425. nop.i 999
  426. }
  427. { .mfi
  428. nop.m 999
  429. (p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0 // z12 = rsq^2
  430. nop.i 999 ;;
  431. }
  432. { .mfi
  433. nop.m 999
  434. (p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16 // v15 = v16 + rsq^2*v18 (P12..P15)
  435. nop.i 999
  436. }
  437. { .mfi
  438. nop.m 999
  439. (p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4 // z7 = Q4 + Q5*rsq
  440. nop.i 999 ;;
  441. }
  442. { .mfi
  443. nop.m 999
  444. (p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12 // v11 = v12 + rsq^2*v13 (P8..P11)
  445. nop.i 999
  446. }
  447. { .mfi
  448. nop.m 999
  449. (p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6 // z8 = Q6 + Q7*rsq
  450. nop.i 999 ;;
  451. }
  452. { .mfi
  453. nop.m 999
  454. (p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0 // v14 = rsq^4
  455. nop.i 999
  456. }
  457. { .mfi
  458. nop.m 999
  459. (p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0 // z3 = Q0 + Q1*rsq
  460. nop.i 999 ;;
  461. }
  462. { .mfi
  463. nop.m 999
  464. (p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4 // v3 = v4 + rsq^2*v5 (P0..P3)
  465. nop.i 999
  466. }
  467. { .mfi
  468. nop.m 999
  469. (p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7 // v6 = v7 + rsq^2*v8 (P4..P7)
  470. nop.i 999 ;;
  471. }
  472. { .mfi
  473. nop.m 999
  474. (p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0 // y1 = y0 + y0*d
  475. nop.i 999
  476. }
  477. { .mfi
  478. nop.m 999
  479. (p9) fma.s1 tan_dsq = tan_d, tan_d, f0 // dsq = d^2
  480. nop.i 999 ;;
  481. }
  482. { .mfi
  483. nop.m 999
  484. (p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11 // z10 = z11 + rsq^2*Q10 (Q8..Q10)
  485. nop.i 999
  486. }
  487. { .mfi
  488. nop.m 999
  489. (p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0 // z9 = rsq^4
  490. nop.i 999 ;;
  491. }
  492. { .mfi
  493. nop.m 999
  494. (p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2 // z4 = Q2 + Q3*rsq
  495. nop.i 999
  496. }
  497. { .mfi
  498. nop.m 999
  499. (p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7 // z6 = z7 + rsq^2*z8 (Q4..Q7)
  500. nop.i 999 ;;
  501. }
  502. { .mfi
  503. nop.m 999
  504. (p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11 // v10 = v11 + rsq^4*v15 (P8..P15)
  505. nop.i 999 ;;
  506. }
  507. { .mfi
  508. nop.m 999
  509. (p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0 // y2 = y0 + y1*d
  510. nop.i 999
  511. }
  512. { .mfi
  513. nop.m 999
  514. (p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d // d4 = d + d^4
  515. nop.i 999 ;;
  516. }
  517. { .mfi
  518. nop.m 999
  519. (p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3 // v2 = v3 + rsq^4*v6 (P0..P7)
  520. nop.i 999
  521. }
  522. { .mfi
  523. nop.m 999
  524. (p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0 // v9 = rsq^8
  525. nop.i 999 ;;
  526. }
  527. { .mfi
  528. nop.m 999
  529. (p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3 // z2 = z3 + rsq^2*z4 (Q0..Q3)
  530. nop.i 999
  531. }
  532. { .mfi
  533. nop.m 999
  534. (p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6 // z5 = z6 + rsq^4*z10 (Q4..Q10)
  535. nop.i 999 ;;
  536. }
  537. { .mfi
  538. nop.m 999
  539. (p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0 // inv_r = y0 + d4*y2 = y0*(1 + d + ... + d^6)
  540. nop.i 999
  541. }
  542. { .mfi
  543. nop.m 999
  544. (p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0 // rcube = r^3
  545. nop.i 999 ;;
  546. }
  547. { .mfi
  548. nop.m 999
  549. (p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2 // v1 = v2 + rsq^8*v10 = P0 + P1*rsq + ... + P15*rsq^15
  550. nop.i 999
  551. }
  552. { .mfi
  553. nop.m 999
  554. (p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2 // z1 = z2 + rsq^4*z5 = Q0 + Q1*rsq + ... + Q10*rsq^10
  555. nop.i 999 ;;
  556. }
  557. { .mfi
  558. nop.m 999
  559. (p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r // even N: result = r + r^3 * v1
  560. nop.i 999
  561. }
  562. { .mfb
  563. nop.m 999
  564. (p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r // odd N: result = r*z1 - 1/r
  565. br.ret.sptk b0 ;;
  566. }
  567. .endp tan#
  568. .proc __libm_callout // out-of-line path: tan branches here (TAN_DBX) when the exponent of x is >= 0x10009
  569. __libm_callout:
  570. TAN_DBX:
  571. .prologue
  572. { .mfi
  573. nop.m 0
  574. fmerge.s f9 = f0,f0 // f9 = +0.0; presumably a second argument/flag read by __libm_tan -- TODO confirm against its source
  575. .save ar.pfs,GR_SAVE_PFS // unwind info: ar.pfs is preserved in GR_SAVE_PFS
  576. mov GR_SAVE_PFS=ar.pfs
  577. }
  578. ;;
  579. { .mfi
  580. mov GR_SAVE_GP=gp // keep gp so it can be restored after the call
  581. nop.f 0
  582. .save b0, GR_SAVE_B0 // unwind info: return address is preserved in GR_SAVE_B0
  583. mov GR_SAVE_B0=b0
  584. }
  585. .body
  586. { .mfb
  587. nop.m 999
  588. nop.f 999
  589. (p0) br.call.sptk.many b0=__libm_tan# ;; // x is still in f8; the result is read back from f8 below
  590. }
  591. { .mfi
  592. mov gp = GR_SAVE_GP // restore gp
  593. fnorm.d f8 = f8 // round the returned value to double precision
  594. mov b0 = GR_SAVE_B0 // restore the return address to tan's caller
  595. }
  596. ;;
  597. { .mib
  598. nop.m 999
  599. mov ar.pfs = GR_SAVE_PFS // restore the frame marker saved in the prologue
  600. br.ret.sptk b0 // return to the caller of tan
  601. ;;
  602. }
  603. .endp __libm_callout
  604. .type __libm_tan#,@function // __libm_tan is a function symbol; no definition appears in the visible code
  605. .global __libm_tan#