Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3312 lines
76 KiB

  1. .file "libm_tan.s"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. //*********************************************************************
  27. //
  28. // History:
  29. // 02/02/00 Initial Version
  30. // 4/04/00 Unwind support added
  31. //
  32. //*********************************************************************
  33. //
  34. // Function: tan(x) = tangent(x), for double precision x values
  35. //
  36. //*********************************************************************
  37. //
  38. // Accuracy: Very accurate for double-precision values
  39. //
  40. //*********************************************************************
  41. //
  42. // Resources Used:
  43. //
  44. // Floating-Point Registers: f8 (Input and Return Value)
  45. // f9-f15
  46. // f32-f112
  47. //
  48. // General Purpose Registers:
  49. // r32-r48
  50. // r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
  51. //
  52. // Predicate Registers: p6-p15
  53. //
  54. //*********************************************************************
  55. //
  56. // IEEE Special Conditions:
  57. //
  58. // Denormal fault raised on denormal inputs
  59. // Overflow exceptions do not occur
  60. // Underflow exceptions raised when appropriate for tan
  61. // (No specialized error handling for this routine)
  62. // Inexact raised when appropriate by algorithm
  63. //
  64. // tan(SNaN) = QNaN
  65. // tan(QNaN) = QNaN
  66. // tan(inf) = QNaN
  67. // tan(+/-0) = +/-0
  68. //
  69. //*********************************************************************
  70. //
  71. // Mathematical Description
  72. //
  73. // We consider the computation of FPTAN of Arg. Now, given
  74. //
  75. // Arg = N pi/2 + alpha, |alpha| <= pi/4,
  76. //
  77. // basic mathematical relationship shows that
  78. //
  79. // tan( Arg ) = tan( alpha ) if N is even;
  80. // = -cot( alpha ) otherwise.
  81. //
  82. // The value of alpha is obtained by argument reduction and
  83. // represented by two working precision numbers r and c where
  84. //
  85. // alpha = r + c accurately.
  86. //
  87. // The reduction method is described in a previous write up.
  88. // The argument reduction scheme identifies 4 cases. For Cases 2
  89. // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
  90. // computed very easily by 2 or 3 terms of the Taylor series
  91. // expansion as follows:
  92. //
  93. // Case 2:
  94. // -------
  95. //
  96. // tan(r + c) = r + c + r^3/3 ...accurately
  97. // -cot(r + c) = -1/(r+c) + r/3 ...accurately
  98. //
  99. // Case 4:
  100. // -------
  101. //
  102. // tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately
  103. // -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
  104. //
  105. //
  106. // The only cases left are Cases 1 and 3 of the argument reduction
  107. // procedure. These two cases will be merged since after the
  108. // argument is reduced in either cases, we have the reduced argument
  109. // represented as r + c and that the magnitude |r + c| is not small
  110. // enough to allow the usage of a very short approximation.
  111. //
  112. // The greatest challenge of this task is that the second terms of
  113. // the Taylor series for tan(r) and -cot(r)
  114. //
  115. // r + r^3/3 + 2 r^5/15 + ...
  116. //
  117. // and
  118. //
  119. // -1/r + r/3 + r^3/45 + ...
  120. //
  121. // are not very small when |r| is close to pi/4 and the rounding
  122. // errors will be a concern if simple polynomial accumulation is
  123. // used. When |r| < 2^(-2), however, the second terms will be small
  124. // enough (5 bits or so of right shift) that a normal Horner
  125. // recurrence suffices. Hence there are two cases that we consider
  126. // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
  127. //
  128. // Case small_r: |r| < 2^(-2)
  129. // --------------------------
  130. //
  131. // Since Arg = N pi/4 + r + c accurately, we have
  132. //
  133. // tan(Arg) = tan(r+c) for N even,
  134. // = -cot(r+c) otherwise.
  135. //
  136. // Here for this case, both tan(r) and -cot(r) can be approximated
  137. // by simple polynomials:
  138. //
  139. // tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
  140. // -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
  141. //
  142. // accurately. Since |r| is relatively small, tan(r+c) and
  143. // -cot(r+c) can be accurately approximated by replacing r with
  144. // r+c only in the first two terms of the corresponding polynomials.
  145. //
  146. // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
  147. // almost 64 sig. bits, thus
  148. //
  149. // P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately.
  150. //
  151. // Hence,
  152. //
  153. // tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
  154. // + c*(1 + r^2)
  155. //
  156. // -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
  157. // + Q1_1*c
  158. //
  159. //
  160. // Case normal_r: 2^(-2) <= |r| <= pi/4
  161. // ------------------------------------
  162. //
  163. // This case is more likely than the previous one if one considers
  164. // r to be uniformly distributed in [-pi/4 pi/4].
  165. //
  166. // The required calculation is either
  167. //
  168. // tan(r + c) = tan(r) + correction, or
  169. // -cot(r + c) = -cot(r) + correction.
  170. //
  171. // Specifically,
  172. //
  173. // tan(r + c) = tan(r) + c tan'(r) + O(c^2)
  174. // = tan(r) + c sec^2(r) + O(c^2)
  175. // = tan(r) + c SEC_sq ...accurately
  176. // as long as SEC_sq approximates sec^2(r)
  177. // to, say, 5 bits or so.
  178. //
  179. // Similarly,
  180. //
  181. // -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
  182. // = -cot(r) + c csc^2(r) + O(c^2)
  183. // = -cot(r) + c CSC_sq ...accurately
  184. // as long as CSC_sq approximates csc^2(r)
  185. // to, say, 5 bits or so.
  186. //
  187. // We therefore concentrate on accurately calculating tan(r) and
  188. // cot(r) for a working-precision number r, |r| <= pi/4 to within
  189. // 0.1% or so.
  190. //
  191. // We will employ a table-driven approach. Let
  192. //
  193. // r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
  194. // = sgn_r * ( B + x )
  195. //
  196. // where
  197. //
  198. // B = 2^k * 1.b_1 b_2 ... b_5 1
  199. // x = |r| - B
  200. //
  201. // Now,
  202. // tan(B) + tan(x)
  203. // tan( B + x ) = ------------------------
  204. // 1 - tan(B)*tan(x)
  205. //
  206. // / \
  207. // | tan(B) + tan(x) |
  208. // = tan(B) + | ------------------------ - tan(B) |
  209. // | 1 - tan(B)*tan(x) |
  210. // \ /
  211. //
  212. // sec^2(B) * tan(x)
  213. // = tan(B) + ------------------------
  214. // 1 - tan(B)*tan(x)
  215. //
  216. // (1/[sin(B)*cos(B)]) * tan(x)
  217. // = tan(B) + --------------------------------
  218. // cot(B) - tan(x)
  219. //
  220. //
  221. // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
  222. // calculated beforehand and stored in a table. Since
  223. //
  224. // |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2)
  225. //
  226. // a very short polynomial will be sufficient to approximate tan(x)
  227. // accurately. The details involved in computing the last expression
  228. // will be given in the next section on algorithm description.
  229. //
  230. //
  231. // Now, we turn to the case where cot( B + x ) is needed.
  232. //
  233. //
  234. // 1 - tan(B)*tan(x)
  235. // cot( B + x ) = ------------------------
  236. // tan(B) + tan(x)
  237. //
  238. // / \
  239. // | 1 - tan(B)*tan(x) |
  240. // = cot(B) + | ----------------------- - cot(B) |
  241. // | tan(B) + tan(x) |
  242. // \ /
  243. //
  244. // [tan(B) + cot(B)] * tan(x)
  245. // = cot(B) - ----------------------------
  246. // tan(B) + tan(x)
  247. //
  248. // (1/[sin(B)*cos(B)]) * tan(x)
  249. // = cot(B) - --------------------------------
  250. // tan(B) + tan(x)
  251. //
  252. //
  253. // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
  254. // are needed are the same set of values needed in the previous
  255. // case.
  256. //
  257. // Finally, we can put all the ingredients together as follows:
  258. //
  259. // Arg = N * pi/2 + r + c ...accurately
  260. //
  261. // tan(Arg) = tan(r) + correction if N is even;
  262. // = -cot(r) + correction otherwise.
  263. //
  264. // For Cases 2 and 4,
  265. //
  266. // Case 2:
  267. // tan(Arg) = tan(r + c) = r + c + r^3/3 N even
  268. // = -cot(r + c) = -1/(r+c) + r/3 N odd
  269. // Case 4:
  270. // tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even
  271. // = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd
  272. //
  273. //
  274. // For Cases 1 and 3,
  275. //
  276. // Case small_r: |r| < 2^(-2)
  277. //
  278. // tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
  279. // + c*(1 + r^2) N even
  280. //
  281. // = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
  282. // + Q1_1*c N odd
  283. //
  284. // Case normal_r: 2^(-2) <= |r| <= pi/4
  285. //
  286. // tan(Arg) = tan(r) + c * sec^2(r) N even
  287. // = -cot(r) + c * csc^2(r) otherwise
  288. //
  289. // For N even,
  290. //
  291. // tan(Arg) = tan(r) + c*sec^2(r)
  292. // = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
  293. // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
  294. // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
  295. //
  296. // since B approximates |r| to 2^(-6) in relative accuracy.
  297. //
  298. // / (1/[sin(B)*cos(B)]) * tan(x)
  299. // tan(Arg) = sgn_r * | tan(B) + --------------------------------
  300. // \ cot(B) - tan(x)
  301. // \
  302. // + CORR |
  303. // /
  304. // where
  305. //
  306. // CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
  307. //
  308. // For N odd,
  309. //
  310. // tan(Arg) = -cot(r) + c*csc^2(r)
  311. // = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
  312. // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
  313. // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
  314. //
  315. // since B approximates |r| to 2^(-6) in relative accuracy.
  316. //
  317. // / (1/[sin(B)*cos(B)]) * tan(x)
  318. // tan(Arg) = sgn_r * | -cot(B) + --------------------------------
  319. // \ tan(B) + tan(x)
  320. // \
  321. // + CORR |
  322. // /
  323. // where
  324. //
  325. // CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
  326. //
  327. //
  328. // The actual algorithm prescribes how all the mathematical formulas
  329. // are calculated.
  330. //
  331. //
  332. // 2. Algorithmic Description
  333. // ==========================
  334. //
  335. // 2.1 Computation for Cases 2 and 4.
  336. // ----------------------------------
  337. //
  338. // For Case 2, we use two-term polynomials.
  339. //
  340. // For N even,
  341. //
  342. // rsq := r * r
  343. // Result := c + r * rsq * P1_1
  344. // Result := r + Result ...in user-defined rounding
  345. //
  346. // For N odd,
  347. // S_hi := -frcpa(r) ...8 bits
  348. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
  349. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
  350. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
  351. // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
  352. // ...S_hi + S_lo is -1/(r+c) to extra precision
  353. // S_lo := S_lo + Q1_1*r
  354. //
  355. // Result := S_hi + S_lo ...in user-defined rounding
  356. //
  357. // For Case 4, we use three-term polynomials
  358. //
  359. // For N even,
  360. //
  361. // rsq := r * r
  362. // Result := c + r * rsq * (P1_1 + rsq * P1_2)
  363. // Result := r + Result ...in user-defined rounding
  364. //
  365. // For N odd,
  366. // S_hi := -frcpa(r) ...8 bits
  367. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
  368. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
  369. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
  370. // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
  371. // ...S_hi + S_lo is -1/(r+c) to extra precision
  372. // rsq := r * r
  373. // P := Q1_1 + rsq*Q1_2
  374. // S_lo := S_lo + r*P
  375. //
  376. // Result := S_hi + S_lo ...in user-defined rounding
  377. //
  378. //
  379. // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
  380. // the same as those used in the small_r case of Cases 1 and 3
  381. // below.
  382. //
  383. //
  384. // 2.2 Computation for Cases 1 and 3.
  385. // ----------------------------------
  386. // This is further divided into the case of small_r,
  387. // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
  388. // 2^(-2) and pi/4.
  389. //
  390. // Algorithm for the case of small_r
  391. // ---------------------------------
  392. //
  393. // For N even,
  394. // rsq := r * r
  395. // Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
  396. // r_to_the_8 := rsq * rsq
  397. // r_to_the_8 := r_to_the_8 * r_to_the_8
  398. // Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
  399. // CORR := c * ( 1 + rsq )
  400. // Poly := Poly1 + r_to_the_8*Poly2
  401. // Result := r*Poly + CORR
  402. // Result := r + Result ...in user-defined rounding
  403. // ...note that Poly1 and r_to_the_8 can be computed in parallel
  404. // ...with Poly2 (Poly1 is intentionally set to be much
  405. // ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
  406. //
  407. // For N odd,
  408. // S_hi := -frcpa(r) ...8 bits
  409. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
  410. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
  411. // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
  412. // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
  413. // ...S_hi + S_lo is -1/(r+c) to extra precision
  414. // S_lo := S_lo + Q1_1*c
  415. //
  416. // ...S_hi and S_lo are computed in parallel with
  417. // ...the following
  418. // rsq := r*r
  419. // P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
  420. //
  421. // Result := r*P + S_lo
  422. // Result := S_hi + Result ...in user-defined rounding
  423. //
  424. //
  425. // Algorithm for the case of normal_r
  426. // ----------------------------------
  427. //
  428. // Here, we first consider the computation of tan( r + c ). As
  429. // presented in the previous section,
  430. //
  431. // tan( r + c ) = tan(r) + c * sec^2(r)
  432. // = sgn_r * [ tan(B+x) + CORR ]
  433. // CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
  434. //
  435. // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
  436. //
  437. // tan( r + c ) =
  438. // / (1/[sin(B)*cos(B)]) * tan(x)
  439. // sgn_r * | tan(B) + -------------------------------- +
  440. // \ cot(B) - tan(x)
  441. // \
  442. // CORR |
  443. // /
  444. //
  445. // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
  446. // calculated beforehand and stored in a table. Specifically,
  447. // the table values are
  448. //
  449. // tan(B) as T_hi + T_lo;
  450. // cot(B) as C_hi + C_lo;
  451. // 1/[sin(B)*cos(B)] as SC_inv
  452. //
  453. // T_hi, C_hi are in double-precision memory format;
  454. // T_lo, C_lo are in single-precision memory format;
  455. // SC_inv is in extended-precision memory format.
  456. //
  457. // The value of tan(x) will be approximated by a short polynomial of
  458. // the form
  459. //
  460. // tan(x) as x + x * P, where
  461. // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
  462. //
  463. // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
  464. // to a relative accuracy better than 2^(-20). Thus, a good
  465. // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
  466. // division is:
  467. //
  468. // 1/(cot(B) - tan(x)) is approximately
  469. // 1/(cot(B) - x) is
  470. // tan(B)/(1 - x*tan(B)) is approximately
  471. // T_hi / ( 1 - T_hi * x ) is approximately
  472. //
  473. // T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
  474. //
  475. // The calculation of tan(r+c) therefore proceed as follows:
  476. //
  477. // Tx := T_hi * x
  478. // xsq := x * x
  479. //
  480. // V_hi := T_hi*(1 + Tx*(1 + Tx))
  481. // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
  482. // ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
  483. // ...good to about 20 bits of accuracy
  484. //
  485. // tanx := x + x*P
  486. // D := C_hi - tanx
  487. // ...D is a double precision denominator: cot(B) - tan(x)
  488. //
  489. // V_hi := V_hi + V_hi*(1 - V_hi*D)
  490. // ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
  491. //
  492. // V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
  493. // - V_hi*C_lo ) ...observe all order
  494. // ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
  495. // ...to extra accuracy
  496. //
  497. // ... SC_inv(B) * (x + x*P)
  498. // ... tan(B) + ------------------------- + CORR
  499. // ... cot(B) - (x + x*P)
  500. // ...
  501. // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
  502. // ...
  503. //
  504. // Sx := SC_inv * x
  505. // CORR := sgn_r * c * SC_inv * T_hi
  506. //
  507. // ...put the ingredients together to compute
  508. // ... SC_inv(B) * (x + x*P)
  509. // ... tan(B) + ------------------------- + CORR
  510. // ... cot(B) - (x + x*P)
  511. // ...
  512. // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
  513. // ...
  514. // ... = T_hi + T_lo + CORR +
  515. // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
  516. //
  517. // CORR := CORR + T_lo
  518. // tail := V_lo + P*(V_hi + V_lo)
  519. // tail := Sx * tail + CORR
  520. // tail := Sx * V_hi + tail
  521. // T_hi := sgn_r * T_hi
  522. //
  523. // ...T_hi + sgn_r*tail now approximate
  524. // ...sgn_r*(tan(B+x) + CORR) accurately
  525. //
  526. // Result := T_hi + sgn_r*tail ...in user-defined
  527. // ...rounding control
  528. // ...It is crucial that independent paths be fully
  529. // ...exploited for performance's sake.
  530. //
  531. //
  532. // Next, we consider the computation of -cot( r + c ). As
  533. // presented in the previous section,
  534. //
  535. // -cot( r + c ) = -cot(r) + c * csc^2(r)
  536. // = sgn_r * [ -cot(B+x) + CORR ]
  537. // CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
  538. //
  539. // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
  540. //
  541. // -cot( r + c ) =
  542. // / (1/[sin(B)*cos(B)]) * tan(x)
  543. // sgn_r * | -cot(B) + -------------------------------- +
  544. // \ tan(B) + tan(x)
  545. // \
  546. // CORR |
  547. // /
  548. //
  549. // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
  550. // calculated beforehand and stored in a table. Specifically,
  551. // the table values are
  552. //
  553. // tan(B) as T_hi + T_lo;
  554. // cot(B) as C_hi + C_lo;
  555. // 1/[sin(B)*cos(B)] as SC_inv
  556. //
  557. // T_hi, C_hi are in double-precision memory format;
  558. // T_lo, C_lo are in single-precision memory format;
  559. // SC_inv is in extended-precision memory format.
  560. //
  561. // The value of tan(x) will be approximated by a short polynomial of
  562. // the form
  563. //
  564. // tan(x) as x + x * P, where
  565. // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
  566. //
  567. // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
  568. // to a relative accuracy better than 2^(-18). Thus, a good
  569. // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
  570. // division is:
  571. //
  572. // 1/(tan(B) + tan(x)) is approximately
  573. // 1/(tan(B) + x) is
  574. // cot(B)/(1 + x*cot(B)) is approximately
  575. // C_hi / ( 1 + C_hi * x ) is approximately
  576. //
  577. // C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
  578. //
  579. // The calculation of -cot(r+c) therefore proceed as follows:
  580. //
  581. // Cx := C_hi * x
  582. // xsq := x * x
  583. //
  584. // V_hi := C_hi*(1 - Cx*(1 - Cx))
  585. // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
  586. // ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
  587. // ...good to about 18 bits of accuracy
  588. //
  589. // tanx := x + x*P
  590. // D := T_hi + tanx
  591. // ...D is a double precision denominator: tan(B) + tan(x)
  592. //
  593. // V_hi := V_hi + V_hi*(1 - V_hi*D)
  594. // ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
  595. //
  596. // V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
  597. // - V_hi*T_lo ) ...observe all order
  598. // ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
  599. // ...to extra accuracy
  600. //
  601. // ... SC_inv(B) * (x + x*P)
  602. // ... -cot(B) + ------------------------- + CORR
  603. // ... tan(B) + (x + x*P)
  604. // ...
  605. // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
  606. // ...
  607. //
  608. // Sx := SC_inv * x
  609. // CORR := sgn_r * c * SC_inv * C_hi
  610. //
  611. // ...put the ingredients together to compute
  612. // ... SC_inv(B) * (x + x*P)
  613. // ... -cot(B) + ------------------------- + CORR
  614. // ... tan(B) + (x + x*P)
  615. // ...
  616. // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
  617. // ...
  618. // ... =-C_hi - C_lo + CORR +
  619. // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
  620. //
  621. // CORR := CORR - C_lo
  622. // tail := V_lo + P*(V_hi + V_lo)
  623. // tail := Sx * tail + CORR
  624. // tail := Sx * V_hi + tail
  625. // C_hi := -sgn_r * C_hi
  626. //
  627. // ...C_hi + sgn_r*tail now approximates
  628. // ...sgn_r*(-cot(B+x) + CORR) accurately
  629. //
  630. // Result := C_hi + sgn_r*tail in user-defined rounding control
  631. // ...It is crucial that independent paths be fully
  632. // ...exploited for performance's sake.
  633. //
  634. // 3. Implementation Notes
  635. // =======================
  636. //
  637. // Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
  638. //
  639. // Recall that 2^(-2) <= |r| <= pi/4;
  640. //
  641. // r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
  642. //
  643. // and
  644. //
  645. // B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
  646. //
  647. // Thus, for k = -2, possible values of B are
  648. //
  649. // B = 2^(-2) * ( 1 + index/32 + 1/64 ),
  650. // index ranges from 0 to 31
  651. //
  652. // For k = -1, however, since |r| <= pi/4 = 0.78...
  653. // possible values of B are
  654. //
  655. // B = 2^(-1) * ( 1 + index/32 + 1/64 )
  656. // index ranges from 0 to 19.
  657. //
  658. //
  659. .data
  660. .align 128
  661. TAN_BASE_CONSTANTS:
  662. data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
  663. // two**-14, -two**-14
  664. data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
  665. data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
  666. data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
  667. data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
  668. data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
  669. data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
  670. data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
  671. data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
  672. data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
  673. data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
  674. data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
  675. data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
  676. data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
  677. data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
  678. data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
  679. data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
  680. data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
  681. data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
  682. data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
  683. data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
  684. data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
  685. data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
  686. data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
  687. data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
  688. data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
  689. data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
  690. data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
  691. data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
  692. data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
  693. data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
  694. data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
  695. data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
  696. //
  697. // Entries T_hi double-precision memory format
  698. // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
  699. // Entries T_lo single-precision memory format
  700. // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
  701. //
  702. data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
  703. data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
  704. data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
  705. data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
  706. data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
  707. data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
  708. data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
  709. data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
  710. data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
  711. data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
  712. data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
  713. data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
  714. data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
  715. data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
  716. data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
  717. data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
  718. data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
  719. data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
  720. data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
  721. data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
  722. data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
  723. data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
  724. data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
  725. data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
  726. data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
  727. data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
  728. data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
  729. data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
  730. data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
  731. data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
  732. data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
  733. data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
  734. //
  735. // Entries T_hi double-precision memory format
  736. // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
  737. // Entries T_lo single-precision memory format
  738. // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
  739. //
  740. data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
  741. data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
  742. data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
  743. data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
  744. data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
  745. data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
  746. data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
  747. data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
  748. data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
  749. data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
  750. data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
  751. data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
  752. data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
  753. data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
  754. data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
  755. data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
  756. data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
  757. data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
  758. data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
  759. data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
  760. //
  761. // Entries C_hi double-precision memory format
  762. // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
  763. // Entries C_lo single-precision memory format
  764. // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
  765. //
  766. data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
  767. data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
  768. data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
  769. data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
  770. data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
  771. data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
  772. data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
  773. data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
  774. data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
  775. data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
  776. data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
  777. data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
  778. data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
  779. data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
  780. data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
  781. data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
  782. data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
  783. data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
  784. data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
  785. data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
  786. data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
  787. data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
  788. data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
  789. data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
  790. data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
  791. data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
  792. data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
  793. data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
  794. data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
  795. data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
  796. data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
  797. data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
  798. //
  799. // Entries C_hi double-precision memory format
  800. // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
  801. // Entries C_lo single-precision memory format
  802. // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
  803. //
  804. data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
  805. data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
  806. data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
  807. data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
  808. data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
  809. data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
  810. data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
  811. data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
  812. data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
  813. data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
  814. data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
  815. data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
  816. data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
  817. data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
  818. data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
  819. data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
  820. data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
  821. data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
  822. data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
  823. data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
  824. //
  825. // Entries SC_inv in Swapped IEEE format (extended)
  826. // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
  827. //
  828. data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
  829. data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
  830. data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
  831. data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
  832. data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
  833. data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
  834. data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
  835. data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
  836. data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
  837. data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
  838. data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
  839. data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
  840. data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
  841. data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
  842. data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
  843. data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
  844. data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
  845. data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
  846. data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
  847. data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
  848. data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
  849. data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
  850. data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
  851. data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
  852. data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
  853. data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
  854. data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
  855. data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
  856. data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
  857. data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
  858. data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
  859. data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
  860. //
  861. // Entries SC_inv in Swapped IEEE format (extended)
  862. // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
  863. //
  864. data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
  865. data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
  866. data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
  867. data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
  868. data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
  869. data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
  870. data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
  871. data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
  872. data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
  873. data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
  874. data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
  875. data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
  876. data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
  877. data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
  878. data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
  879. data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
  880. data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
  881. data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
  882. data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
  883. data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000
  884. Arg = f8
  885. Result = f8
  886. U_2 = f10
  887. rsq = f11
  888. C_hi = f12
  889. C_lo = f13
  890. T_hi = f14
  891. T_lo = f15
  892. N_0 = f32
  893. d_1 = f33
  894. MPI_BY_4 = f34
  895. tail = f35
  896. tanx = f36
  897. Cx = f37
  898. Sx = f38
  899. sgn_r = f39
  900. CORR = f40
  901. P = f41
  902. D = f42
  903. ArgPrime = f43
  904. P_0 = f44
  905. P2_1 = f45
  906. P2_2 = f46
  907. P2_3 = f47
  908. P1_1 = f45
  909. P1_2 = f46
  910. P1_3 = f47
  911. P1_4 = f48
  912. P1_5 = f49
  913. P1_6 = f50
  914. P1_7 = f51
  915. P1_8 = f52
  916. P1_9 = f53
  917. TWO_TO_63 = f54
  918. NEGTWO_TO_63 = f55
  919. x = f56
  920. xsq = f57
  921. Tx = f58
  922. Tx1 = f59
  923. Set = f60
  924. poly1 = f61
  925. poly2 = f62
  926. Poly = f63
  927. Poly1 = f64
  928. Poly2 = f65
  929. r_to_the_8 = f66
  930. B = f67
  931. SC_inv = f68
  932. Pos_r = f69
  933. N_0_fix = f70
  934. PI_BY_4 = f71
  935. NEGTWO_TO_NEG2 = f72
  936. TWO_TO_24 = f73
  937. TWO_TO_NEG14 = f74
  938. TWO_TO_NEG33 = f75
  939. NEGTWO_TO_24 = f76
  940. NEGTWO_TO_NEG14 = f76
  941. NEGTWO_TO_NEG33 = f77
  942. two_by_PI = f78
  943. N = f79
  944. N_fix = f80
  945. P_1 = f81
  946. P_2 = f82
  947. P_3 = f83
  948. s_val = f84
  949. w = f85
  950. c = f86
  951. r = f87
  952. Z = f88
  953. A = f89
  954. a = f90
  955. t = f91
  956. U_1 = f92
  957. d_2 = f98
  958. TWO_TO_NEG2 = f94
  959. Q1_1 = f95
  960. Q1_2 = f96
  961. Q1_3 = f97
  962. Q1_4 = f98
  963. Q1_5 = f99
  964. Q1_6 = f100
  965. Q1_7 = f101
  966. Q1_8 = f102
  967. S_hi = f103
  968. S_lo = f104
  969. V_hi = f105
  970. V_lo = f106
  971. U_hi = f107
  972. U_lo = f108
  973. U_hiabs = f109
  974. V_hiabs = f110
  975. V = f111
  976. Inv_P_0 = f112
  977. GR_SAVE_B0 = r33
  978. GR_SAVE_GP = r34
  979. GR_SAVE_PFS = r35
  980. delta1 = r36
  981. table_ptr1 = r37
  982. table_ptr2 = r38
  983. i_0 = r39
  984. i_1 = r40
  985. N_fix_gr = r41
  986. N_inc = r42
  987. exp_Arg = r43
  988. exp_r = r44
  989. sig_r = r45
  990. lookup = r46
  991. table_offset = r47
  992. Create_B = r48
  993. GR_Parameter_X = r49
  994. GR_Parameter_r = r50
  995. .global __libm_tan
  996. .section .text
  997. .proc __libm_tan
  998. __libm_tan:
  999. { .mfi
  1000. alloc r32 = ar.pfs, 0,17,2,0
  1001. (p0) fclass.m.unc p6,p0 = Arg, 0x1E7
  1002. nop.i 999
  1003. }
  1004. ;;
  1005. { .mfi
  1006. nop.m 999
  1007. (p0) fclass.nm.unc p7,p0 = Arg, 0x1FF
  1008. nop.i 999
  1009. }
  1010. ;;
  1011. { .mfi
  1012. (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  1013. nop.f 999
  1014. nop.i 999
  1015. }
  1016. ;;
  1017. { .mmi
  1018. ld8 table_ptr1 = [table_ptr1]
  1019. nop.m 999
  1020. nop.i 999
  1021. }
  1022. ;;
  1023. //
  1024. // Check for NatVals, Infs , NaNs, and Zeros
  1025. // Check for everything - if false, then must be pseudo-zero
  1026. // or pseudo-nan.
  1027. // Local table pointer
  1028. //
  1029. { .mbb
  1030. (p0) add table_ptr2 = 96, table_ptr1
  1031. (p6) br.cond.spnt __libm_TAN_SPECIAL
  1032. (p7) br.cond.spnt __libm_TAN_SPECIAL ;;
  1033. }
  1034. //
  1035. // Point to Inv_P_0
  1036. // Branch out to deal with unsupporteds and special values.
  1037. //
  1038. { .mmf
  1039. (p0) ldfs TWO_TO_24 = [table_ptr1],4
  1040. (p0) ldfs TWO_TO_63 = [table_ptr2],4
  1041. //
  1042. // Load -2**24, load -2**63.
  1043. //
  1044. (p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;;
  1045. }
  1046. { .mfi
  1047. (p0) ldfs NEGTWO_TO_63 = [table_ptr2],12
  1048. (p0) fnorm.s1 Arg = Arg
  1049. nop.i 999
  1050. }
  1051. //
  1052. // Load 2**24, Load 2**63.
  1053. //
  1054. { .mmi
  1055. (p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
  1056. //
  1057. // Do fcmp to generate Denormal exception
  1058. // - can't do FNORM (will generate Underflow when U is unmasked!)
  1059. // Normalize input argument.
  1060. //
  1061. (p0) ldfe two_by_PI = [table_ptr1],16
  1062. nop.i 999
  1063. }
  1064. { .mmi
  1065. (p0) ldfe Inv_P_0 = [table_ptr2],16 ;;
  1066. (p0) ldfe d_1 = [table_ptr2],16
  1067. nop.i 999
  1068. }
  1069. //
  1070. // Decide about the paths to take:
  1071. // PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
  1072. // OTHERWISE - CASE 3 OR 4
  1073. // Load inverse of P_0 .
  1074. // Set PR_6 if Arg <= -2**63
  1075. // Are there any Infs, NaNs, or zeros?
  1076. //
  1077. { .mmi
  1078. (p0) ldfe P_0 = [table_ptr1],16 ;;
  1079. (p0) ldfe d_2 = [table_ptr2],16
  1080. nop.i 999
  1081. }
  1082. //
  1083. // Set PR_8 if Arg <= -2**24
  1084. // Set PR_6 if Arg >= 2**63
  1085. //
  1086. { .mmi
  1087. (p0) ldfe P_1 = [table_ptr1],16 ;;
  1088. (p0) ldfe PI_BY_4 = [table_ptr2],16
  1089. nop.i 999
  1090. }
  1091. //
  1092. // Set PR_8 if Arg >= 2**24
  1093. //
  1094. { .mmi
  1095. (p0) ldfe P_2 = [table_ptr1],16 ;;
  1096. (p0) ldfe MPI_BY_4 = [table_ptr2],16
  1097. nop.i 999
  1098. }
  1099. //
  1100. // Load P_2 and PI_BY_4
  1101. //
  1102. { .mfi
  1103. (p0) ldfe P_3 = [table_ptr1],16
  1104. nop.f 999
  1105. nop.i 999 ;;
  1106. }
  1107. { .mfi
  1108. nop.m 999
  1109. (p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
  1110. nop.i 999
  1111. }
  1112. { .mfi
  1113. nop.m 999
  1114. (p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
  1115. nop.i 999 ;;
  1116. }
  1117. { .mfi
  1118. nop.m 999
  1119. (p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
  1120. nop.i 999
  1121. }
  1122. { .mfi
  1123. nop.m 999
  1124. (p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
  1125. nop.i 999 ;;
  1126. }
  1127. { .mib
  1128. nop.m 999
  1129. nop.i 999
  1130. //
  1131. // Load P_3 and -PI_BY_4
  1132. //
  1133. (p6) br.cond.spnt TAN_ARG_TOO_LARGE ;;
  1134. }
  1135. { .mib
  1136. nop.m 999
  1137. nop.i 999
  1138. //
  1139. // Load 2**(-2).
  1140. // Load -2**(-2).
  1141. // Branch out if we have a special argument.
  1142. // Branch out if the magnitude of the input argument is too large
  1143. // - do this branch before the next.
  1144. //
  1145. (p8) br.cond.spnt TAN_LARGER_ARG ;;
  1146. }
  1147. //
  1148. // Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
  1149. //
  1150. { .mfi
  1151. (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
  1152. // ARGUMENT REDUCTION CODE - CASE 1 and 2
  1153. // Load 2**(-2).
  1154. // Load -2**(-2).
  1155. (p0) fmpy.s1 N = Arg,two_by_PI
  1156. nop.i 999 ;;
  1157. }
  1158. { .mfi
  1159. (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
  1160. //
  1161. // N = Arg * 2/pi
  1162. //
  1163. (p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
  1164. nop.i 999 ;;
  1165. }
  1166. { .mfi
  1167. nop.m 999
  1168. //
  1169. // if Arg < pi/4, set PR_8.
  1170. //
  1171. (p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
  1172. nop.i 999 ;;
  1173. }
  1174. //
  1175. // Case 1: Is |r| < 2**(-2).
  1176. // Arg is the same as r in this case.
  1177. // r = Arg
  1178. // c = 0
  1179. //
  1180. { .mfi
  1181. (p8) mov N_fix_gr = r0
  1182. //
  1183. // if Arg > -pi/4, reset PR_8.
  1184. // Select the case when |Arg| < pi/4 - set PR[8] = true.
  1185. // Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
  1186. //
  1187. (p0) fcvt.fx.s1 N_fix = N
  1188. nop.i 999 ;;
  1189. }
  1190. { .mfi
  1191. nop.m 999
  1192. //
  1193. // Grab the integer part of N .
  1194. //
  1195. (p8) mov r = Arg
  1196. nop.i 999
  1197. }
  1198. { .mfi
  1199. nop.m 999
  1200. (p8) mov c = f0
  1201. nop.i 999 ;;
  1202. }
  1203. { .mfi
  1204. nop.m 999
  1205. (p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
  1206. nop.i 999 ;;
  1207. }
  1208. { .mfi
  1209. nop.m 999
  1210. (p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
  1211. nop.i 999 ;;
  1212. }
  1213. { .mfi
  1214. nop.m 999
  1215. //
  1216. // Case 2: Place integer part of N in GP register.
  1217. //
  1218. (p9) fcvt.xf N = N_fix
  1219. nop.i 999 ;;
  1220. }
  1221. { .mib
  1222. (p9) getf.sig N_fix_gr = N_fix
  1223. nop.i 999
  1224. //
  1225. // Case 2: Convert integer N_fix back to normalized floating-point value.
  1226. //
  1227. (p10) br.cond.spnt TAN_SMALL_R ;;
  1228. }
  1229. { .mib
  1230. nop.m 999
  1231. nop.i 999
  1232. (p8) br.cond.sptk TAN_NORMAL_R ;;
  1233. }
  1234. //
  1235. // Case 1: PR_3 is only affected when PR_1 is set.
  1236. //
  1237. { .mmi
  1238. (p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
  1239. //
  1240. // Case 2: Load 2**(-33).
  1241. //
  1242. (p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
  1243. nop.i 999 ;;
  1244. }
  1245. { .mfi
  1246. nop.m 999
  1247. //
  1248. // Case 2: Load -2**(-33).
  1249. //
  1250. (p9) fnma.s1 s_val = N, P_1, Arg
  1251. nop.i 999
  1252. }
  1253. { .mfi
  1254. nop.m 999
  1255. (p9) fmpy.s1 w = N, P_2
  1256. nop.i 999 ;;
  1257. }
  1258. { .mfi
  1259. nop.m 999
  1260. //
  1261. // Case 2: w = N * P_2
  1262. // Case 2: s_val = -N * P_1 + Arg
  1263. //
  1264. (p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
  1265. nop.i 999 ;;
  1266. }
  1267. { .mfi
  1268. nop.m 999
  1269. //
  1270. // Decide between case_1 and case_2 reduce:
  1271. //
  1272. (p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
  1273. nop.i 999 ;;
  1274. }
  1275. { .mfi
  1276. nop.m 999
  1277. //
  1278. // Case 1_reduce: s <= -2**(-33) or s >= 2**(-33)
  1279. // Case 2_reduce: -2**(-33) < s < 2**(-33)
  1280. //
  1281. (p8) fsub.s1 r = s_val, w
  1282. nop.i 999
  1283. }
  1284. { .mfi
  1285. nop.m 999
  1286. (p9) fmpy.s1 w = N, P_3
  1287. nop.i 999 ;;
  1288. }
  1289. { .mfi
  1290. nop.m 999
  1291. (p9) fma.s1 U_1 = N, P_2, w
  1292. nop.i 999
  1293. }
  1294. { .mfi
  1295. nop.m 999
  1296. //
  1297. // Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
  1298. // else set PR_11.
  1299. //
  1300. (p8) fsub.s1 c = s_val, r
  1301. nop.i 999 ;;
  1302. }
  1303. { .mfi
  1304. nop.m 999
  1305. //
  1306. // Case 1_reduce: r = s + w (change sign)
  1307. // Case 2_reduce: w = N * P_3 (change sign)
  1308. //
  1309. (p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
  1310. nop.i 999 ;;
  1311. }
  1312. { .mfi
  1313. nop.m 999
  1314. (p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
  1315. nop.i 999 ;;
  1316. }
  1317. { .mfi
  1318. nop.m 999
  1319. (p9) fsub.s1 r = s_val, U_1
  1320. nop.i 999
  1321. }
  1322. { .mfi
  1323. nop.m 999
  1324. //
  1325. // Case 1_reduce: c is complete here.
  1326. // c = c + w (w has not been negated.)
  1327. // Case 2_reduce: r is complete here - continue to calculate c .
  1328. // r = s - U_1
  1329. //
  1330. (p9) fms.s1 U_2 = N, P_2, U_1
  1331. nop.i 999 ;;
  1332. }
  1333. { .mfi
  1334. nop.m 999
  1335. //
  1336. // Case 1_reduce: c = s - r
  1337. // Case 2_reduce: U_1 = N * P_2 + w
  1338. //
  1339. (p8) fsub.s1 c = c, w
  1340. nop.i 999 ;;
  1341. }
  1342. { .mfi
  1343. nop.m 999
  1344. (p9) fsub.s1 s_val = s_val, r
  1345. nop.i 999
  1346. }
  1347. { .mfb
  1348. nop.m 999
  1349. //
  1350. // Case 2_reduce:
  1351. // U_2 = N * P_2 - U_1
  1352. // Not needed until later.
  1353. //
  1354. (p9) fadd.s1 U_2 = U_2, w
  1355. //
  1356. // Case 2_reduce:
  1357. // s = s - r
  1358. // U_2 = U_2 + w
  1359. //
  1360. (p10) br.cond.spnt TAN_SMALL_R ;;
  1361. }
  1362. { .mib
  1363. nop.m 999
  1364. nop.i 999
  1365. (p11) br.cond.sptk TAN_NORMAL_R ;;
  1366. }
  1367. { .mii
  1368. nop.m 999
  1369. //
  1370. // Case 2_reduce:
  1371. // c = c - U_2
  1372. // c is complete here
  1373. // Argument reduction ends here.
  1374. //
  1375. (p9) extr.u i_1 = N_fix_gr, 0, 1 ;;
  1376. (p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
  1377. }
  1378. { .mfi
  1379. nop.m 999
  1380. //
  1381. // Is i_1 even or odd?
  1382. // if i_1 == 0, set p11, else set p12.
  1383. //
  1384. (p11) fmpy.s1 rsq = r, Z
  1385. nop.i 999 ;;
  1386. }
  1387. { .mfi
  1388. nop.m 999
  1389. (p12) frcpa.s1 S_hi,p0 = f1, r
  1390. nop.i 999
  1391. }
  1392. //
  1393. // Case 1: Branch to SMALL_R or NORMAL_R.
  1394. // Case 1 is done now.
  1395. //
  1396. { .mfi
  1397. (p9) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  1398. (p9) fsub.s1 c = s_val, U_1
  1399. nop.i 999 ;;
  1400. }
  1401. ;;
  1402. { .mmi
  1403. (p9) ld8 table_ptr1 = [table_ptr1]
  1404. nop.m 999
  1405. nop.i 999
  1406. }
  1407. ;;
  1408. { .mmi
  1409. (p9) add table_ptr1 = 224, table_ptr1 ;;
  1410. (p9) ldfe P1_1 = [table_ptr1],144
  1411. nop.i 999 ;;
  1412. }
  1413. //
  1414. // Get [i_1] - lsb of N_fix_gr .
  1415. // Load P1_1 and point to Q1_1 .
  1416. //
  1417. { .mfi
  1418. (p9) ldfe Q1_1 = [table_ptr1] , 0
  1419. //
  1420. // N even: rsq = r * Z
  1421. // N odd: S_hi = frcpa(r)
  1422. //
  1423. (p12) fmerge.ns S_hi = S_hi, S_hi
  1424. nop.i 999
  1425. }
  1426. { .mfi
  1427. nop.m 999
  1428. //
  1429. // Case 2_reduce:
  1430. // c = s - U_1
  1431. //
  1432. (p9) fsub.s1 c = c, U_2
  1433. nop.i 999 ;;
  1434. }
  1435. { .mfi
  1436. nop.m 999
  1437. (p12) fma.s1 poly1 = S_hi, r, f1
  1438. nop.i 999 ;;
  1439. }
  1440. { .mfi
  1441. nop.m 999
  1442. //
  1443. // N odd: Change sign of S_hi
  1444. //
  1445. (p11) fmpy.s1 rsq = rsq, P1_1
  1446. nop.i 999 ;;
  1447. }
  1448. { .mfi
  1449. nop.m 999
  1450. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  1451. nop.i 999 ;;
  1452. }
  1453. { .mfi
  1454. nop.m 999
  1455. //
  1456. // N even: rsq = rsq * P1_1
  1457. // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
  1458. //
  1459. (p11) fma.s1 Result = r, rsq, c
  1460. nop.i 999 ;;
  1461. }
  1462. { .mfi
  1463. nop.m 999
  1464. //
  1465. // N even: Result = c + r * rsq
  1466. // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
  1467. //
  1468. (p12) fma.s1 poly1 = S_hi, r, f1
  1469. nop.i 999 ;;
  1470. }
  1471. { .mfi
  1472. nop.m 999
  1473. //
  1474. // N even: Result = Result + r
  1475. // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
  1476. //
  1477. (p11) fadd.s0 Result = r, Result
  1478. nop.i 999 ;;
  1479. }
  1480. { .mfi
  1481. nop.m 999
  1482. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  1483. nop.i 999 ;;
  1484. }
  1485. { .mfi
  1486. nop.m 999
  1487. //
  1488. // N even: Result1 = Result + r
  1489. // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
  1490. //
  1491. (p12) fma.s1 poly1 = S_hi, r, f1
  1492. nop.i 999 ;;
  1493. }
  1494. { .mfi
  1495. nop.m 999
  1496. //
  1497. // N odd: poly1 = S_hi * r + 1.0 64 bits partial
  1498. //
  1499. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  1500. nop.i 999 ;;
  1501. }
  1502. { .mfi
  1503. nop.m 999
  1504. //
  1505. // N odd: poly1 = S_hi * poly + 1.0 64 bits
  1506. //
  1507. (p12) fma.s1 poly1 = S_hi, r, f1
  1508. nop.i 999 ;;
  1509. }
  1510. { .mfi
  1511. nop.m 999
  1512. //
  1513. // N odd: poly1 = S_hi * r + 1.0
  1514. //
  1515. (p12) fma.s1 poly1 = S_hi, c, poly1
  1516. nop.i 999 ;;
  1517. }
  1518. { .mfi
  1519. nop.m 999
  1520. //
  1521. // N odd: poly1 = S_hi * c + poly1
  1522. //
  1523. (p12) fmpy.s1 S_lo = S_hi, poly1
  1524. nop.i 999 ;;
  1525. }
  1526. { .mfi
  1527. nop.m 999
  1528. //
  1529. // N odd: S_lo = S_hi * poly1
  1530. //
  1531. (p12) fma.s1 S_lo = Q1_1, r, S_lo
  1532. nop.i 999
  1533. }
  1534. { .mfi
  1535. nop.m 999
  1536. //
  1537. // N odd: Result = S_hi + S_lo
  1538. //
  1539. (p0) fmpy.s0 Q1_1 = Q1_1, Q1_1
  1540. nop.i 999 ;;
  1541. }
  1542. { .mfb
  1543. nop.m 999
  1544. //
  1545. // N odd: S_lo = S_lo + Q1_1 * r
  1546. //
  1547. (p12) fadd.s0 Result = S_hi, S_lo
  1548. //
  1549. // Do a dummy multiply to raise inexact.
  1550. //
  1551. (p0) br.ret.sptk b0 ;;
  1552. }
  1553. TAN_LARGER_ARG:
  1554. { .mmf
  1555. (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  1556. nop.m 999
  1557. (p0) fmpy.s1 N_0 = Arg, Inv_P_0
  1558. }
  1559. ;;
  1560. //
  1561. // ARGUMENT REDUCTION CODE - CASE 3 and 4
  1562. //
  1563. //
  1564. // Adjust table_ptr1 to beginning of table.
  1565. // N_0 = Arg * Inv_P_0
  1566. //
  1567. { .mmi
  1568. (p0) ld8 table_ptr1 = [table_ptr1]
  1569. nop.m 999
  1570. nop.i 999
  1571. }
  1572. ;;
  1573. { .mmi
  1574. (p0) add table_ptr1 = 8, table_ptr1 ;;
  1575. //
  1576. // Point to 2*-14
  1577. //
  1578. (p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4
  1579. nop.i 999 ;;
  1580. }
  1581. //
  1582. // Load 2**(-14).
  1583. //
  1584. { .mmi
  1585. (p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
  1586. //
  1587. // N_0_fix = integer part of N_0 .
  1588. // Adjust table_ptr1 to beginning of table.
  1589. //
  1590. (p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4
  1591. nop.i 999 ;;
  1592. }
  1593. //
  1594. // Make N_0 the integer part.
  1595. //
  1596. { .mfi
  1597. (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1]
  1598. //
  1599. // Load -2**(-14).
  1600. //
  1601. (p0) fcvt.fx.s1 N_0_fix = N_0
  1602. nop.i 999 ;;
  1603. }
  1604. { .mfi
  1605. nop.m 999
  1606. (p0) fcvt.xf N_0 = N_0_fix
  1607. nop.i 999 ;;
  1608. }
  1609. { .mfi
  1610. nop.m 999
  1611. (p0) fnma.s1 ArgPrime = N_0, P_0, Arg
  1612. nop.i 999
  1613. }
  1614. { .mfi
  1615. nop.m 999
  1616. (p0) fmpy.s1 w = N_0, d_1
  1617. nop.i 999 ;;
  1618. }
  1619. { .mfi
  1620. nop.m 999
  1621. //
  1622. // ArgPrime = -N_0 * P_0 + Arg
  1623. // w = N_0 * d_1
  1624. //
  1625. (p0) fmpy.s1 N = ArgPrime, two_by_PI
  1626. nop.i 999 ;;
  1627. }
  1628. { .mfi
  1629. nop.m 999
  1630. //
  1631. // N = ArgPrime * 2/pi
  1632. //
  1633. (p0) fcvt.fx.s1 N_fix = N
  1634. nop.i 999 ;;
  1635. }
  1636. { .mfi
  1637. nop.m 999
  1638. //
  1639. // N_fix is the integer part.
  1640. //
  1641. (p0) fcvt.xf N = N_fix
  1642. nop.i 999 ;;
  1643. }
  1644. { .mfi
  1645. (p0) getf.sig N_fix_gr = N_fix
  1646. nop.f 999
  1647. nop.i 999 ;;
  1648. }
  1649. { .mfi
  1650. nop.m 999
  1651. //
  1652. // N is the integer part of the reduced-reduced argument.
  1653. // Put the integer in a GP register.
  1654. //
  1655. (p0) fnma.s1 s_val = N, P_1, ArgPrime
  1656. nop.i 999
  1657. }
  1658. { .mfi
  1659. nop.m 999
  1660. (p0) fnma.s1 w = N, P_2, w
  1661. nop.i 999 ;;
  1662. }
  1663. { .mfi
  1664. nop.m 999
  1665. //
  1666. // s_val = -N*P_1 + ArgPrime
  1667. // w = -N*P_2 + w
  1668. //
  1669. (p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
  1670. nop.i 999 ;;
  1671. }
  1672. { .mfi
  1673. nop.m 999
  1674. (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
  1675. nop.i 999 ;;
  1676. }
  1677. { .mfi
  1678. nop.m 999
  1679. //
  1680. // Case 3: r = s_val + w (Z complete)
  1681. // Case 4: U_hi = N_0 * d_1
  1682. //
  1683. (p10) fmpy.s1 V_hi = N, P_2
  1684. nop.i 999
  1685. }
  1686. { .mfi
  1687. nop.m 999
  1688. (p11) fmpy.s1 U_hi = N_0, d_1
  1689. nop.i 999 ;;
  1690. }
  1691. { .mfi
  1692. nop.m 999
  1693. //
  1694. // Case 3: r = s_val + w (Z complete)
  1695. // Case 4: U_hi = N_0 * d_1
  1696. //
  1697. (p11) fmpy.s1 V_hi = N, P_2
  1698. nop.i 999
  1699. }
  1700. { .mfi
  1701. nop.m 999
  1702. (p11) fmpy.s1 U_hi = N_0, d_1
  1703. nop.i 999 ;;
  1704. }
  1705. { .mfi
  1706. nop.m 999
  1707. //
  1708. // Decide between case 3 and 4:
  1709. // Case 3: s <= -2**(-14) or s >= 2**(-14)
  1710. // Case 4: -2**(-14) < s < 2**(-14)
  1711. //
  1712. (p10) fadd.s1 r = s_val, w
  1713. nop.i 999
  1714. }
  1715. { .mfi
  1716. nop.m 999
  1717. (p11) fmpy.s1 w = N, P_3
  1718. nop.i 999 ;;
  1719. }
  1720. { .mfi
  1721. nop.m 999
  1722. //
  1723. // Case 4: We need abs of both U_hi and V_hi - dont
  1724. // worry about switched sign of V_hi .
  1725. //
  1726. (p11) fsub.s1 A = U_hi, V_hi
  1727. nop.i 999
  1728. }
  1729. { .mfi
  1730. nop.m 999
  1731. //
  1732. // Case 4: A = U_hi + V_hi
  1733. // Note: Worry about switched sign of V_hi, so subtract instead of add.
  1734. //
  1735. (p11) fnma.s1 V_lo = N, P_2, V_hi
  1736. nop.i 999 ;;
  1737. }
  1738. { .mfi
  1739. nop.m 999
  1740. (p11) fms.s1 U_lo = N_0, d_1, U_hi
  1741. nop.i 999 ;;
  1742. }
  1743. { .mfi
  1744. nop.m 999
  1745. (p11) fabs V_hiabs = V_hi
  1746. nop.i 999
  1747. }
  1748. { .mfi
  1749. nop.m 999
  1750. //
  1751. // Case 4: V_hi = N * P_2
  1752. // w = N * P_3
  1753. // Note the product does not include the (-) as in the writeup
  1754. // so (-) missing for V_hi and w .
  1755. (p10) fadd.s1 r = s_val, w
  1756. nop.i 999 ;;
  1757. }
  1758. { .mfi
  1759. nop.m 999
  1760. //
  1761. // Case 3: c = s_val - r
  1762. // Case 4: U_lo = N_0 * d_1 - U_hi
  1763. //
  1764. (p11) fabs U_hiabs = U_hi
  1765. nop.i 999
  1766. }
  1767. { .mfi
  1768. nop.m 999
  1769. (p11) fmpy.s1 w = N, P_3
  1770. nop.i 999 ;;
  1771. }
  1772. { .mfi
  1773. nop.m 999
  1774. //
  1775. // Case 4: Set P_12 if U_hiabs >= V_hiabs
  1776. //
  1777. (p11) fadd.s1 C_hi = s_val, A
  1778. nop.i 999 ;;
  1779. }
  1780. { .mfi
  1781. nop.m 999
  1782. //
  1783. // Case 4: C_hi = s_val + A
  1784. //
  1785. (p11) fadd.s1 t = U_lo, V_lo
  1786. nop.i 999 ;;
  1787. }
  1788. { .mfi
  1789. nop.m 999
  1790. //
  1791. // Case 3: Is |r| < 2**(-2), if so set PR_7
  1792. // else set PR_8.
  1793. // Case 3: If PR_7 is set, prepare to branch to Small_R.
  1794. // Case 3: If PR_8 is set, prepare to branch to Normal_R.
  1795. //
  1796. (p10) fsub.s1 c = s_val, r
  1797. nop.i 999 ;;
  1798. }
  1799. { .mfi
  1800. nop.m 999
  1801. //
  1802. // Case 3: c = (s - r) + w (c complete)
  1803. //
  1804. (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
  1805. nop.i 999
  1806. }
  1807. { .mfi
  1808. nop.m 999
  1809. (p11) fms.s1 w = N_0, d_2, w
  1810. nop.i 999 ;;
  1811. }
  1812. { .mfi
  1813. nop.m 999
  1814. //
  1815. // Case 4: V_hi = N * P_2
  1816. // w = N * P_3
  1817. // Note the product does not include the (-) as in the writeup
  1818. // so (-) missing for V_hi and w .
  1819. //
  1820. (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
  1821. nop.i 999 ;;
  1822. }
  1823. { .mfi
  1824. nop.m 999
  1825. (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
  1826. nop.i 999 ;;
  1827. }
  1828. { .mfb
  1829. nop.m 999
  1830. //
  1831. // Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
  1832. // Note: the (-) is still missing for V_hi .
  1833. // Case 4: w = w + N_0 * d_2
  1834. // Note: the (-) is now incorporated in w .
  1835. //
  1836. (p10) fadd.s1 c = c, w
  1837. //
  1838. // Case 4: t = U_lo + V_lo
  1839. // Note: remember V_lo should be (-), subtract instead of add. NO
  1840. //
  1841. (p14) br.cond.spnt TAN_SMALL_R ;;
  1842. }
  1843. { .mib
  1844. nop.m 999
  1845. nop.i 999
  1846. (p15) br.cond.spnt TAN_NORMAL_R ;;
  1847. }
  1848. { .mfi
  1849. nop.m 999
  1850. //
  1851. // Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true.
  1852. // The remaining stuff is for Case 4.
  1853. //
  1854. (p12) fsub.s1 a = U_hi, A
  1855. (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
  1856. }
  1857. { .mfi
  1858. nop.m 999
  1859. //
  1860. // Case 4: C_lo = s_val - C_hi
  1861. //
  1862. (p11) fadd.s1 t = t, w
  1863. nop.i 999
  1864. }
  1865. { .mfi
  1866. nop.m 999
  1867. (p13) fadd.s1 a = V_hi, A
  1868. nop.i 999 ;;
  1869. }
  1870. //
  1871. // Case 4: a = U_hi - A
  1872. // a = V_hi - A (do an add to account for missing (-) on V_hi
  1873. //
  1874. { .mfi
  1875. (p11) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  1876. (p11) fsub.s1 C_lo = s_val, C_hi
  1877. nop.i 999
  1878. }
  1879. ;;
  1880. { .mmi
  1881. (p11) ld8 table_ptr1 = [table_ptr1]
  1882. nop.m 999
  1883. nop.i 999
  1884. }
  1885. ;;
  1886. //
  1887. // Case 4: a = (U_hi - A) + V_hi
  1888. // a = (V_hi - A) + U_hi
  1889. // In each case account for negative missing form V_hi .
  1890. //
  1891. //
  1892. // Case 4: C_lo = (s_val - C_hi) + A
  1893. //
  1894. { .mmi
  1895. (p11) add table_ptr1 = 224, table_ptr1 ;;
  1896. (p11) ldfe P1_1 = [table_ptr1], 16
  1897. nop.i 999 ;;
  1898. }
  1899. { .mfi
  1900. (p11) ldfe P1_2 = [table_ptr1], 128
  1901. //
  1902. // Case 4: w = U_lo + V_lo + w
  1903. //
  1904. (p12) fsub.s1 a = a, V_hi
  1905. nop.i 999 ;;
  1906. }
  1907. //
  1908. // Case 4: r = C_hi + C_lo
  1909. //
  1910. { .mfi
  1911. (p11) ldfe Q1_1 = [table_ptr1], 16
  1912. (p11) fadd.s1 C_lo = C_lo, A
  1913. nop.i 999 ;;
  1914. }
  1915. //
  1916. // Case 4: c = C_hi - r
  1917. // Get [i_1] - lsb of N_fix_gr.
  1918. //
  1919. { .mfi
  1920. (p11) ldfe Q1_2 = [table_ptr1], 16
  1921. nop.f 999
  1922. nop.i 999 ;;
  1923. }
  1924. { .mfi
  1925. nop.m 999
  1926. (p13) fsub.s1 a = U_hi, a
  1927. nop.i 999 ;;
  1928. }
  1929. { .mfi
  1930. nop.m 999
  1931. (p11) fadd.s1 t = t, a
  1932. nop.i 999 ;;
  1933. }
  1934. { .mfi
  1935. nop.m 999
  1936. //
  1937. // Case 4: t = t + a
  1938. //
  1939. (p11) fadd.s1 C_lo = C_lo, t
  1940. nop.i 999 ;;
  1941. }
  1942. { .mfi
  1943. nop.m 999
  1944. //
  1945. // Case 4: C_lo = C_lo + t
  1946. //
  1947. (p11) fadd.s1 r = C_hi, C_lo
  1948. nop.i 999 ;;
  1949. }
  1950. { .mfi
  1951. nop.m 999
  1952. (p11) fsub.s1 c = C_hi, r
  1953. nop.i 999
  1954. }
  1955. { .mfi
  1956. nop.m 999
  1957. //
  1958. // Case 4: c = c + C_lo finished.
  1959. // Is i_1 even or odd?
  1960. // if i_1 == 0, set PR_4, else set PR_5.
  1961. //
  1962. // r and c have been computed.
  1963. // We known whether this is the sine or cosine routine.
  1964. // Make sure ftz mode is set - should be automatic when using wre
  1965. (p0) fmpy.s1 rsq = r, r
  1966. nop.i 999 ;;
  1967. }
  1968. { .mfi
  1969. nop.m 999
  1970. (p11) fadd.s1 c = c , C_lo
  1971. (p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
  1972. }
  1973. { .mfi
  1974. nop.m 999
  1975. (p12) frcpa.s1 S_hi, p0 = f1, r
  1976. nop.i 999
  1977. }
  1978. { .mfi
  1979. nop.m 999
  1980. //
  1981. // N odd: Change sign of S_hi
  1982. //
  1983. (p11) fma.s1 Result = rsq, P1_2, P1_1
  1984. nop.i 999 ;;
  1985. }
  1986. { .mfi
  1987. nop.m 999
  1988. (p12) fma.s1 P = rsq, Q1_2, Q1_1
  1989. nop.i 999
  1990. }
  1991. { .mfi
  1992. nop.m 999
  1993. //
  1994. // N odd: Result = S_hi + S_lo (User supplied rounding mode for C1)
  1995. //
  1996. (p0) fmpy.s0 Q1_1 = Q1_1, Q1_1
  1997. nop.i 999 ;;
  1998. }
  1999. { .mfi
  2000. nop.m 999
  2001. //
  2002. // N even: rsq = r * r
  2003. // N odd: S_hi = frcpa(r)
  2004. //
  2005. (p12) fmerge.ns S_hi = S_hi, S_hi
  2006. nop.i 999
  2007. }
  2008. { .mfi
  2009. nop.m 999
  2010. //
  2011. // N even: rsq = rsq * P1_2 + P1_1
  2012. // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
  2013. //
  2014. (p11) fmpy.s1 Result = rsq, Result
  2015. nop.i 999 ;;
  2016. }
  2017. { .mfi
  2018. nop.m 999
  2019. (p12) fma.s1 poly1 = S_hi, r,f1
  2020. nop.i 999
  2021. }
  2022. { .mfi
  2023. nop.m 999
  2024. //
  2025. // N even: Result = Result * rsq
  2026. // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
  2027. //
  2028. (p11) fma.s1 Result = r, Result, c
  2029. nop.i 999 ;;
  2030. }
  2031. { .mfi
  2032. nop.m 999
  2033. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2034. nop.i 999
  2035. }
  2036. { .mfi
  2037. nop.m 999
  2038. //
  2039. // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
  2040. //
  2041. (p11) fadd.s0 Result= r, Result
  2042. nop.i 999 ;;
  2043. }
  2044. { .mfi
  2045. nop.m 999
  2046. (p12) fma.s1 poly1 = S_hi, r, f1
  2047. nop.i 999 ;;
  2048. }
  2049. { .mfi
  2050. nop.m 999
  2051. //
  2052. // N even: Result = Result * r + c
  2053. // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
  2054. //
  2055. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2056. nop.i 999 ;;
  2057. }
  2058. { .mfi
  2059. nop.m 999
  2060. (p12) fma.s1 poly1 = S_hi, r, f1
  2061. nop.i 999 ;;
  2062. }
  2063. { .mfi
  2064. nop.m 999
  2065. //
  2066. // N even: Result1 = Result + r (Rounding mode S0)
  2067. // N odd: poly1 = S_hi * r + 1.0 64 bits partial
  2068. //
  2069. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2070. nop.i 999 ;;
  2071. }
  2072. { .mfi
  2073. nop.m 999
  2074. //
  2075. // N odd: poly1 = S_hi * poly + S_hi 64 bits
  2076. //
  2077. (p12) fma.s1 poly1 = S_hi, r, f1
  2078. nop.i 999 ;;
  2079. }
  2080. { .mfi
  2081. nop.m 999
  2082. //
  2083. // N odd: poly1 = S_hi * r + 1.0
  2084. //
  2085. (p12) fma.s1 poly1 = S_hi, c, poly1
  2086. nop.i 999 ;;
  2087. }
  2088. { .mfi
  2089. nop.m 999
  2090. //
  2091. // N odd: poly1 = S_hi * c + poly1
  2092. //
  2093. (p12) fmpy.s1 S_lo = S_hi, poly1
  2094. nop.i 999 ;;
  2095. }
  2096. { .mfi
  2097. nop.m 999
  2098. //
  2099. // N odd: S_lo = S_hi * poly1
  2100. //
  2101. (p12) fma.s1 S_lo = P, r, S_lo
  2102. nop.i 999 ;;
  2103. }
  2104. { .mfb
  2105. nop.m 999
  2106. //
  2107. // N odd: S_lo = S_lo + r * P
  2108. //
  2109. (p12) fadd.s0 Result = S_hi, S_lo
  2110. //
  2111. // Do dummy multiply to raise inexact.
  2112. //
  2113. (p0) br.ret.sptk b0 ;;
  2114. }
  2115. TAN_SMALL_R:
  2116. { .mii
  2117. nop.m 999
  2118. (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
  2119. (p0) cmp.eq.unc p11, p12 = 0x0000, i_1
  2120. }
  2121. { .mfi
  2122. nop.m 999
  2123. (p0) fmpy.s1 rsq = r, r
  2124. nop.i 999 ;;
  2125. }
  2126. { .mfi
  2127. nop.m 999
  2128. (p12) frcpa.s1 S_hi, p0 = f1, r
  2129. nop.i 999
  2130. }
  2131. { .mfi
  2132. (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  2133. nop.f 999
  2134. nop.i 999
  2135. }
  2136. ;;
  2137. { .mmi
  2138. (p0) ld8 table_ptr1 = [table_ptr1]
  2139. nop.m 999
  2140. nop.i 999
  2141. }
  2142. ;;
  2143. // *****************************************************************
  2144. // *****************************************************************
  2145. // *****************************************************************
  2146. { .mmi
  2147. (p0) add table_ptr1 = 224, table_ptr1 ;;
  2148. (p0) ldfe P1_1 = [table_ptr1], 16
  2149. nop.i 999 ;;
  2150. }
  2151. // r and c have been computed.
  2152. // We known whether this is the sine or cosine routine.
  2153. // Make sure ftz mode is set - should be automatic when using wre
  2154. // |r| < 2**(-2)
  2155. { .mfi
  2156. (p0) ldfe P1_2 = [table_ptr1], 16
  2157. (p11) fmpy.s1 r_to_the_8 = rsq, rsq
  2158. nop.i 999 ;;
  2159. }
  2160. //
  2161. // Set table_ptr1 to beginning of constant table.
  2162. // Get [i_1] - lsb of N_fix_gr.
  2163. //
  2164. { .mfi
  2165. (p0) ldfe P1_3 = [table_ptr1], 96
  2166. //
  2167. // N even: rsq = r * r
  2168. // N odd: S_hi = frcpa(r)
  2169. //
  2170. (p12) fmerge.ns S_hi = S_hi, S_hi
  2171. nop.i 999 ;;
  2172. }
  2173. //
  2174. // Is i_1 even or odd?
  2175. // if i_1 == 0, set PR_11.
  2176. // if i_1 != 0, set PR_12.
  2177. //
  2178. { .mfi
  2179. (p11) ldfe P1_9 = [table_ptr1], -16
  2180. //
  2181. // N even: Poly2 = P1_7 + Poly2 * rsq
  2182. // N odd: poly2 = Q1_5 + poly2 * rsq
  2183. //
  2184. (p11) fadd.s1 CORR = rsq, f1
  2185. nop.i 999 ;;
  2186. }
  2187. { .mmi
  2188. (p11) ldfe P1_8 = [table_ptr1], -16 ;;
  2189. //
  2190. // N even: Poly1 = P1_2 + P1_3 * rsq
  2191. // N odd: poly1 = 1.0 + S_hi * r
  2192. // 16 bits partial account for necessary (-1)
  2193. //
  2194. (p11) ldfe P1_7 = [table_ptr1], -16
  2195. nop.i 999 ;;
  2196. }
  2197. //
  2198. // N even: Poly1 = P1_1 + Poly1 * rsq
  2199. // N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary
  2200. //
  2201. { .mfi
  2202. (p11) ldfe P1_6 = [table_ptr1], -16
  2203. //
  2204. // N even: Poly2 = P1_5 + Poly2 * rsq
  2205. // N odd: poly2 = Q1_3 + poly2 * rsq
  2206. //
  2207. (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
  2208. nop.i 999 ;;
  2209. }
  2210. //
  2211. // N even: Poly1 = Poly1 * rsq
  2212. // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
  2213. //
  2214. { .mfi
  2215. (p11) ldfe P1_5 = [table_ptr1], -16
  2216. (p12) fma.s1 poly1 = S_hi, r, f1
  2217. nop.i 999 ;;
  2218. }
  2219. //
  2220. // N even: CORR = CORR * c
  2221. // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
  2222. //
  2223. //
  2224. // N even: Poly2 = P1_6 + Poly2 * rsq
  2225. // N odd: poly2 = Q1_4 + poly2 * rsq
  2226. //
  2227. { .mmf
  2228. (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
  2229. (p11) ldfe P1_4 = [table_ptr1], -16
  2230. (p11) fmpy.s1 CORR = CORR, c
  2231. }
  2232. ;;
  2233. { .mmi
  2234. (p0) ld8 table_ptr2 = [table_ptr2]
  2235. nop.m 999
  2236. nop.i 999
  2237. }
  2238. ;;
  2239. { .mii
  2240. (p0) add table_ptr2 = 464, table_ptr2
  2241. nop.i 999 ;;
  2242. nop.i 999
  2243. }
  2244. { .mfi
  2245. nop.m 999
  2246. (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
  2247. nop.i 999 ;;
  2248. }
  2249. { .mfi
  2250. (p0) ldfe Q1_7 = [table_ptr2], -16
  2251. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2252. nop.i 999 ;;
  2253. }
  2254. { .mfi
  2255. (p0) ldfe Q1_6 = [table_ptr2], -16
  2256. (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
  2257. nop.i 999 ;;
  2258. }
  2259. { .mmi
  2260. (p0) ldfe Q1_5 = [table_ptr2], -16 ;;
  2261. (p12) ldfe Q1_4 = [table_ptr2], -16
  2262. nop.i 999 ;;
  2263. }
  2264. { .mfi
  2265. (p12) ldfe Q1_3 = [table_ptr2], -16
  2266. //
  2267. // N even: Poly2 = P1_8 + P1_9 * rsq
  2268. // N odd: poly2 = Q1_6 + Q1_7 * rsq
  2269. //
  2270. (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
  2271. nop.i 999 ;;
  2272. }
  2273. { .mfi
  2274. (p12) ldfe Q1_2 = [table_ptr2], -16
  2275. (p12) fma.s1 poly1 = S_hi, r, f1
  2276. nop.i 999 ;;
  2277. }
  2278. { .mfi
  2279. (p12) ldfe Q1_1 = [table_ptr2], -16
  2280. (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
  2281. nop.i 999 ;;
  2282. }
  2283. { .mfi
  2284. nop.m 999
  2285. //
  2286. // N even: CORR = rsq + 1
  2287. // N even: r_to_the_8 = rsq * rsq
  2288. //
  2289. (p11) fmpy.s1 Poly1 = Poly1, rsq
  2290. nop.i 999 ;;
  2291. }
  2292. { .mfi
  2293. nop.m 999
  2294. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2295. nop.i 999
  2296. }
  2297. { .mfi
  2298. nop.m 999
  2299. (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
  2300. nop.i 999 ;;
  2301. }
  2302. { .mfi
  2303. nop.m 999
  2304. (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
  2305. nop.i 999 ;;
  2306. }
  2307. { .mfi
  2308. nop.m 999
  2309. (p12) fma.s1 poly1 = S_hi, r, f1
  2310. nop.i 999
  2311. }
  2312. { .mfi
  2313. nop.m 999
  2314. (p12) fma.s1 poly2 = poly2, rsq, Q1_5
  2315. nop.i 999 ;;
  2316. }
  2317. { .mfi
  2318. nop.m 999
  2319. (p11) fma.s1 Poly2= Poly2, rsq, P1_5
  2320. nop.i 999 ;;
  2321. }
  2322. { .mfi
  2323. nop.m 999
  2324. (p12) fma.s1 S_hi = S_hi, poly1, S_hi
  2325. nop.i 999
  2326. }
  2327. { .mfi
  2328. nop.m 999
  2329. (p12) fma.s1 poly2 = poly2, rsq, Q1_4
  2330. nop.i 999 ;;
  2331. }
  2332. { .mfi
  2333. nop.m 999
  2334. //
  2335. // N even: r_to_the_8 = r_to_the_8 * r_to_the_8
  2336. // N odd: poly1 = S_hi * r + 1.0 64 bits partial
  2337. //
  2338. (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
  2339. nop.i 999 ;;
  2340. }
  2341. { .mfi
  2342. nop.m 999
  2343. //
  2344. // N even: Result = CORR + Poly * r
  2345. // N odd: P = Q1_1 + poly2 * rsq
  2346. //
  2347. (p12) fma.s1 poly1 = S_hi, r, f1
  2348. nop.i 999
  2349. }
  2350. { .mfi
  2351. nop.m 999
  2352. (p12) fma.s1 poly2 = poly2, rsq, Q1_3
  2353. nop.i 999 ;;
  2354. }
  2355. { .mfi
  2356. nop.m 999
  2357. //
  2358. // N even: Poly2 = P1_4 + Poly2 * rsq
  2359. // N odd: poly2 = Q1_2 + poly2 * rsq
  2360. //
  2361. (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
  2362. nop.i 999 ;;
  2363. }
  2364. { .mfi
  2365. nop.m 999
  2366. (p12) fma.s1 poly1 = S_hi, c, poly1
  2367. nop.i 999
  2368. }
  2369. { .mfi
  2370. nop.m 999
  2371. (p12) fma.s1 poly2 = poly2, rsq, Q1_2
  2372. nop.i 999 ;;
  2373. }
  2374. { .mfi
  2375. nop.m 999
  2376. //
  2377. // N even: Poly = Poly1 + Poly2 * r_to_the_8
  2378. // N odd: S_hi = S_hi * poly1 + S_hi 64 bits
  2379. //
  2380. (p11) fma.s1 Result = Poly, r, CORR
  2381. nop.i 999 ;;
  2382. }
  2383. { .mfi
  2384. nop.m 999
  2385. //
  2386. // N even: Result = r + Result (User supplied rounding mode)
  2387. // N odd: poly1 = S_hi * c + poly1
  2388. //
  2389. (p12) fmpy.s1 S_lo = S_hi, poly1
  2390. nop.i 999
  2391. }
  2392. { .mfi
  2393. nop.m 999
  2394. (p12) fma.s1 P = poly2, rsq, Q1_1
  2395. nop.i 999 ;;
  2396. }
  2397. { .mfi
  2398. nop.m 999
  2399. //
  2400. // N odd: poly1 = S_hi * r + 1.0
  2401. //
  2402. (p11) fadd.s0 Result = Result, r
  2403. nop.i 999 ;;
  2404. }
  2405. { .mfi
  2406. nop.m 999
  2407. //
  2408. // N odd: S_lo = S_hi * poly1
  2409. //
  2410. (p12) fma.s1 S_lo = Q1_1, c, S_lo
  2411. nop.i 999
  2412. }
  2413. { .mfi
  2414. nop.m 999
  2415. //
  2416. // N odd: Result = Result + S_hi (user supplied rounding mode)
  2417. //
  2418. (p0) fmpy.s0 Q1_1 = Q1_1, Q1_1
  2419. nop.i 999 ;;
  2420. }
  2421. { .mfi
  2422. nop.m 999
  2423. //
  2424. // N odd: S_lo = Q1_1 * c + S_lo
  2425. //
  2426. (p12) fma.s1 Result = P, r, S_lo
  2427. nop.i 999 ;;
  2428. }
  2429. { .mfb
  2430. nop.m 999
  2431. //
  2432. // N odd: Result = S_lo + r * P
  2433. //
  2434. (p12) fadd.s0 Result = Result, S_hi
  2435. //
  2436. // Do multiply to raise inexact.
  2437. //
  2438. (p0) br.ret.sptk b0 ;;
  2439. }
  2440. TAN_NORMAL_R:
  2441. { .mfi
  2442. (p0) getf.sig sig_r = r
  2443. // *******************************************************************
  2444. // *******************************************************************
  2445. // *******************************************************************
  2446. //
  2447. // r and c have been computed.
  2448. // Make sure ftz mode is set - should be automatic when using wre
  2449. //
  2450. //
  2451. // Get [i_1] - lsb of N_fix_gr alone.
  2452. //
  2453. (p0) fmerge.s Pos_r = f1, r
  2454. (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
  2455. }
  2456. { .mfi
  2457. nop.m 999
  2458. (p0) fmerge.s sgn_r = r, f1
  2459. (p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
  2460. }
  2461. { .mfi
  2462. nop.m 999
  2463. nop.f 999
  2464. (p0) extr.u lookup = sig_r, 58, 5
  2465. }
  2466. { .mlx
  2467. nop.m 999
  2468. (p0) movl Create_B = 0x8200000000000000 ;;
  2469. }
  2470. { .mfi
  2471. (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
  2472. nop.f 999
  2473. (p0) dep Create_B = lookup, Create_B, 58, 5
  2474. }
  2475. ;;
  2476. //
  2477. // Get [i_1] - lsb of N_fix_gr alone.
  2478. // Pos_r = abs (r)
  2479. //
  2480. { .mmi
  2481. ld8 table_ptr1 = [table_ptr1]
  2482. nop.m 999
  2483. nop.i 999
  2484. }
  2485. ;;
  2486. { .mmi
  2487. nop.m 999
  2488. (p0) setf.sig B = Create_B
  2489. //
  2490. // Set table_ptr1 and table_ptr2 to base address of
  2491. // constant table.
  2492. //
  2493. (p0) add table_ptr1 = 480, table_ptr1 ;;
  2494. }
  2495. { .mmb
  2496. nop.m 999
  2497. //
  2498. // Is i_1 or i_0 == 0 ?
  2499. // Create the constant 1 00000 1000000000000000000000...
  2500. //
  2501. (p0) ldfe P2_1 = [table_ptr1], 16
  2502. nop.b 999
  2503. }
  2504. { .mmi
  2505. nop.m 999 ;;
  2506. (p0) getf.exp exp_r = Pos_r
  2507. nop.i 999
  2508. }
  2509. //
  2510. // Get r's exponent
  2511. // Get r's significand
  2512. //
  2513. { .mmi
  2514. (p0) ldfe P2_2 = [table_ptr1], 16 ;;
  2515. //
  2516. // Get the 5 bits or r for the lookup. 1.xxxxx ....
  2517. // from sig_r.
  2518. // Grab lsb of exp of B
  2519. //
  2520. (p0) ldfe P2_3 = [table_ptr1], 16
  2521. nop.i 999 ;;
  2522. }
  2523. { .mii
  2524. nop.m 999
  2525. (p0) andcm table_offset = 0x0001, exp_r ;;
  2526. (p0) shl table_offset = table_offset, 9 ;;
  2527. }
  2528. { .mii
  2529. nop.m 999
  2530. //
  2531. // Deposit 0 00000 1000000000000000000000... on
  2532. // 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
  2533. // getting rid of the ys.
  2534. // Is B = 2** -2 or B= 2** -1? If 2**-1, then
  2535. // we want an offset of 512 for table addressing.
  2536. //
  2537. (p0) shladd table_offset = lookup, 4, table_offset ;;
  2538. //
  2539. // B = ........ 1xxxxx 1000000000000000000...
  2540. //
  2541. (p0) add table_ptr1 = table_ptr1, table_offset ;;
  2542. }
  2543. { .mmb
  2544. nop.m 999
  2545. //
  2546. // B = ........ 1xxxxx 1000000000000000000...
  2547. // Convert B so it has the same exponent as Pos_r
  2548. //
  2549. (p0) ldfd T_hi = [table_ptr1], 8
  2550. nop.b 999 ;;
  2551. }
  2552. //
  2553. // x = |r| - B
  2554. // Load T_hi.
  2555. // Load C_hi.
  2556. //
  2557. { .mmf
  2558. (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
  2559. (p0) ldfs T_lo = [table_ptr1]
  2560. (p0) fmerge.se B = Pos_r, B
  2561. }
  2562. ;;
  2563. { .mmi
  2564. ld8 table_ptr2 = [table_ptr2]
  2565. nop.m 999
  2566. nop.i 999
  2567. }
  2568. ;;
  2569. { .mii
  2570. (p0) add table_ptr2 = 1360, table_ptr2
  2571. nop.i 999 ;;
  2572. (p0) add table_ptr2 = table_ptr2, table_offset ;;
  2573. }
  2574. { .mfi
  2575. (p0) ldfd C_hi = [table_ptr2], 8
  2576. (p0) fsub.s1 x = Pos_r, B
  2577. nop.i 999 ;;
  2578. }
  2579. { .mii
  2580. (p0) ldfs C_lo = [table_ptr2],255
  2581. nop.i 999 ;;
  2582. //
  2583. // xsq = x * x
  2584. // N even: Tx = T_hi * x
  2585. // Load T_lo.
  2586. // Load C_lo - increment pointer to get SC_inv
  2587. // - cant get all the way, do an add later.
  2588. //
  2589. (p0) add table_ptr2 = 569, table_ptr2 ;;
  2590. }
  2591. //
  2592. // N even: Tx1 = Tx + 1
  2593. // N odd: Cx1 = 1 - Cx
  2594. //
  2595. { .mfi
  2596. (p0) ldfe SC_inv = [table_ptr2], 0
  2597. nop.f 999
  2598. nop.i 999 ;;
  2599. }
  2600. { .mfi
  2601. nop.m 999
  2602. (p0) fmpy.s1 xsq = x, x
  2603. nop.i 999
  2604. }
  2605. { .mfi
  2606. nop.m 999
  2607. (p11) fmpy.s1 Tx = T_hi, x
  2608. nop.i 999 ;;
  2609. }
  2610. { .mfi
  2611. nop.m 999
  2612. (p12) fmpy.s1 Cx = C_hi, x
  2613. nop.i 999 ;;
  2614. }
  2615. { .mfi
  2616. nop.m 999
  2617. //
  2618. // N odd: Cx = C_hi * x
  2619. //
  2620. (p0) fma.s1 P = P2_3, xsq, P2_2
  2621. nop.i 999
  2622. }
  2623. { .mfi
  2624. nop.m 999
  2625. //
  2626. // N even and odd: P = P2_3 + P2_2 * xsq
  2627. //
  2628. (p11) fadd.s1 Tx1 = Tx, f1
  2629. nop.i 999 ;;
  2630. }
  2631. { .mfi
  2632. nop.m 999
  2633. //
  2634. // N even: D = C_hi - tanx
  2635. // N odd: D = T_hi + tanx
  2636. //
  2637. (p11) fmpy.s1 CORR = SC_inv, T_hi
  2638. nop.i 999
  2639. }
  2640. { .mfi
  2641. nop.m 999
  2642. (p0) fmpy.s1 Sx = SC_inv, x
  2643. nop.i 999 ;;
  2644. }
  2645. { .mfi
  2646. nop.m 999
  2647. (p12) fmpy.s1 CORR = SC_inv, C_hi
  2648. nop.i 999 ;;
  2649. }
  2650. { .mfi
  2651. nop.m 999
  2652. (p12) fsub.s1 V_hi = f1, Cx
  2653. nop.i 999 ;;
  2654. }
  2655. { .mfi
  2656. nop.m 999
  2657. (p0) fma.s1 P = P, xsq, P2_1
  2658. nop.i 999
  2659. }
  2660. { .mfi
  2661. nop.m 999
  2662. //
  2663. // N even and odd: P = P2_1 + P * xsq
  2664. //
  2665. (p11) fma.s1 V_hi = Tx, Tx1, f1
  2666. nop.i 999 ;;
  2667. }
  2668. { .mfi
  2669. nop.m 999
  2670. //
  2671. // N even: Result = sgn_r * tail + T_hi (user rounding mode for C1)
  2672. // N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1)
  2673. //
  2674. (p0) fmpy.s0 P2_1 = P2_1, P2_1
  2675. nop.i 999 ;;
  2676. }
  2677. { .mfi
  2678. nop.m 999
  2679. (p0) fmpy.s1 CORR = CORR, c
  2680. nop.i 999 ;;
  2681. }
  2682. { .mfi
  2683. nop.m 999
  2684. (p12) fnma.s1 V_hi = Cx,V_hi,f1
  2685. nop.i 999 ;;
  2686. }
  2687. { .mfi
  2688. nop.m 999
  2689. //
  2690. // N even: V_hi = Tx * Tx1 + 1
  2691. // N odd: Cx1 = 1 - Cx * Cx1
  2692. //
  2693. (p0) fmpy.s1 P = P, xsq
  2694. nop.i 999
  2695. }
  2696. { .mfi
  2697. nop.m 999
  2698. //
  2699. // N even and odd: P = P * xsq
  2700. //
  2701. (p11) fmpy.s1 V_hi = V_hi, T_hi
  2702. nop.i 999 ;;
  2703. }
  2704. { .mfi
  2705. nop.m 999
  2706. //
  2707. // N even and odd: tail = P * tail + V_lo
  2708. //
  2709. (p11) fmpy.s1 T_hi = sgn_r, T_hi
  2710. nop.i 999 ;;
  2711. }
  2712. { .mfi
  2713. nop.m 999
  2714. (p0) fmpy.s1 CORR = CORR, sgn_r
  2715. nop.i 999 ;;
  2716. }
  2717. { .mfi
  2718. nop.m 999
  2719. (p12) fmpy.s1 V_hi = V_hi,C_hi
  2720. nop.i 999 ;;
  2721. }
  2722. { .mfi
  2723. nop.m 999
  2724. //
  2725. // N even: V_hi = T_hi * V_hi
  2726. // N odd: V_hi = C_hi * V_hi
  2727. //
  2728. (p0) fma.s1 tanx = P, x, x
  2729. nop.i 999
  2730. }
  2731. { .mfi
  2732. nop.m 999
  2733. (p12) fnmpy.s1 C_hi = sgn_r, C_hi
  2734. nop.i 999 ;;
  2735. }
  2736. { .mfi
  2737. nop.m 999
  2738. //
  2739. // N even: V_lo = 1 - V_hi + C_hi
  2740. // N odd: V_lo = 1 - V_hi + T_hi
  2741. //
  2742. (p11) fadd.s1 CORR = CORR, T_lo
  2743. nop.i 999
  2744. }
  2745. { .mfi
  2746. nop.m 999
  2747. (p12) fsub.s1 CORR = CORR, C_lo
  2748. nop.i 999 ;;
  2749. }
  2750. { .mfi
  2751. nop.m 999
  2752. //
  2753. // N even and odd: tanx = x + x * P
  2754. // N even and odd: Sx = SC_inv * x
  2755. //
  2756. (p11) fsub.s1 D = C_hi, tanx
  2757. nop.i 999
  2758. }
  2759. { .mfi
  2760. nop.m 999
  2761. (p12) fadd.s1 D = T_hi, tanx
  2762. nop.i 999 ;;
  2763. }
  2764. { .mfi
  2765. nop.m 999
  2766. //
  2767. // N odd: CORR = SC_inv * C_hi
  2768. // N even: CORR = SC_inv * T_hi
  2769. //
  2770. (p0) fnma.s1 D = V_hi, D, f1
  2771. nop.i 999 ;;
  2772. }
  2773. { .mfi
  2774. nop.m 999
  2775. //
  2776. // N even and odd: D = 1 - V_hi * D
  2777. // N even and odd: CORR = CORR * c
  2778. //
  2779. (p0) fma.s1 V_hi = V_hi, D, V_hi
  2780. nop.i 999 ;;
  2781. }
  2782. { .mfi
  2783. nop.m 999
  2784. //
  2785. // N even and odd: V_hi = V_hi + V_hi * D
  2786. // N even and odd: CORR = sgn_r * CORR
  2787. //
  2788. (p11) fnma.s1 V_lo = V_hi, C_hi, f1
  2789. nop.i 999
  2790. }
  2791. { .mfi
  2792. nop.m 999
  2793. (p12) fnma.s1 V_lo = V_hi, T_hi, f1
  2794. nop.i 999 ;;
  2795. }
  2796. { .mfi
  2797. nop.m 999
  2798. //
  2799. // N even: CORR = COOR + T_lo
  2800. // N odd: CORR = CORR - C_lo
  2801. //
  2802. (p11) fma.s1 V_lo = tanx, V_hi, V_lo
  2803. nop.i 999
  2804. }
  2805. { .mfi
  2806. nop.m 999
  2807. (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
  2808. nop.i 999 ;;
  2809. }
  2810. { .mfi
  2811. nop.m 999
  2812. //
  2813. // N even: V_lo = V_lo + V_hi * tanx
  2814. // N odd: V_lo = V_lo - V_hi * tanx
  2815. //
  2816. (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
  2817. nop.i 999
  2818. }
  2819. { .mfi
  2820. nop.m 999
  2821. (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
  2822. nop.i 999 ;;
  2823. }
  2824. { .mfi
  2825. nop.m 999
  2826. //
  2827. // N even: V_lo = V_lo - V_hi * C_lo
  2828. // N odd: V_lo = V_lo - V_hi * T_lo
  2829. //
  2830. (p0) fmpy.s1 V_lo = V_hi, V_lo
  2831. nop.i 999 ;;
  2832. }
  2833. { .mfi
  2834. nop.m 999
  2835. //
  2836. // N even and odd: V_lo = V_lo * V_hi
  2837. //
  2838. (p0) fadd.s1 tail = V_hi, V_lo
  2839. nop.i 999 ;;
  2840. }
  2841. { .mfi
  2842. nop.m 999
  2843. //
  2844. // N even and odd: tail = V_hi + V_lo
  2845. //
  2846. (p0) fma.s1 tail = tail, P, V_lo
  2847. nop.i 999 ;;
  2848. }
  2849. { .mfi
  2850. nop.m 999
  2851. //
  2852. // N even: T_hi = sgn_r * T_hi
  2853. // N odd : C_hi = -sgn_r * C_hi
  2854. //
  2855. (p0) fma.s1 tail = tail, Sx, CORR
  2856. nop.i 999 ;;
  2857. }
  2858. { .mfi
  2859. nop.m 999
  2860. //
  2861. // N even and odd: tail = Sx * tail + CORR
  2862. //
  2863. (p0) fma.s1 tail = V_hi, Sx, tail
  2864. nop.i 999 ;;
  2865. }
  2866. { .mfi
  2867. nop.m 999
  2868. //
  2869. // N even an odd: tail = Sx * V_hi + tail
  2870. //
  2871. (p11) fma.s0 Result = sgn_r, tail, T_hi
  2872. nop.i 999
  2873. }
  2874. { .mfb
  2875. nop.m 999
  2876. (p12) fma.s0 Result = sgn_r, tail, C_hi
  2877. //
  2878. // Do a multiply to raise inexact.
  2879. //
  2880. (p0) br.ret.sptk b0 ;;
  2881. }
  2882. .endp __libm_tan
  2883. // *******************************************************************
  2884. // *******************************************************************
  2885. // *******************************************************************
  2886. //
  2887. // Special Code to handle very large argument case.
  2888. // Call int pi_by_2_reduce(&x,&r)
  2889. // for |arguments| >= 2**63
  2890. // (Arg or x) is in f8
  2891. // Address to save r and c as double
  2892. // (1) (2) (3) (call) (4)
  2893. // sp -> + psp -> + psp -> + sp -> +
  2894. // | | | |
  2895. // | r50 ->| <- r50 f0 ->| r50 -> | -> c
  2896. // | | | |
  2897. // sp-32 -> | <- r50 f0 ->| f0 ->| <- r50 r49 -> | -> r
  2898. // | | | |
  2899. // | r49 ->| <- r49 Arg ->| <- r49 | -> x
  2900. // | | | |
  2901. // sp -64 ->| sp -64 ->| sp -64 ->| |
  2902. //
  2903. // save pfs save b0 restore gp
  2904. // save gp restore b0
  2905. // restore pfs
  2906. .proc __libm_callout
  2907. __libm_callout:
  2908. TAN_ARG_TOO_LARGE:
  2909. .prologue
  2910. // (1)
  2911. { .mfi
  2912. add GR_Parameter_r =-32,sp // Parameter: r address
  2913. nop.f 0
  2914. .save ar.pfs,GR_SAVE_PFS
  2915. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  2916. }
  2917. { .mfi
  2918. .fframe 64
  2919. add sp=-64,sp // Create new stack
  2920. nop.f 0
  2921. mov GR_SAVE_GP=gp // Save gp
  2922. };;
  2923. // (2)
  2924. { .mmi
  2925. stfe [GR_Parameter_r ] = f0,16 // Clear Parameter r on stack
  2926. add GR_Parameter_X = 16,sp // Parameter x address
  2927. .save b0, GR_SAVE_B0
  2928. mov GR_SAVE_B0=b0 // Save b0
  2929. };;
  2930. // (3)
  2931. .body
  2932. { .mib
  2933. stfe [GR_Parameter_r ] = f0,-16 // Clear Parameter c on stack
  2934. nop.i 0
  2935. nop.b 0
  2936. }
  2937. { .mib
  2938. stfe [GR_Parameter_X] = Arg // Store Parameter x on stack
  2939. nop.i 0
  2940. (p0) br.call.sptk b0=__libm_pi_by_2_reduce#
  2941. }
  2942. ;;
  2943. // (4)
  2944. { .mmi
  2945. mov gp = GR_SAVE_GP // Restore gp
  2946. (p0) mov N_fix_gr = r8
  2947. nop.i 999
  2948. }
  2949. ;;
  2950. { .mmi
  2951. (p0) ldfe Arg =[GR_Parameter_X],16
  2952. (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
  2953. nop.i 999
  2954. }
  2955. ;;
  2956. { .mmb
  2957. (p0) ldfe r =[GR_Parameter_r ],16
  2958. (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4
  2959. nop.b 999 ;;
  2960. }
  2961. { .mfi
  2962. (p0) ldfe c =[GR_Parameter_r ]
  2963. nop.f 999
  2964. nop.i 999 ;;
  2965. }
  2966. { .mfi
  2967. nop.m 999
  2968. //
  2969. // Is |r| < 2**(-2)
  2970. //
  2971. (p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2
  2972. mov b0 = GR_SAVE_B0 // Restore return address
  2973. }
  2974. ;;
  2975. { .mfi
  2976. nop.m 999
  2977. (p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
  2978. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  2979. }
  2980. ;;
  2981. { .mbb
  2982. .restore
  2983. add sp = 64,sp // Restore stack pointer
  2984. (p6) br.cond.spnt TAN_SMALL_R
  2985. (p0) br.cond.sptk TAN_NORMAL_R
  2986. }
  2987. ;;
  2988. .endp __libm_callout
  2989. .proc __libm_TAN_SPECIAL
  2990. __libm_TAN_SPECIAL:
  2991. //
  2992. // Code for NaNs, Unsupporteds, Infs, or +/- zero ?
  2993. // Invalid raised for Infs and SNaNs.
  2994. //
  2995. { .mfb
  2996. nop.m 999
  2997. (p0) fmpy.s0 Arg = Arg, f0
  2998. (p0) br.ret.sptk b0
  2999. }
  3000. .endp __libm_TAN_SPECIAL
  3001. .type __libm_pi_by_2_reduce#,@function
  3002. .global __libm_pi_by_2_reduce#