Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1515 lines
42 KiB

  1. .file "libm_reduce.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History: 02/02/00 Initial Version
  27. //
  28. //*********************************************************************
  29. //*********************************************************************
  30. //
  31. // Function: __libm_pi_by_two_reduce(x) return r, c, and N where
  32. // x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
  33. // This function is not designed to be used by the
  34. // general user.
  35. //
  36. //*********************************************************************
  37. //
  38. // Accuracy: Returns double-precision values
  39. //
  40. //*********************************************************************
  41. //
  42. // Resources Used:
  43. //
  44. // Floating-Point Registers: f32-f70
  45. //
  46. // General Purpose Registers:
  47. // r8 = return value N
  48. // r32 = Address of x
  49. // r33 = Address of where to place r and then c
  50. // r34-r64
  51. //
  52. // Predicate Registers: p6-p14
  53. //
  54. //*********************************************************************
  55. //
  56. // IEEE Special Conditions:
  57. //
  58. // No condions should be raised.
  59. //
  60. //*********************************************************************
  61. //
  62. // I. Introduction
  63. // ===============
  64. //
  65. // For the forward trigonometric functions sin, cos, sincos, and
  66. // tan, the original algorithms for IA 64 handle arguments up to
  67. // 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
  68. // |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where
  69. //
  70. // x is accurately approximated by
  71. // 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4.
  72. // CASE = 1 or 2.
  73. // CASE is 1 unless |r_hi + r_lo| < 2^(-33).
  74. //
  75. // The exact value of K is not determined, but that information is
  76. // not required in trigonometric function computations.
  77. //
  78. // We first assume the argument x in question satisfies x >= 2^(63).
  79. // In particular, it is positive. Negative x can be handled by symmetry:
  80. //
  81. // -x is accurately approximated by
  82. // -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4.
  83. //
  84. // The idea of the reduction is that
  85. //
  86. // x * 2/pi = N_big + N + f, |f| <= 1/2
  87. //
  88. // Moreover, for double extended x, |f| >= 2^(-75). (This is an
  89. // non-obvious fact found by enumeration using a special algorithm
  90. // involving continued fraction.) The algorithm described below
  91. // calculates N and an accurate approximation of f.
  92. //
  93. // Roughly speaking, an appropriate 256-bit (4 X 64) portion of
  94. // 2/pi is multiplied with x to give the desired information.
  95. //
  96. // II. Representation of 2/PI
  97. // ==========================
  98. //
  99. // The value of 2/pi in binary fixed-point is
  100. //
  101. // .101000101111100110......
  102. //
  103. // We store 2/pi in a table, starting at the position corresponding
  104. // to bit position 63
  105. //
  106. // bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576
  107. //
  108. // 0 0 ... 0 . 1 0 1 0 1 0 1 .... X
  109. //
  110. // ^
  111. // |__ implied binary pt
  112. //
  113. // III. Algorithm
  114. // ==============
  115. //
  116. // This describes the algorithm in the most natural way using
  117. // unsigned interger multiplication. The implementation section
  118. // describes how the integer arithmetic is simulated.
  119. //
  120. // STEP 0. Initialization
  121. // ----------------------
  122. //
  123. // Let the input argument x be
  124. //
  125. // x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383.
  126. //
  127. // The first crucial step is to fetch four 64-bit portions of 2/pi.
  128. // To fulfill this goal, we calculate the bit position L of the
  129. // beginning of these 256-bit quantity by
  130. //
  131. // L := 62 - m.
  132. //
  133. // Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
  134. // the storage of 2/pi is adequate.
  135. //
  136. // Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
  137. //
  138. // bit position L L-1 L-2 ... L-63
  139. //
  140. // P_1 = b b b ... b
  141. //
  142. // each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
  143. // bit positions L+2 and L+1. So, when each of the P_j is interpreted
  144. // with appropriate scaling, we have
  145. //
  146. // 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small
  147. //
  148. // Note that P_big and P_small can be ignored. The reasons are as follow.
  149. // First, consider P_big. If P_big = 0, we can certainly ignore it.
  150. // Otherwise, P_big >= 2^(L+3). Now,
  151. //
  152. // P_big * ulp(x) >= 2^(L+3) * 2^(m-63)
  153. // >= 2^(65-m + m-63 )
  154. // >= 2^2
  155. //
  156. // Thus, P_big * x is an integer of the form 4*K. So
  157. //
  158. // x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
  159. // + x*P_small*(pi/2).
  160. //
  161. // Hence, P_big*x corresponds to information that can be ignored for
  162. // trigonometic function evaluation.
  163. //
  164. // Next, we must estimate the effect of ignoring P_small. The absolute
  165. // error made by ignoring P_small is bounded by
  166. //
  167. // |P_small * x| <= ulp(P_4) * x
  168. // <= 2^(L-255) * 2^(m+1)
  169. // <= 2^(62-m-255 + m + 1)
  170. // <= 2^(-192)
  171. //
  172. // Since for double-extended precision, x * 2/pi = integer + f,
  173. // 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
  174. // P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
  175. //
  176. // Further note that if x is split into x_hi + x_lo where x_lo is the
  177. // two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
  178. //
  179. // P_0 * x_hi
  180. //
  181. // is also an integer of the form 4*K; and thus can also be ignored.
  182. // Let M := P_0 * x_lo which is a small integer. The main part of the
  183. // calculation is really the multiplication of x with the four pieces
  184. // P_1, P_2, P_3, and P_4.
  185. //
  186. // Unless the reduced argument is extremely small in magnitude, it
  187. // suffices to carry out the multiplication of x with P_1, P_2, and
  188. // P_3. x*P_4 will be carried out and added on as a correction only
  189. // when it is found to be needed. Note also that x*P_4 need not be
  190. // computed exactly. A straightforward multiplication suffices since
  191. // the rounding error thus produced would be bounded by 2^(-3*64),
  192. // that is 2^(-192) which is small enough as the reduced argument
  193. // is bounded from below by 2^(-75).
  194. //
  195. // Now that we have four 64-bit data representing 2/pi and a
  196. // 64-bit x. We first need to calculate a highly accurate product
  197. // of x and P_1, P_2, P_3. This is best understood as integer
  198. // multiplication.
  199. //
  200. //
  201. // STEP 1. Multiplication
  202. // ----------------------
  203. //
  204. //
  205. // --------- --------- ---------
  206. // | P_1 | | P_2 | | P_3 |
  207. // --------- --------- ---------
  208. //
  209. // ---------
  210. // X | X |
  211. // ---------
  212. // ----------------------------------------------------
  213. //
  214. // --------- ---------
  215. // | A_hi | | A_lo |
  216. // --------- ---------
  217. //
  218. //
  219. // --------- ---------
  220. // | B_hi | | B_lo |
  221. // --------- ---------
  222. //
  223. //
  224. // --------- ---------
  225. // | C_hi | | C_lo |
  226. // --------- ---------
  227. //
  228. // ====================================================
  229. // --------- --------- --------- ---------
  230. // | S_0 | | S_1 | | S_2 | | S_3 |
  231. // --------- --------- --------- ---------
  232. //
  233. //
  234. //
  235. // STEP 2. Get N and f
  236. // -------------------
  237. //
  238. // Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
  239. // we have to sum them and obtain an integer part, N, and a fraction, f.
  240. // Here, |f| <= 1/2, and N is an integer. Note also that N need only to
  241. // be known to module 2^k, k >= 2. In the case when |f| is small enough,
  242. // we would need to add in the value x*P_4.
  243. //
  244. //
  245. // STEP 3. Get reduced argument
  246. // ----------------------------
  247. //
  248. // The value f is not yet the reduced argument that we seek. The
  249. // equation
  250. //
  251. // x * 2/pi = 4K + N + f
  252. //
  253. // says that
  254. //
  255. // x = 2*K*pi + N * pi/2 + f * (pi/2).
  256. //
  257. // Thus, the reduced argument is given by
  258. //
  259. // reduced argument = f * pi/2.
  260. //
  261. // This multiplication must be performed to extra precision.
  262. //
  263. // IV. Implementation
  264. // ==================
  265. //
  266. // Step 0. Initialization
  267. // ----------------------
  268. //
  269. // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
  270. //
  271. // In memory, 2/pi is stored contigously as
  272. //
  273. // 0x00000000 0x00000000 0xA2F....
  274. // ^
  275. // |__ implied binary bit
  276. //
  277. // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
  278. // -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
  279. //
  280. // P_0 is the two bits corresponding to bit positions L+2 and L+1
  281. // P_1 is the 64-bit starting at bit position L
  282. // P_2 is the 64-bit starting at bit position L-64
  283. // P_3 is the 64-bit starting at bit position L-128
  284. // P_4 is the 64-bit starting at bit position L-192
  285. //
  286. // For example, if m = 63, P_0 would be 0 and P_1 would look like
  287. // 0xA2F...
  288. //
  289. // If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
  290. // P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 ....
  291. //
  292. // Step 1. Multiplication
  293. // ----------------------
  294. //
  295. // At this point, P_1, P_2, P_3, P_4 are integers. They are
  296. // supposed to be interpreted as
  297. //
  298. // 2^(L-63) * P_1;
  299. // 2^(L-63-64) * P_2;
  300. // 2^(L-63-128) * P_3;
  301. // 2^(L-63-192) * P_4;
  302. //
  303. // Since each of them need to be multiplied to x, we would scale
  304. // both x and the P_j's by some convenient factors: scale each
  305. // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
  306. //
  307. // p_1 := fcvt.xf ( P_1 )
  308. // p_2 := fcvt.xf ( P_2 ) * 2^(-64)
  309. // p_3 := fcvt.xf ( P_3 ) * 2^(-128)
  310. // p_4 := fcvt.xf ( P_4 ) * 2^(-192)
  311. // x := replace exponent of x by -1
  312. // because 2^m * 1.xxxx...xxx * 2^(L-63)
  313. // is 2^(-1) * 1.xxxx...xxx
  314. //
  315. // We are now faced with the task of computing the following
  316. //
  317. // --------- --------- ---------
  318. // | P_1 | | P_2 | | P_3 |
  319. // --------- --------- ---------
  320. //
  321. // ---------
  322. // X | X |
  323. // ---------
  324. // ----------------------------------------------------
  325. //
  326. // --------- ---------
  327. // | A_hi | | A_lo |
  328. // --------- ---------
  329. //
  330. // --------- ---------
  331. // | B_hi | | B_lo |
  332. // --------- ---------
  333. //
  334. // --------- ---------
  335. // | C_hi | | C_lo |
  336. // --------- ---------
  337. //
  338. // ====================================================
  339. // ----------- --------- --------- ---------
  340. // | S_0 | | S_1 | | S_2 | | S_3 |
  341. // ----------- --------- --------- ---------
  342. // ^ ^
  343. // | |___ binary point
  344. // |
  345. // |___ possibly one more bit
  346. //
  347. // Let FPSR3 be set to round towards zero with widest precision
  348. // and exponent range. Unless an explicit FPSR is given,
  349. // round-to-nearest with widest precision and exponent range is
  350. // used.
  351. //
  352. // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
  353. //
  354. // Tmp_C := fmpy.fpsr3( x, p_1 );
  355. // If Tmp_C >= sigma_C then
  356. // C_hi := Tmp_C;
  357. // C_lo := x*p_1 - C_hi ...fma, exact
  358. // Else
  359. // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
  360. // ...subtraction is exact, regardless
  361. // ...of rounding direction
  362. // C_lo := x*p_1 - C_hi ...fma, exact
  363. // End If
  364. //
  365. // Tmp_B := fmpy.fpsr3( x, p_2 );
  366. // If Tmp_B >= sigma_B then
  367. // B_hi := Tmp_B;
  368. // B_lo := x*p_2 - B_hi ...fma, exact
  369. // Else
  370. // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
  371. // ...subtraction is exact, regardless
  372. // ...of rounding direction
  373. // B_lo := x*p_2 - B_hi ...fma, exact
  374. // End If
  375. //
  376. // Tmp_A := fmpy.fpsr3( x, p_3 );
  377. // If Tmp_A >= sigma_A then
  378. // A_hi := Tmp_A;
  379. // A_lo := x*p_3 - A_hi ...fma, exact
  380. // Else
  381. // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
  382. // ...subtraction is exact, regardless
  383. // ...of rounding direction
  384. // A_lo := x*p_3 - A_hi ...fma, exact
  385. // End If
  386. //
  387. // ...Note that C_hi is of integer value. We need only the
  388. // ...last few bits. Thus we can ensure C_hi is never a big
  389. // ...integer, freeing us from overflow worry.
  390. //
  391. // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
  392. // ...Tmp_C is the upper portion of C_hi
  393. // C_hi := C_hi - Tmp_C
  394. // ...0 <= C_hi < 2^7
  395. //
  396. // Step 2. Get N and f
  397. // -------------------
  398. //
  399. // At this point, we have all the components to obtain
  400. // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
  401. // C_lo and B_hi. This sum together with C_hi gives a good
  402. // estimation of N and f.
  403. //
  404. // A := fadd.fpsr3( B_hi, C_lo )
  405. // B := max( B_hi, C_lo )
  406. // b := min( B_hi, C_lo )
  407. //
  408. // a := (B - A) + b ...exact. Note that a is either 0
  409. // ...or 2^(-64).
  410. //
  411. // N := round_to_nearest_integer_value( A );
  412. // f := A - N; ...exact because lsb(A) >= 2^(-64)
  413. // ...and |f| <= 1/2.
  414. //
  415. // f := f + a ...exact because a is 0 or 2^(-64);
  416. // ...the msb of the sum is <= 1/2
  417. // ...lsb >= 2^(-64).
  418. //
  419. // N := convert to integer format( C_hi + N );
  420. // M := P_0 * x_lo;
  421. // N := N + M;
  422. //
  423. // If sgn_x == 1 (that is original x was negative)
  424. // N := 2^10 - N
  425. // ...this maintains N to be non-negative, but still
  426. // ...equivalent to the (negated N) mod 4.
  427. // End If
  428. //
  429. // If |f| >= 2^(-33)
  430. //
  431. // ...Case 1
  432. // CASE := 1
  433. // g := A_hi + B_lo;
  434. // s_hi := f + g;
  435. // s_lo := (f - s_hi) + g;
  436. //
  437. // Else
  438. //
  439. // ...Case 2
  440. // CASE := 2
  441. // A := fadd.fpsr3( A_hi, B_lo )
  442. // B := max( A_hi, B_lo )
  443. // b := min( A_hi, B_lo )
  444. //
  445. // a := (B - A) + b ...exact. Note that a is either 0
  446. // ...or 2^(-128).
  447. //
  448. // f_hi := A + f;
  449. // f_lo := (f - f_hi) + A;
  450. // ...this is exact.
  451. // ...f-f_hi is exact because either |f| >= |A|, in which
  452. // ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
  453. // ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
  454. // ...If f = 2^(-64), f-f_hi involves cancellation and is
  455. // ...exact. If f = -2^(-64), then A + f is exact. Hence
  456. // ...f-f_hi is -A exactly, giving f_lo = 0.
  457. //
  458. // f_lo := f_lo + a;
  459. //
  460. // If |f| >= 2^(-50) then
  461. // s_hi := f_hi;
  462. // s_lo := f_lo;
  463. // Else
  464. // f_lo := (f_lo + A_lo) + x*p_4
  465. // s_hi := f_hi + f_lo
  466. // s_lo := (f_hi - s_hi) + f_lo
  467. // End If
  468. //
  469. // End If
  470. //
  471. // Step 3. Get reduced argument
  472. // ----------------------------
  473. //
  474. // If sgn_x == 0 (that is original x is positive)
  475. //
  476. // D_hi := Pi_by_2_hi
  477. // D_lo := Pi_by_2_lo
  478. // ...load from table
  479. //
  480. // Else
  481. //
  482. // D_hi := neg_Pi_by_2_hi
  483. // D_lo := neg_Pi_by_2_lo
  484. // ...load from table
  485. // End If
  486. //
  487. // r_hi := s_hi*D_hi
  488. // r_lo := s_hi*D_hi - r_hi ...fma
  489. // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
  490. //
  491. // Return CASE, N, r_hi, r_lo
  492. //
  493. FR_X = f32
  494. FR_N = f33
  495. FR_p_1 = f34
  496. FR_TWOM33 = f35
  497. FR_TWOM50 = f36
  498. FR_g = f37
  499. FR_p_2 = f38
  500. FR_f = f39
  501. FR_s_lo = f40
  502. FR_p_3 = f41
  503. FR_f_abs = f42
  504. FR_D_lo = f43
  505. FR_p_4 = f44
  506. FR_D_hi = f45
  507. FR_Tmp2_C = f46
  508. FR_s_hi = f47
  509. FR_sigma_A = f48
  510. FR_A = f49
  511. FR_sigma_B = f50
  512. FR_B = f51
  513. FR_sigma_C = f52
  514. FR_b = f53
  515. FR_ScaleP2 = f54
  516. FR_ScaleP3 = f55
  517. FR_ScaleP4 = f56
  518. FR_Tmp_A = f57
  519. FR_Tmp_B = f58
  520. FR_Tmp_C = f59
  521. FR_A_hi = f60
  522. FR_f_hi = f61
  523. FR_r_hi = f62
  524. FR_A_lo = f63
  525. FR_B_hi = f64
  526. FR_a = f65
  527. FR_B_lo = f66
  528. FR_f_lo = f67
  529. FR_r_lo = f68
  530. FR_C_hi = f69
  531. FR_C_lo = f70
  532. GR_N = r8
  533. GR_Address_of_Input = r32
  534. GR_Address_of_Outputs = r33
  535. GR_Exp_x = r36
  536. GR_Temp = r37
  537. GR_BIASL63 = r38
  538. GR_CASE = r39
  539. GR_x_lo = r40
  540. GR_sgn_x = r41
  541. GR_M = r42
  542. GR_BASE = r43
  543. GR_LENGTH1 = r44
  544. GR_LENGTH2 = r45
  545. GR_ASUB = r46
  546. GR_P_0 = r47
  547. GR_P_1 = r48
  548. GR_P_2 = r49
  549. GR_P_3 = r50
  550. GR_P_4 = r51
  551. GR_START = r52
  552. GR_SEGMENT = r53
  553. GR_A = r54
  554. GR_B = r55
  555. GR_C = r56
  556. GR_D = r57
  557. GR_E = r58
  558. GR_TEMP1 = r59
  559. GR_TEMP2 = r60
  560. GR_TEMP3 = r61
  561. GR_TEMP4 = r62
  562. GR_TEMP5 = r63
  563. GR_TEMP6 = r64
  564. .align 64
  565. .data
  566. Constants_Bits_of_2_by_pi:
  567. data8 0x0000000000000000,0xA2F9836E4E441529
  568. data8 0xFC2757D1F534DDC0,0xDB6295993C439041
  569. data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
  570. data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
  571. data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
  572. data8 0x3991D639835339F4,0x9C845F8BBDF9283B
  573. data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
  574. data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
  575. data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
  576. data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
  577. data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
  578. data8 0x6599855F14A06840,0x8DFFD8804D732731
  579. data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
  580. data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
  581. data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
  582. data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
  583. data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
  584. data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
  585. data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
  586. data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
  587. data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
  588. data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
  589. data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
  590. data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
  591. data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
  592. data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
  593. data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
  594. data8 0x1DF35BE01834132E,0x6212830148835B8E
  595. data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
  596. data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
  597. data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
  598. data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
  599. data8 0x36D9CAD2A8288D61,0xC277C9121426049B
  600. data8 0x4612C459C444C5C8,0x91B24DF31700AD43
  601. data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
  602. data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
  603. data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
  604. data8 0x2EC292472F327B6D,0x550C90A7721FE76B
  605. data8 0x96CB314A1679E279,0x4189DFF49794E884
  606. data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
  607. data8 0x486CA46742727132,0x5D8DB8159F09E5BC
  608. data8 0x25318D3974F71C05,0x30010C0D68084B58
  609. data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
  610. data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
  611. data8 0x3398207E4BF56863,0xB25F3EDD035D407F
  612. data8 0x8985295255C06437,0x10D86D324832754C
  613. data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
  614. data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
  615. data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
  616. data8 0x9089D98850722CBE,0xA4049407777030F3
  617. data8 0x27FC00A871EA49C2,0x663DE06483DD9797
  618. data8 0x3FA3FD94438C860D,0xDE41319D39928C70
  619. data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
  620. data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
  621. data8 0x185915A562BBCB61,0xB989C7BD401004F2
  622. data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
  623. data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
  624. data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
  625. data8 0x03F6F0098C402B99,0x316D07B43915200C
  626. data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
  627. data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
  628. data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
  629. data8 0x664D64B705ED3065,0x29BF56573AFF47B9
  630. data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
  631. data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
  632. data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
  633. data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
  634. data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
  635. data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
  636. data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
  637. data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
  638. data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
  639. data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
  640. data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
  641. data8 0x156E85FF87FD073E,0x2833676186182AEA
  642. data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
  643. data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
  644. data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
  645. data8 0x1262845CB9496170,0xE0566B0152993755
  646. data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
  647. data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
  648. data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
  649. data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
  650. data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
  651. data8 0x0027F0147F8607E1,0x640B148D4196DEBE
  652. data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
  653. data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
  654. data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
  655. data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
  656. data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
  657. data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
  658. data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
  659. data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
  660. data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
  661. data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
  662. data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
  663. data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
  664. data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
  665. data8 0x24BA74607DE58AD8,0x742C150D0C188194
  666. data8 0x667E162901767A9F,0xBEFDFDEF4556367E
  667. data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
  668. data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
  669. data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
  670. data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
  671. data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
  672. data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
  673. data8 0x538C994ECC2254DC,0x552AD6C6C096190B
  674. data8 0xB8701A649569605A,0x26EE523F0F117F11
  675. data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
  676. data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
  677. data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
  678. data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
  679. data8 0x56AE79E536228922,0xAD38DC9367AAE855
  680. data8 0x3826829BE7CAA40D,0x51B133990ED7A948
  681. data8 0x0569F0B265A7887F,0x974C8836D1F9B392
  682. data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
  683. data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
  684. data8 0xF65523882B55BA41,0x086E59862A218347
  685. data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
  686. data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
  687. data8 0xC5476243853B8621,0x94792C8761107B4C
  688. data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
  689. data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
  690. data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
  691. data8 0x6919949A9529A828,0xCE68B4ED09209F44
  692. data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
  693. data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
  694. data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
  695. data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
  696. data8 0x34007700D255F4FC,0x4D59018071E0E13F
  697. data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
  698. Constants_Bits_of_pi_by_2:
  699. data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000
  700. data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000
  701. .section .text
  702. .proc __libm_pi_by_2_reduce#
  703. .global __libm_pi_by_2_reduce#
  704. .align 64
  705. __libm_pi_by_2_reduce:
  706. // X is at the address in Address_of_Input
  707. // Place the two-piece result at the address in Address_of_Outputs
  708. // r followed by c
  709. // N is returned
  710. { .mmf
  711. alloc r34 = ar.pfs,2,34,0,0
  712. ldfe FR_X = [GR_Address_of_Input]
  713. fsetc.s3 0x00,0x7F ;;
  714. }
  715. { .mlx
  716. nop.m 999
  717. movl GR_BIASL63 = 0x1003E
  718. }
  719. ;;
  720. // L -1-2-3-4
  721. // 0 0 0 0 0. 1 0 1 0
  722. // M 0 1 2 .... 63, 64 65 ... 127, 128
  723. // ---------------------------------------------
  724. // Segment 0. 1 , 2 , 3
  725. // START = M - 63 M = 128 becomes 65
  726. // LENGTH1 = START & 0x3F 65 become position 1
  727. // SEGMENT = shr(START,6) + 1 0 maps to 1, 64 maps to 2,
  728. // LENGTH2 = 64 - LENGTH1
  729. // Address_BASE = shladd(SEGMENT,3) + BASE
  730. { .mmi
  731. nop.m 999
  732. addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp
  733. nop.i 999
  734. }
  735. ;;
  736. { .mmi
  737. ld8 GR_BASE = [GR_BASE]
  738. nop.m 999
  739. nop.i 999
  740. }
  741. ;;
  742. { .mlx
  743. nop.m 999
  744. movl GR_TEMP5 = 0x000000000000FFFE
  745. }
  746. { .mmi
  747. nop.m 999 ;;
  748. setf.exp FR_sigma_B = GR_TEMP5
  749. nop.i 999
  750. }
  751. { .mlx
  752. nop.m 999
  753. movl GR_TEMP6 = 0x000000000000FFBE ;;
  754. }
  755. // Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
  756. { .mfi
  757. setf.exp FR_sigma_A = GR_TEMP6
  758. nop.f 999
  759. nop.i 999 ;;
  760. }
  761. // Special Code for testing DE arguments
  762. // movl GR_BIASL63 = 0x0000000000013FFE
  763. // movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
  764. // setf.exp FR_X = GR_BIASL63
  765. // setf.sig FR_ScaleP3 = GR_x_lo
  766. // fmerge.se FR_X = FR_X,FR_ScaleP3
  767. // Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
  768. // 2/pi is stored contigously as
  769. // 0x00000000 0x00000000.0xA2F....
  770. // M = EXP - BIAS ( M >= 63)
  771. // Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
  772. // Thus -1 <= L <= -16321.
  773. { .mmf
  774. getf.exp GR_Exp_x = FR_X
  775. getf.sig GR_x_lo = FR_X
  776. fabs FR_X = FR_X ;;
  777. }
  778. { .mii
  779. and GR_x_lo = 0x03,GR_x_lo
  780. extr.u GR_M = GR_Exp_x,0,17 ;;
  781. sub GR_START = GR_M,GR_BIASL63
  782. }
  783. { .mmi
  784. nop.m 999 ;;
  785. and GR_LENGTH1 = 0x3F,GR_START
  786. shr.u GR_SEGMENT = GR_START,6
  787. }
  788. { .mmi
  789. nop.m 999 ;;
  790. add GR_SEGMENT = 0x1,GR_SEGMENT
  791. sub GR_LENGTH2 = 0x40,GR_LENGTH1
  792. }
  793. // P_0 is the two bits corresponding to bit positions L+2 and L+1
  794. // P_1 is the 64-bit starting at bit position L
  795. // P_2 is the 64-bit starting at bit position L-64
  796. // P_3 is the 64-bit starting at bit position L-128
  797. // P_4 is the 64-bit starting at bit position L-192
  798. // P_1 is made up of Alo and Bhi
  799. // P_1 = deposit Alo, position 0, length2 into P_1,position length1
  800. // deposit Bhi, position length2, length1 into P_1, position 0
  801. // P_2 is made up of Blo and Chi
  802. // P_2 = deposit Blo, position 0, length2 into P_2, position length1
  803. // deposit Chi, position length2, length1 into P_2, position 0
  804. // P_3 is made up of Clo and Dhi
  805. // P_3 = deposit Clo, position 0, length2 into P_3, position length1
  806. // deposit Dhi, position length2, length1 into P_3, position 0
  807. // P_4 is made up of Clo and Dhi
  808. // P_4 = deposit Dlo, position 0, length2 into P_4, position length1
  809. // deposit Ehi, position length2, length1 into P_4, position 0
  810. { .mmi
  811. cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;;
  812. shladd GR_BASE = GR_SEGMENT,3,GR_BASE
  813. (p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;;
  814. }
  815. { .mmi
  816. nop.m 999
  817. // ld_64 A at Base and increment Base by 8
  818. // ld_64 B at Base and increment Base by 8
  819. // ld_64 C at Base and increment Base by 8
  820. // ld_64 D at Base and increment Base by 8
  821. // ld_64 E at Base and increment Base by 8
  822. // A/B/C/D
  823. // ---------------------
  824. // A, B, C, D, and E look like | length1 | length2 |
  825. // ---------------------
  826. // hi lo
  827. ld8 GR_A = [GR_BASE],8
  828. extr.u GR_sgn_x = GR_Exp_x,17,1 ;;
  829. }
  830. { .mmf
  831. nop.m 999
  832. ld8 GR_B = [GR_BASE],8
  833. fmerge.se FR_X = FR_sigma_B,FR_X ;;
  834. }
  835. { .mii
  836. ld8 GR_C = [GR_BASE],8
  837. (p8) extr.u GR_Temp = GR_A,63,1 ;;
  838. shl GR_TEMP1 = GR_A,GR_LENGTH1
  839. }
  840. { .mii
  841. ld8 GR_D = [GR_BASE],8
  842. // If length1 >= 2,
  843. // P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
  844. (p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 ;;
  845. shl GR_TEMP2 = GR_B,GR_LENGTH1
  846. }
  847. { .mii
  848. ld8 GR_E = [GR_BASE],-40
  849. shr.u GR_P_1 = GR_B,GR_LENGTH2 ;;
  850. shr.u GR_P_2 = GR_C,GR_LENGTH2
  851. }
  852. // Else
  853. // Load 16 bit of ASUB from (Base_Address_of_A - 2)
  854. // P_0 = ASUB & 0x3
  855. // If length1 == 0,
  856. // P_0 complete
  857. // Else
  858. // Deposit element 63 from Ahi and place in element 0 of P_0.
  859. // Endif
  860. // Endif
  861. { .mii
  862. (p7) ld2 GR_ASUB = [GR_BASE],8
  863. shl GR_TEMP3 = GR_C,GR_LENGTH1 ;;
  864. shl GR_TEMP4 = GR_D,GR_LENGTH1
  865. }
  866. { .mii
  867. nop.m 999
  868. shr.u GR_P_3 = GR_D,GR_LENGTH2 ;;
  869. shr.u GR_P_4 = GR_E,GR_LENGTH2
  870. }
  871. { .mii
  872. (p7) and GR_P_0 = 0x03,GR_ASUB
  873. (p6) and GR_P_0 = 0x03,GR_P_0 ;;
  874. or GR_P_1 = GR_P_1,GR_TEMP1
  875. }
  876. { .mmi
  877. (p8) and GR_P_0 = 0x1,GR_P_0 ;;
  878. or GR_P_2 = GR_P_2,GR_TEMP2
  879. (p8) shl GR_P_0 = GR_P_0,0x1 ;;
  880. }
  881. { .mii
  882. nop.m 999
  883. or GR_P_3 = GR_P_3,GR_TEMP3
  884. (p8) or GR_P_0 = GR_P_0,GR_Temp
  885. }
  886. { .mmi
  887. setf.sig FR_p_1 = GR_P_1 ;;
  888. setf.sig FR_p_2 = GR_P_2
  889. or GR_P_4 = GR_P_4,GR_TEMP4 ;;
  890. }
  891. { .mmi
  892. nop.m 999 ;;
  893. setf.sig FR_p_3 = GR_P_3
  894. pmpy2.r GR_M = GR_P_0,GR_x_lo
  895. }
  896. { .mlx
  897. setf.sig FR_p_4 = GR_P_4
  898. // P_1, P_2, P_3, P_4 are integers. They should be
  899. // 2^(L-63) * P_1;
  900. // 2^(L-63-64) * P_2;
  901. // 2^(L-63-128) * P_3;
  902. // 2^(L-63-192) * P_4;
  903. // Since each of them need to be multiplied to x, we would scale
  904. // both x and the P_j's by some convenient factors: scale each
  905. // of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
  906. // p_1 := fcvt.xf ( P_1 )
  907. // p_2 := fcvt.xf ( P_2 ) * 2^(-64)
  908. // p_3 := fcvt.xf ( P_3 ) * 2^(-128)
  909. // p_4 := fcvt.xf ( P_4 ) * 2^(-192)
  910. // x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
  911. // --------- --------- ---------
  912. // | P_1 | | P_2 | | P_3 |
  913. // --------- --------- ---------
  914. // ---------
  915. // X | X |
  916. // ---------
  917. // ----------------------------------------------------
  918. // --------- ---------
  919. // | A_hi | | A_lo |
  920. // --------- ---------
  921. // --------- ---------
  922. // | B_hi | | B_lo |
  923. // --------- ---------
  924. // --------- ---------
  925. // | C_hi | | C_lo |
  926. // --------- ---------
  927. // ====================================================
  928. // ----------- --------- --------- ---------
  929. // | S_0 | | S_1 | | S_2 | | S_3 |
  930. // ----------- --------- --------- ---------
  931. // | |___ binary point
  932. // |___ possibly one more bit
  933. //
  934. // Let FPSR3 be set to round towards zero with widest precision
  935. // and exponent range. Unless an explicit FPSR is given,
  936. // round-to-nearest with widest precision and exponent range is
  937. // used.
  938. movl GR_TEMP1 = 0x000000000000FFBF
  939. }
  940. { .mmi
  941. nop.m 999 ;;
  942. setf.exp FR_ScaleP2 = GR_TEMP1
  943. nop.i 999
  944. }
  945. { .mlx
  946. nop.m 999
  947. movl GR_TEMP4 = 0x000000000001003E
  948. }
  949. { .mmi
  950. nop.m 999 ;;
  951. setf.exp FR_sigma_C = GR_TEMP4
  952. nop.i 999
  953. }
  954. { .mlx
  955. nop.m 999
  956. movl GR_TEMP2 = 0x000000000000FF7F ;;
  957. }
  958. { .mmf
  959. nop.m 999
  960. setf.exp FR_ScaleP3 = GR_TEMP2
  961. fcvt.xuf.s1 FR_p_1 = FR_p_1 ;;
  962. }
  963. { .mfi
  964. nop.m 999
  965. fcvt.xuf.s1 FR_p_2 = FR_p_2
  966. nop.i 999
  967. }
  968. { .mlx
  969. nop.m 999
  970. movl GR_Temp = 0x000000000000FFDE ;;
  971. }
  972. { .mmf
  973. nop.m 999
  974. setf.exp FR_TWOM33 = GR_Temp
  975. fcvt.xuf.s1 FR_p_3 = FR_p_3 ;;
  976. }
  977. { .mfi
  978. nop.m 999
  979. fcvt.xuf.s1 FR_p_4 = FR_p_4
  980. nop.i 999 ;;
  981. }
  982. { .mfi
  983. nop.m 999
  984. // Tmp_C := fmpy.fpsr3( x, p_1 );
  985. // Tmp_B := fmpy.fpsr3( x, p_2 );
  986. // Tmp_A := fmpy.fpsr3( x, p_3 );
  987. // If Tmp_C >= sigma_C then
  988. // C_hi := Tmp_C;
  989. // C_lo := x*p_1 - C_hi ...fma, exact
  990. // Else
  991. // C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
  992. // C_lo := x*p_1 - C_hi ...fma, exact
  993. // End If
  994. // If Tmp_B >= sigma_B then
  995. // B_hi := Tmp_B;
  996. // B_lo := x*p_2 - B_hi ...fma, exact
  997. // Else
  998. // B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
  999. // B_lo := x*p_2 - B_hi ...fma, exact
  1000. // End If
  1001. // If Tmp_A >= sigma_A then
  1002. // A_hi := Tmp_A;
  1003. // A_lo := x*p_3 - A_hi ...fma, exact
  1004. // Else
  1005. // A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
  1006. // Exact, regardless ...of rounding direction
  1007. // A_lo := x*p_3 - A_hi ...fma, exact
  1008. // Endif
  1009. fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
  1010. nop.i 999 ;;
  1011. }
  1012. { .mfi
  1013. nop.m 999
  1014. fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
  1015. nop.i 999
  1016. }
  1017. { .mlx
  1018. nop.m 999
  1019. movl GR_Temp = 0x0000000000000400
  1020. }
  1021. { .mlx
  1022. nop.m 999
  1023. movl GR_TEMP3 = 0x000000000000FF3F ;;
  1024. }
  1025. { .mmf
  1026. nop.m 999
  1027. setf.exp FR_ScaleP4 = GR_TEMP3
  1028. fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;;
  1029. }
  1030. { .mlx
  1031. nop.m 999
  1032. movl GR_TEMP4 = 0x0000000000010045 ;;
  1033. }
  1034. { .mmf
  1035. nop.m 999
  1036. setf.exp FR_Tmp2_C = GR_TEMP4
  1037. fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;;
  1038. }
  1039. { .mfi
  1040. nop.m 999
  1041. fcmp.ge.unc.s1 p12, p9 = FR_Tmp_C,FR_sigma_C
  1042. nop.i 999 ;;
  1043. }
  1044. { .mfi
  1045. nop.m 999
  1046. fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
  1047. nop.i 999 ;;
  1048. }
  1049. { .mfi
  1050. nop.m 999
  1051. (p12) mov FR_C_hi = FR_Tmp_C
  1052. nop.i 999 ;;
  1053. }
  1054. { .mfi
  1055. addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp
  1056. (p9) fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C
  1057. nop.i 999
  1058. }
  1059. ;;
  1060. // End If
  1061. // Step 3. Get reduced argument
  1062. // If sgn_x == 0 (that is original x is positive)
  1063. // D_hi := Pi_by_2_hi
  1064. // D_lo := Pi_by_2_lo
  1065. // Load from table
  1066. // Else
  1067. // D_hi := neg_Pi_by_2_hi
  1068. // D_lo := neg_Pi_by_2_lo
  1069. // Load from table
  1070. // End If
  1071. { .mmi
  1072. ld8 GR_BASE = [GR_BASE]
  1073. nop.m 999
  1074. nop.i 999
  1075. }
  1076. ;;
  1077. { .mfi
  1078. ldfe FR_D_hi = [GR_BASE],16
  1079. fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
  1080. nop.i 999 ;;
  1081. }
  1082. { .mfi
  1083. ldfe FR_D_lo = [GR_BASE],0
  1084. fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
  1085. nop.i 999 ;;
  1086. }
  1087. { .mfi
  1088. nop.m 999
  1089. (p13) mov FR_B_hi = FR_Tmp_B
  1090. nop.i 999
  1091. }
  1092. { .mfi
  1093. nop.m 999
  1094. (p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
  1095. nop.i 999 ;;
  1096. }
  1097. { .mfi
  1098. nop.m 999
  1099. (p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B
  1100. nop.i 999
  1101. }
  1102. { .mfi
  1103. nop.m 999
  1104. (p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
  1105. nop.i 999 ;;
  1106. }
  1107. { .mfi
  1108. nop.m 999
  1109. fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
  1110. nop.i 999 ;;
  1111. }
  1112. { .mfi
  1113. nop.m 999
  1114. (p14) mov FR_A_hi = FR_Tmp_A
  1115. nop.i 999 ;;
  1116. }
  1117. { .mfi
  1118. nop.m 999
  1119. (p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A
  1120. nop.i 999 ;;
  1121. }
  1122. { .mfi
  1123. nop.m 999
  1124. (p9) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
  1125. cmp.eq.unc p12,p9 = 0x1,GR_sgn_x
  1126. }
  1127. { .mfi
  1128. nop.m 999
  1129. (p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
  1130. nop.i 999 ;;
  1131. }
  1132. { .mfi
  1133. nop.m 999
  1134. (p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
  1135. nop.i 999
  1136. }
  1137. { .mfi
  1138. nop.m 999
  1139. // Note that C_hi is of integer value. We need only the
  1140. // last few bits. Thus we can ensure C_hi is never a big
  1141. // integer, freeing us from overflow worry.
  1142. // Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
  1143. // Tmp_C is the upper portion of C_hi
  1144. fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
  1145. nop.i 999 ;;
  1146. }
  1147. { .mfi
  1148. nop.m 999
  1149. (p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
  1150. nop.i 999
  1151. }
  1152. { .mfi
  1153. nop.m 999
  1154. (p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
  1155. nop.i 999 ;;
  1156. }
  1157. { .mfi
  1158. nop.m 999
  1159. // *******************
  1160. // Step 2. Get N and f
  1161. // *******************
  1162. // We have all the components to obtain
  1163. // S_0, S_1, S_2, S_3 and thus N and f. We start by adding
  1164. // C_lo and B_hi. This sum together with C_hi estimates
  1165. // N and f well.
  1166. // A := fadd.fpsr3( B_hi, C_lo )
  1167. // B := max( B_hi, C_lo )
  1168. // b := min( B_hi, C_lo )
  1169. fadd.s3 FR_A = FR_B_hi,FR_C_lo
  1170. nop.i 999
  1171. }
  1172. { .mfi
  1173. nop.m 999
  1174. (p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
  1175. nop.i 999 ;;
  1176. }
  1177. { .mfi
  1178. nop.m 999
  1179. fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
  1180. nop.i 999 ;;
  1181. }
  1182. { .mfi
  1183. nop.m 999
  1184. fmax.s1 FR_B = FR_B_hi,FR_C_lo
  1185. nop.i 999 ;;
  1186. }
  1187. { .mfi
  1188. nop.m 999
  1189. fmin.s1 FR_b = FR_B_hi,FR_C_lo
  1190. nop.i 999
  1191. }
  1192. { .mfi
  1193. nop.m 999
  1194. (p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
  1195. nop.i 999 ;;
  1196. }
  1197. { .mfi
  1198. nop.m 999
  1199. // N := round_to_nearest_integer_value( A );
  1200. fcvt.fx.s1 FR_N = FR_A
  1201. nop.i 999 ;;
  1202. }
  1203. { .mfi
  1204. nop.m 999
  1205. // C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
  1206. fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
  1207. nop.i 999 ;;
  1208. }
  1209. { .mfi
  1210. nop.m 999
  1211. // a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
  1212. fsub.s1 FR_a = FR_B,FR_A
  1213. nop.i 999 ;;
  1214. }
  1215. { .mfi
  1216. nop.m 999
  1217. // f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
  1218. fnorm.s1 FR_N = FR_N
  1219. nop.i 999
  1220. }
  1221. { .mfi
  1222. nop.m 999
  1223. fadd.s1 FR_a = FR_a,FR_b
  1224. nop.i 999 ;;
  1225. }
  1226. { .mfi
  1227. nop.m 999
  1228. fsub.s1 FR_f = FR_A,FR_N
  1229. nop.i 999
  1230. }
  1231. { .mfi
  1232. nop.m 999
  1233. // N := convert to integer format( C_hi + N );
  1234. // M := P_0 * x_lo;
  1235. // N := N + M;
  1236. fadd.s1 FR_N = FR_N,FR_C_hi
  1237. nop.i 999 ;;
  1238. }
  1239. { .mfi
  1240. nop.m 999
  1241. // f = f + a Exact because a is 0 or 2^(-64);
  1242. // the msb of the sum is <= 1/2 and lsb >= 2^(-64).
  1243. fadd.s1 FR_f = FR_f,FR_a
  1244. nop.i 999
  1245. }
  1246. { .mfi
  1247. nop.m 999
  1248. //
  1249. // Create 2**(-33)
  1250. //
  1251. fcvt.fx.s1 FR_N = FR_N
  1252. nop.i 999 ;;
  1253. }
  1254. { .mfi
  1255. nop.m 999
  1256. fabs FR_f_abs = FR_f
  1257. nop.i 999 ;;
  1258. }
  1259. { .mfi
  1260. getf.sig GR_N = FR_N
  1261. nop.f 999
  1262. nop.i 999 ;;
  1263. }
  1264. { .mii
  1265. nop.m 999
  1266. nop.i 999 ;;
  1267. add GR_N = GR_N,GR_M ;;
  1268. }
  1269. // If sgn_x == 1 (that is original x was negative)
  1270. // N := 2^10 - N
  1271. // this maintains N to be non-negative, but still
  1272. // equivalent to the (negated N) mod 4.
  1273. // End If
  1274. { .mii
  1275. (p12) sub GR_N = GR_Temp,GR_N
  1276. cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;;
  1277. nop.i 999
  1278. }
  1279. { .mfi
  1280. nop.m 999
  1281. fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33
  1282. nop.i 999 ;;
  1283. }
  1284. { .mfi
  1285. nop.m 999
  1286. (p9) fsub.s1 FR_D_hi = f0, FR_D_hi
  1287. nop.i 999 ;;
  1288. }
  1289. { .mfi
  1290. nop.m 999
  1291. (p10) fadd.s3 FR_A = FR_A_hi,FR_B_lo
  1292. nop.i 999
  1293. }
  1294. { .mfi
  1295. nop.m 999
  1296. (p13) fadd.s1 FR_g = FR_A_hi,FR_B_lo
  1297. nop.i 999 ;;
  1298. }
  1299. { .mfi
  1300. nop.m 999
  1301. (p10) fmax.s1 FR_B = FR_A_hi,FR_B_lo
  1302. nop.i 999
  1303. }
  1304. { .mfi
  1305. nop.m 999
  1306. (p9) fsub.s1 FR_D_lo = f0, FR_D_lo
  1307. nop.i 999 ;;
  1308. }
  1309. { .mfi
  1310. nop.m 999
  1311. (p10) fmin.s1 FR_b = FR_A_hi,FR_B_lo
  1312. nop.i 999 ;;
  1313. }
  1314. { .mfi
  1315. nop.m 999
  1316. fsetc.s3 0x7F,0x40
  1317. nop.i 999
  1318. }
  1319. { .mlx
  1320. nop.m 999
  1321. (p10) movl GR_Temp = 0x000000000000FFCD ;;
  1322. }
  1323. { .mmf
  1324. nop.m 999
  1325. (p10) setf.exp FR_TWOM50 = GR_Temp
  1326. (p10) fadd.s1 FR_f_hi = FR_A,FR_f ;;
  1327. }
  1328. { .mfi
  1329. nop.m 999
  1330. // a := (B - A) + b Exact.
  1331. // Note that a is either 0 or 2^(-128).
  1332. // f_hi := A + f;
  1333. // f_lo := (f - f_hi) + A
  1334. // f_lo=f-f_hi is exact because either |f| >= |A|, in which
  1335. // case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
  1336. // means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
  1337. // If f = 2^(-64), f-f_hi involves cancellation and is
  1338. // exact. If f = -2^(-64), then A + f is exact. Hence
  1339. // f-f_hi is -A exactly, giving f_lo = 0.
  1340. // f_lo := f_lo + a;
  1341. (p10) fsub.s1 FR_a = FR_B,FR_A
  1342. nop.i 999
  1343. }
  1344. { .mfi
  1345. nop.m 999
  1346. (p13) fadd.s1 FR_s_hi = FR_f,FR_g
  1347. nop.i 999 ;;
  1348. }
  1349. { .mlx
  1350. nop.m 999
  1351. // If |f| >= 2^(-33)
  1352. // Case 1
  1353. // CASE := 1
  1354. // g := A_hi + B_lo;
  1355. // s_hi := f + g;
  1356. // s_lo := (f - s_hi) + g;
  1357. (p13) movl GR_CASE = 0x1 ;;
  1358. }
  1359. { .mlx
  1360. nop.m 999
  1361. // Else
  1362. // Case 2
  1363. // CASE := 2
  1364. // A := fadd.fpsr3( A_hi, B_lo )
  1365. // B := max( A_hi, B_lo )
  1366. // b := min( A_hi, B_lo )
  1367. (p10) movl GR_CASE = 0x2
  1368. }
  1369. { .mfi
  1370. nop.m 999
  1371. (p10) fsub.s1 FR_f_lo = FR_f,FR_f_hi
  1372. nop.i 999 ;;
  1373. }
  1374. { .mfi
  1375. nop.m 999
  1376. (p10) fadd.s1 FR_a = FR_a,FR_b
  1377. nop.i 999
  1378. }
  1379. { .mfi
  1380. nop.m 999
  1381. (p13) fsub.s1 FR_s_lo = FR_f,FR_s_hi
  1382. nop.i 999 ;;
  1383. }
  1384. { .mfi
  1385. nop.m 999
  1386. (p13) fadd.s1 FR_s_lo = FR_s_lo,FR_g
  1387. nop.i 999 ;;
  1388. }
  1389. { .mfi
  1390. nop.m 999
  1391. (p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
  1392. nop.i 999 ;;
  1393. }
  1394. { .mfi
  1395. nop.m 999
  1396. //
  1397. // Create 2**(-50)
  1398. (p10) fadd.s1 FR_f_lo = FR_f_lo,FR_A
  1399. nop.i 999 ;;
  1400. }
  1401. { .mfi
  1402. nop.m 999
  1403. // If |f| >= 2^(-50) then
  1404. // s_hi := f_hi;
  1405. // s_lo := f_lo;
  1406. // Else
  1407. // f_lo := (f_lo + A_lo) + x*p_4
  1408. // s_hi := f_hi + f_lo
  1409. // s_lo := (f_hi - s_hi) + f_lo
  1410. // End If
  1411. (p14) mov FR_s_hi = FR_f_hi
  1412. nop.i 999 ;;
  1413. }
  1414. { .mfi
  1415. nop.m 999
  1416. (p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
  1417. nop.i 999 ;;
  1418. }
  1419. { .mfi
  1420. nop.m 999
  1421. (p14) mov FR_s_lo = FR_f_lo
  1422. nop.i 999
  1423. }
  1424. { .mfi
  1425. nop.m 999
  1426. (p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
  1427. nop.i 999 ;;
  1428. }
  1429. { .mfi
  1430. nop.m 999
  1431. (p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
  1432. nop.i 999 ;;
  1433. }
  1434. { .mfi
  1435. nop.m 999
  1436. (p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
  1437. nop.i 999 ;;
  1438. }
  1439. { .mfi
  1440. nop.m 999
  1441. // r_hi := s_hi*D_hi
  1442. // r_lo := s_hi*D_hi - r_hi with fma
  1443. // r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
  1444. fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
  1445. nop.i 999
  1446. }
  1447. { .mfi
  1448. nop.m 999
  1449. (p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
  1450. nop.i 999 ;;
  1451. }
  1452. { .mfi
  1453. nop.m 999
  1454. fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
  1455. nop.i 999
  1456. }
  1457. { .mfi
  1458. nop.m 999
  1459. (p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
  1460. nop.i 999 ;;
  1461. }
  1462. { .mmi
  1463. nop.m 999 ;;
  1464. // Return N, r_hi, r_lo
  1465. // We do not return CASE
  1466. stfe [GR_Address_of_Outputs] = FR_r_hi,16
  1467. nop.i 999 ;;
  1468. }
  1469. { .mfi
  1470. nop.m 999
  1471. fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
  1472. nop.i 999 ;;
  1473. }
  1474. { .mfi
  1475. nop.m 999
  1476. fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
  1477. nop.i 999 ;;
  1478. }
  1479. { .mmi
  1480. nop.m 999 ;;
  1481. stfe [GR_Address_of_Outputs] = FR_r_lo,-16
  1482. nop.i 999
  1483. }
  1484. { .mib
  1485. nop.m 999
  1486. nop.i 999
  1487. br.ret.sptk b0 ;;
  1488. }
  1489. .endp __libm_pi_by_2_reduce