Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1027 lines
30 KiB

  1. .file "cosh.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00 Initial version
  29. // 4/04/00 Unwind support added
  30. // 8/15/00 Bundle added after call to __libm_error_support to properly
  31. // set [the previously overwritten] GR_Parameter_RESULT.
  32. // 5/07/01 Reworked to improve speed of all paths
  33. //
  34. // API
  35. //==============================================================
  36. // double = cosh(double)
  37. // input floating point f8
  38. // output floating point f8
  39. //
  40. // Registers used
  41. //==============================================================
  42. // general registers:
  43. // r32 -> r47
  44. // predicate registers used:
  45. // p6 -> p10
  46. // floating-point registers used:
  47. // f9 -> f15; f32 -> f90;
  48. // f8 has input, then output
  49. //
  50. // Overview of operation
  51. //==============================================================
  52. // There are seven paths
  53. // 1. 0 < |x| < 0.25 COSH_BY_POLY
  54. // 2. 0.25 <=|x| < 32 COSH_BY_TBL
  55. // 3. 32 <= |x| < 710.47586 COSH_BY_EXP (merged path with COSH_BY_TBL)
  56. // 4. |x| >= 710.47586 COSH_HUGE
  57. // 5. x=0 Done with early exit
  58. // 6. x=inf,nan Done with early exit
  59. // 7. x=denormal COSH_DENORM
  60. //
  61. // For double we get overflow for x >= 4008 b19e 747d cfc3 ed8b
  62. // >= 710.475860073
  63. //
  64. //
  65. // 1. COSH_BY_POLY 0 < |x| < 0.25
  66. // ===============
  67. // Evaluate cosh(x) by a 12th order polynomial
  68. // Care is taken with the order of multiplication; P2 is not exactly 1/4!,
  69. // P3 is not exactly 1/6!, etc.
  70. // cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12)
  71. //
  72. // 2. COSH_BY_TBL 0.25 <= |x| < 32.0
  73. // =============
  74. // cosh(x) = cosh(B+R)
  75. // = cosh(B)cosh(R) + sinh(B)sinh(R)
  76. //
  77. // ax = |x| = M*log2/64 + R
  78. // B = M*log2/64
  79. // M = 64*N + j
  80. // We will calculate M and get N as (M-j)/64
  81. // The division is a shift.
  82. // exp(B) = exp(N*log2 + j*log2/64)
  83. // = 2^N * 2^(j/64)
  84. // cosh(B) = 1/2(e^B + e^-B)
  85. // = 1/2(2^N * 2^(j/64) + 2^-N * 2^(-j/64))
  86. // cosh(B) = (2^(N-1) * 2^(j/64) + 2^(-N-1) * 2^(-j/64))
  87. // sinh(B) = (2^(N-1) * 2^(j/64) - 2^(-N-1) * 2^(-j/64))
  88. // 2^(j/64) is stored as Tjhi + Tjlo , j= -32,....,32
  89. // Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
  90. //
  91. // R = ax - M*log2/64
  92. // R = ax - M*log2_by_64_hi - M*log2_by_64_lo
  93. // exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
  94. // = 1 + p_odd + p_even
  95. // where p_odd uses the A coefficients and p_even uses
  96. // the B coefficients
  97. //
  98. // So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd
  99. // cosh(R) = 1 + p_even
  100. // cosh(B) = C_hi + C_lo
  101. // sinh(B) = S_hi
  102. // cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R)
  103. //
  104. // 3. COSH_BY_EXP 32.0 <= |x| < 710.47586 ( 4008 b19e 747d cfc3 ed8b )
  105. // ==============
  106. // Can approximate result by exp(x)/2 in this region.
  107. // Y_hi = Tjhi
  108. // Y_lo = Tjhi * (p_odd + p_even) + Tjlo
  109. // cosh(x) = Y_hi + Y_lo
  110. //
  111. // 4. COSH_HUGE |x| >= 710.47586 ( 4008 b19e 747d cfc3 ed8b )
  112. // ============
  113. // Set error tag and call error support
  114. //
  115. //
  116. // Assembly macros
  117. //==============================================================
  // Symbolic register names used by cosh and by the error-support stub.
  //   cosh_GR_* / cosh_AD_*      general registers (r34-r47)
  //   GR_SAVE_* / GR_Parameter_* general registers used around the call to
  //                              __libm_error_support (r41-r47)
  //   cosh_FR_*                  floating-point registers
  // Several names intentionally map to the same physical register because they
  // are used at different points of the routine.  Before adding a new use of a
  // name, check every other alias of the same register in this table.
  118. cosh_GR_ad1 = r34
  119. cosh_GR_Mmj = r34
  120. cosh_GR_jshf = r36
  121. cosh_GR_M = r35
  122. cosh_GR_N = r35
  123. cosh_GR_exp_2tom57 = r36
  124. cosh_GR_j = r36
  125. cosh_GR_joff = r36
  126. cosh_GR_exp_mask = r37
  127. cosh_GR_mJ = r38
  128. cosh_AD_mJ = r38
  129. cosh_GR_signexp_x = r38
  130. cosh_GR_signexp_0_5 = r38
  131. cosh_GR_exp_0_25 = r39
  132. cosh_GR_J = r39
  133. cosh_AD_J = r39
  134. cosh_GR_sig_inv_ln2 = r40
  135. cosh_GR_exp_32 = r40
  136. cosh_GR_exp_huge = r40
  137. cosh_GR_all_ones = r40
  138. cosh_GR_ad2e = r41
  139. cosh_GR_ad3 = r42
  140. cosh_GR_ad4 = r43
  141. cosh_GR_rshf = r44
  142. cosh_GR_ad2o = r45
  143. cosh_GR_rshf_2to57 = r46
  144. cosh_GR_exp_denorm = r46
  145. cosh_GR_exp_x = r47
  // Registers used by the error-support stub; r44-r47 are the four output
  // registers declared by the alloc at the top of cosh.
  146. GR_SAVE_PFS = r41
  147. GR_SAVE_B0 = r42
  148. GR_SAVE_GP = r43
  149. GR_Parameter_X = r44
  150. GR_Parameter_Y = r45
  151. GR_Parameter_RESULT = r46
  152. GR_Parameter_TAG = r47
  153. cosh_FR_ABS_X = f9
  154. cosh_FR_X2 = f10
  155. cosh_FR_X4 = f11
  156. cosh_FR_all_ones = f13
  157. cosh_FR_tmp = f14
  158. cosh_FR_RSHF = f15
  159. cosh_FR_Inv_log2by64 = f32
  160. cosh_FR_log2by64_lo = f33
  161. cosh_FR_log2by64_hi = f34
  162. cosh_FR_A1 = f35
  163. cosh_FR_A2 = f36
  164. cosh_FR_A3 = f37
  165. cosh_FR_Rcub = f38
  166. cosh_FR_M_temp = f39
  167. cosh_FR_R_temp = f40
  168. cosh_FR_Rsq = f41
  169. cosh_FR_R = f42
  170. cosh_FR_M = f43
  171. cosh_FR_B1 = f44
  172. cosh_FR_B2 = f45
  173. cosh_FR_B3 = f46
  174. cosh_FR_peven_temp1 = f47
  175. cosh_FR_peven_temp2 = f48
  176. cosh_FR_peven = f49
  177. cosh_FR_podd_temp1 = f50
  178. cosh_FR_podd_temp2 = f51
  179. cosh_FR_podd = f52
  180. cosh_FR_poly65 = f53
  181. cosh_FR_poly6543 = f53
  182. cosh_FR_poly6to1 = f53
  183. cosh_FR_poly43 = f54
  184. cosh_FR_poly21 = f55
  185. cosh_FR_INV_LN2_2TO63= f57
  186. cosh_FR_RSHF_2TO57 = f58
  187. cosh_FR_2TOM57 = f59
  188. cosh_FR_smlst_oflow_input = f60
  189. cosh_FR_pre_result = f61
  191. cosh_FR_spos = f63
  192. cosh_FR_sneg = f64
  193. cosh_FR_Tjhi = f65
  194. cosh_FR_Tjlo = f66
  195. cosh_FR_Tmjhi = f67
  196. cosh_FR_Tmjlo = f68
  197. cosh_FR_S_hi = f69
  198. cosh_FR_SC_hi_temp = f70
  199. cosh_FR_C_lo_temp1 = f71
  200. cosh_FR_C_lo_temp2 = f72
  201. cosh_FR_C_lo_temp3 = f73
  202. cosh_FR_C_lo_temp4 = f73
  203. cosh_FR_C_lo = f74
  204. cosh_FR_C_hi = f75
  205. cosh_FR_C_hi_temp1 = f76
  206. cosh_FR_Y_hi = f77
  207. cosh_FR_Y_lo_temp = f78
  208. cosh_FR_Y_lo = f79
  209. cosh_FR_NORM_X = f80
  210. cosh_FR_P1 = f81
  211. cosh_FR_P2 = f82
  212. cosh_FR_P3 = f83
  213. cosh_FR_P4 = f84
  214. cosh_FR_P5 = f85
  215. cosh_FR_P6 = f86
  216. cosh_FR_Tjhi_spos = f87
  217. cosh_FR_Tjlo_spos = f88
  218. cosh_FR_huge = f89 // Large constant used by COSH_HUGE to force overflow
  219. cosh_FR_signed_hi_lo = f90
  220. // Data tables
  221. //==============================================================
  222. // DO NOT CHANGE ORDER OF THESE TABLES
  // The code addresses every table as a fixed byte offset from
  // double_cosh_arg_reduction, so order and size are part of the contract:
  //   +0x000 double_cosh_arg_reduction  2 entries x 16 bytes
  //   +0x020 double_cosh_p_table        7 entries x 16 bytes
  //   +0x090 double_cosh_ab_table       6 entries x 16 bytes
  //   +0x0f0 double_cosh_j_table       65 entries x 32 bytes (midpoint +0x4f0)
  // Each 16-byte entry is a significand (first data8) followed by a
  // sign/exponent word (second data8), read with ldfe.
  223. .data
  224. .align 16
  225. double_cosh_arg_reduction:
  226. // data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf
  227. data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part
  228. data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part
  // Overflow threshold followed by the COSH_BY_POLY coefficients.  The even
  // coefficients (P6,P4,P2) and the odd ones (P5,P3,P1) are stored as two runs
  // so that two pointers can walk them in parallel.
  229. double_cosh_p_table:
  230. data8 0xb19e747dcfc3ed8b, 0x00004008 // Smallest x to overflow (710.47586)
  231. data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6
  232. data8 0xD00D00D1021D7370, 0x00003FEF // P4
  233. data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2
  234. data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5
  235. data8 0xB60B60B60B4FE884, 0x00003FF5 // P3
  236. data8 0x8000000000000000, 0x00003FFE // P1
  // A1-A3 are the p_odd coefficients and B1-B3 the p_even coefficients used by
  // COSH_BY_TBL / COSH_BY_EXP.
  237. double_cosh_ab_table:
  238. data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1
  239. data8 0x88888888884ECDD5, 0x00003FF8 // A2
  240. data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3
  241. data8 0x8000000000000002, 0x00003FFE // B1
  242. data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2
  243. data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3
  // 2^(j/64) for j = -32..32, one 32-byte entry per j:
  //   bytes 0-15  Tjhi (significand, sign/exponent), read with ldfe
  //   bytes 16-19 Tjlo, single-precision correction, read with ldfs
  //   bytes 20-31 zero padding
  // The code indexes from the j = 0 entry using a byte offset of j*32.
  244. double_cosh_j_table:
  245. data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 // j = -32
  246. data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
  247. data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
  248. data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
  249. data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
  250. data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
  251. data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
  252. data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
  253. data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
  254. data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
  255. data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
  256. data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
  257. data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
  258. data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
  259. data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
  260. data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
  261. data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
  262. data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
  263. data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
  264. data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
  265. data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
  266. data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
  267. data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
  268. data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
  269. data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
  270. data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
  271. data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
  272. data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
  273. data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
  274. data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
  275. data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
  276. data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
  277. data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 // j = 0 (table midpoint)
  278. data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
  279. data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
  280. data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
  281. data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
  282. data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
  283. data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
  284. data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
  285. data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
  286. data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
  287. data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
  288. data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
  289. data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
  290. data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
  291. data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
  292. data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
  293. data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
  294. data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
  295. data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
  296. data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
  297. data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
  298. data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
  299. data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
  300. data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
  301. data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
  302. data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
  303. data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
  304. data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
  305. data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
  306. data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
  307. data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
  308. data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
  309. data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 // j = +32
  310. .align 32
  311. .global cosh#
  312. .section .text
  313. .proc cosh#
  314. .align 32
  // double cosh(double x): x arrives in f8 and the result is returned in f8.
  // Entry code: builds the constants shared by all paths, classifies x
  // (denormal/unnormal, zero, NaN) and forms the table addresses.
  // Predicates set here and consumed later: p7 = x is zero, p6 = x is NaN,
  // p10 = x is denormal/unnormal.
  315. cosh:
  316. { .mlx
  317. alloc r32 = ar.pfs,0,12,4,0 // 0 inputs, 12 locals, 4 outputs: r32-r47
  318. movl cosh_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
  319. }
  320. { .mlx
  321. addl cosh_GR_ad1 = @ltoff(double_cosh_arg_reduction), gp // Linkage-table slot for the tables
  322. movl cosh_GR_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
  323. }
  324. ;;
  325. { .mfi
  326. ld8 cosh_GR_ad1 = [cosh_GR_ad1] // ad1 = address of double_cosh_arg_reduction
  327. fmerge.s cosh_FR_ABS_X = f0,f8 // |x|: sign taken from f0, the rest from x
  328. mov cosh_GR_exp_0_25 = 0x0fffd // Form exponent for 0.25
  329. }
  330. { .mfi
  331. nop.m 999
  332. fnorm.s1 cosh_FR_NORM_X = f8 // Normalized copy of x, consumed by COSH_DENORM
  333. mov cosh_GR_exp_2tom57 = 0xffff-57 // Biased exponent of 2^-57
  334. }
  335. ;;
  336. { .mfi
  337. setf.d cosh_FR_RSHF_2TO57 = cosh_GR_rshf_2to57 // Form const 1.100 * 2^120
  338. fclass.m p10,p0 = f8, 0x0b // Test for denorm
  339. mov cosh_GR_exp_mask = 0x1ffff // Mask that keeps the 17-bit exponent, drops the sign
  340. }
  341. { .mlx
  342. setf.sig cosh_FR_INV_LN2_2TO63 = cosh_GR_sig_inv_ln2 // Form 1/ln2 * 2^63
  343. movl cosh_GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
  344. }
  345. ;;
  346. { .mfi
  347. getf.exp cosh_GR_signexp_x = f8 // Extract signexp of x
  348. fclass.m p7,p0 = f8, 0x07 // Test if x=0
  349. nop.i 999
  350. }
  351. { .mfi
  352. setf.exp cosh_FR_2TOM57 = cosh_GR_exp_2tom57 // Form 2^-57 for scaling
  353. nop.f 999
  354. add cosh_GR_ad3 = 0x90, cosh_GR_ad1 // Point to ab_table
  355. }
  356. ;;
  357. { .mfi
  358. setf.d cosh_FR_RSHF = cosh_GR_rshf // Form right shift const 1.100 * 2^63
  359. fclass.m p6,p0 = f8, 0xc3 // Test if x nan
  360. add cosh_GR_ad4 = 0x4f0, cosh_GR_ad1 // Point to j_table midpoint
  361. }
  362. { .mib
  363. add cosh_GR_ad2e = 0x20, cosh_GR_ad1 // Point to p_table
  364. mov cosh_GR_all_ones = -1 // All-ones pattern, later squared to raise inexact
  365. (p10) br.cond.spnt COSH_DENORM // Branch if x denorm
  366. }
  367. ;;
  368. // Common path -- return here from COSH_DENORM if x is unnorm
  // Handles the early exits (x = 0 returns 1.0, x = NaN returns a quieted NaN),
  // replaces an infinite x by +inf, loads the first table constants and then
  // chooses between COSH_BY_POLY (|x| < 0.25, fall through) and COSH_BY_TBL.
  369. COSH_COMMON:
  370. { .mfi
  371. ldfe cosh_FR_smlst_oflow_input = [cosh_GR_ad2e],16 // Overflow threshold; ad2e now points at P6
  372. fclass.m p10,p0 = f8, 0x23 // Test if x inf
  373. and cosh_GR_exp_x = cosh_GR_exp_mask, cosh_GR_signexp_x // Biased exponent of x, sign removed
  374. }
  375. { .mfb
  376. ldfe cosh_FR_log2by64_hi = [cosh_GR_ad1],16
  377. (p7) fma.d.s0 f8 = f1,f1,f0 // If x=0, result is 1.0
  378. (p7) br.ret.spnt b0 // Exit if x=0
  379. }
  380. ;;
  381. { .mfi
  382. // Make constant that will generate inexact when squared
  383. setf.sig cosh_FR_all_ones = cosh_GR_all_ones
  384. nop.f 999
  385. cmp.ge p7,p0 = cosh_GR_exp_x, cosh_GR_exp_0_25 // p7 = (|x| >= 0.25), i.e. not the poly path
  386. }
  387. { .mfb
  388. add cosh_GR_ad2o = 0x30, cosh_GR_ad2e // Point to p_table odd coeffs
  389. (p6) fma.d.s0 f8 = f8,f1,f8 // If x nan, return quietized nan
  390. (p6) br.ret.spnt b0 // Exit for x nan
  391. }
  392. ;;
  393. // Get the A coefficients for COSH_BY_TBL
  394. // Calculate X2 = ax*ax for COSH_BY_POLY
  395. { .mfi
  396. ldfe cosh_FR_log2by64_lo = [cosh_GR_ad1],16
  397. (p10) fmerge.s f8 = f0, f8 // If x inf, result is +inf
  398. nop.i 999
  399. }
  400. { .mfb
  401. ldfe cosh_FR_A1 = [cosh_GR_ad3],16
  402. fma.s1 cosh_FR_X2 = cosh_FR_ABS_X, cosh_FR_ABS_X, f0
  403. (p7) br.cond.sptk COSH_BY_TBL // |x| >= 0.25 (infinity included) takes the table path
  404. }
  405. ;;
  406. // Here if 0 < |x| < 0.25
  // Evaluates cosh(x) = 1 + X2*(P1 + P2*X2 + P3*X2^2 + ... + P6*X2^5) with
  // X2 = x*x (computed before the branch).  The polynomial is split into the
  // pairs poly65, poly43 and poly21, which are then combined with X4 = X2*X2.
  // ad2e walks P6,P4,P2 while ad2o walks P5,P3,P1.
  407. COSH_BY_POLY:
  408. { .mmf
  409. ldfe cosh_FR_P6 = [cosh_GR_ad2e],16
  410. ldfe cosh_FR_P5 = [cosh_GR_ad2o],16
  411. nop.f 999
  412. }
  413. ;;
  414. { .mmi
  415. ldfe cosh_FR_P4 = [cosh_GR_ad2e],16
  416. ldfe cosh_FR_P3 = [cosh_GR_ad2o],16
  417. nop.i 999
  418. }
  419. ;;
  420. { .mmi
  421. ldfe cosh_FR_P2 = [cosh_GR_ad2e],16
  422. ldfe cosh_FR_P1 = [cosh_GR_ad2o],16
  423. nop.i 999
  424. }
  425. ;;
  426. { .mfi
  427. nop.m 999
  428. fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 // X4 = x^4
  429. nop.i 999
  430. }
  431. ;;
  432. { .mfi
  433. nop.m 999
  434. fma.s1 cosh_FR_poly65 = cosh_FR_X2, cosh_FR_P6, cosh_FR_P5 // P5 + P6*X2
  435. nop.i 999
  436. }
  437. { .mfi
  438. nop.m 999
  439. fma.s1 cosh_FR_poly43 = cosh_FR_X2, cosh_FR_P4, cosh_FR_P3 // P3 + P4*X2
  440. nop.i 999
  441. }
  442. ;;
  443. { .mfi
  444. nop.m 999
  445. fma.s1 cosh_FR_poly21 = cosh_FR_X2, cosh_FR_P2, cosh_FR_P1 // P1 + P2*X2
  446. nop.i 999
  447. }
  448. ;;
  449. { .mfi
  450. nop.m 999
  451. fma.s1 cosh_FR_poly6543 = cosh_FR_X4, cosh_FR_poly65, cosh_FR_poly43 // poly43 + X4*poly65
  452. nop.i 999
  453. }
  454. ;;
  455. { .mfi
  456. nop.m 999
  457. fma.s1 cosh_FR_poly6to1 = cosh_FR_X4, cosh_FR_poly6543, cosh_FR_poly21 // poly21 + X4*poly6543
  458. nop.i 999
  459. }
  460. ;;
  461. // Dummy multiply to generate inexact
  462. { .mfi
  463. nop.m 999
  464. fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
  465. nop.i 999
  466. }
  467. { .mfb
  468. nop.m 999
  469. fma.d.s0 f8 = cosh_FR_poly6to1, cosh_FR_X2, f1 // Result = 1 + X2*poly6to1, rounded to double
  470. br.ret.sptk b0 // Exit COSH_BY_POLY
  471. }
  472. ;;
  473. // Here if |x| >= 0.25
  // Merged path for COSH_BY_TBL (0.25 <= |x| < 32, predicate p6) and
  // COSH_BY_EXP (32 <= |x| < overflow limit, predicate p7).  x = inf returns
  // immediately (f8 was already set to +inf) and |x| >= the overflow limit
  // branches to COSH_HUGE.  Note that p6 is first used for the overflow test
  // and is redefined afterwards as the TBL predicate.
  474. COSH_BY_TBL:
  475. // ******************************************************
  476. // STEP 1 (TBL and EXP) - Argument reduction
  477. // ******************************************************
  478. // Get the following constants.
  479. // Inv_log2by64
  480. // log2by64_hi
  481. // log2by64_lo
  482. // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
  483. // put them in an exponent.
  484. // cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1)
  485. // 0xffff + (N-1) = 0xffff +N -1
  486. // 0xffff - (N +1) = 0xffff -N -1
  487. // Calculate M and keep it as integer and floating point.
  488. // M = round-to-integer(x*Inv_log2by64)
  489. // cosh_FR_M = M = round-to-integer(ax/(log2/64))
  490. // Put the integer representation of M in cosh_GR_M
  491. // and the floating point representation of M in cosh_FR_M
  492. // Get the remaining A,B coefficients
  493. { .mfb
  494. ldfe cosh_FR_A2 = [cosh_GR_ad3],16
  495. nop.f 999
  496. (p10) br.ret.spnt b0 // Exit if x inf
  497. }
  498. ;;
  499. { .mmi
  500. ldfe cosh_FR_A3 = [cosh_GR_ad3],16 ;;
  501. ldfe cosh_FR_B1 = [cosh_GR_ad3],16
  502. nop.i 999
  503. }
  504. ;;
  505. // Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand
  506. // |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
  507. { .mfi
  508. nop.m 999
  509. fma.s1 cosh_FR_M_temp = cosh_FR_ABS_X, cosh_FR_INV_LN2_2TO63, cosh_FR_RSHF_2TO57
  510. mov cosh_GR_signexp_0_5 = 0x0fffe // signexp of +0.5
  511. }
  512. ;;
  513. // Test for |x| >= overflow limit
  514. { .mfi
  515. nop.m 999
  516. fcmp.ge.s1 p6,p0 = cosh_FR_ABS_X, cosh_FR_smlst_oflow_input
  517. nop.i 999
  518. }
  519. ;;
  520. { .mfi
  521. ldfe cosh_FR_B2 = [cosh_GR_ad3],16
  522. nop.f 999
  523. nop.i 999
  524. }
  525. ;;
  526. // Subtract RSHF constant to get rounded M as a floating point value
  527. // M = M_temp * 2^-(63-6) - 1.1000 * 2^63
  528. { .mfb
  529. ldfe cosh_FR_B3 = [cosh_GR_ad3],16
  530. fms.s1 cosh_FR_M = cosh_FR_M_temp, cosh_FR_2TOM57, cosh_FR_RSHF
  531. (p6) br.cond.spnt COSH_HUGE // Branch if result will overflow
  532. }
  533. ;;
  534. { .mfi
  535. getf.sig cosh_GR_M = cosh_FR_M_temp // Low bits of the significand hold the integer M
  536. nop.f 999
  537. nop.i 999
  538. }
  539. ;;
  540. // Calculate j. j is the signed extension of the six lsb of M. It
  541. // has a range of -32 thru 31.
  542. // Calculate R
  543. // ax - M*log2by64_hi
  544. // R = (ax - M*log2by64_hi) - M*log2by64_lo
  545. { .mfi
  546. nop.m 999
  547. fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_ABS_X
  548. and cosh_GR_j = 0x3f, cosh_GR_M // Keep the six lsb of M
  549. }
  550. ;;
  551. { .mii
  552. nop.m 999
  553. shl cosh_GR_jshf = cosh_GR_j, 0x2 ;; // Shift j so can sign extend it
  554. sxt1 cosh_GR_jshf = cosh_GR_jshf // Sign-extend from bit 7 (bit 5 of j before the shift)
  555. }
  556. ;;
  557. // N = (M-j)/64
  558. { .mii
  559. mov cosh_GR_exp_32 = 0x10004 // Biased exponent of 32.0
  560. shr cosh_GR_j = cosh_GR_jshf, 0x2 ;; // Now j has range -32 to 31
  561. sub cosh_GR_Mmj = cosh_GR_M, cosh_GR_j ;; // M-j
  562. }
  563. ;;
  564. // The TBL and EXP branches are merged and predicated
  565. // If TBL, p6 true, 0.25 <= |x| < 32
  566. // If EXP, p7 true, 32 <= |x| < overflow_limit
  567. //
  568. { .mfi
  569. cmp.ge p7,p6 = cosh_GR_exp_x, cosh_GR_exp_32 // Test if x >= 32
  570. fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp
  571. shr cosh_GR_N = cosh_GR_Mmj, 0x6 // N = (M-j)/64
  572. }
  573. ;;
  // r39 and r40 are used by number here as scratch for the two exponents.
  // Their named aliases (cosh_GR_exp_0_25 and cosh_GR_exp_32) are read for
  // the last time before this point, and r39 is rewritten as cosh_GR_J in
  // the group that follows.
  574. { .mmi
  575. sub r40 = cosh_GR_signexp_0_5, cosh_GR_N // signexp of 2^(-N-1)
  576. add r39 = cosh_GR_signexp_0_5, cosh_GR_N // signexp of 2^(N-1)
  577. shl cosh_GR_joff = cosh_GR_j,5 // Make j offset to j_table
  578. }
  579. ;;
  580. { .mfi
  581. setf.exp cosh_FR_spos = r39 // Form 2^(N-1)
  582. nop.f 999
  583. sub cosh_GR_mJ = r0, cosh_GR_joff // Table offset for -j
  584. }
  585. { .mfi
  586. setf.exp cosh_FR_sneg = r40 // Form 2^(-N-1)
  587. nop.f 999
  588. add cosh_GR_J = r0, cosh_GR_joff // Table offset for +j
  589. }
  590. ;;
  591. // Get the address of the J table midpoint, add the offset
  592. { .mmf
  593. add cosh_AD_mJ = cosh_GR_ad4, cosh_GR_mJ
  594. add cosh_AD_J = cosh_GR_ad4, cosh_GR_J
  595. nop.f 999
  596. }
  597. ;;
  598. { .mmf
  599. ldfe cosh_FR_Tmjhi = [cosh_AD_mJ],16 // High part of table entry for -j
  600. ldfe cosh_FR_Tjhi = [cosh_AD_J],16 // High part of table entry for +j
  601. nop.f 999
  602. }
  603. ;;
  604. // ******************************************************
  605. // STEP 2 (TBL and EXP)
  606. // ******************************************************
  607. // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
  608. { .mmf
  609. ldfs cosh_FR_Tmjlo = [cosh_AD_mJ],16 // Single-precision low part for -j
  610. ldfs cosh_FR_Tjlo = [cosh_AD_J],16 // Single-precision low part for +j
  611. fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0
  612. }
  613. ;;
  614. // Calculate p_even
  615. // B_2 + Rsq *B_3
  616. // B_1 + Rsq * (B_2 + Rsq *B_3)
  617. // p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
  618. { .mfi
  619. nop.m 999
  620. fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2
  621. nop.i 999
  622. }
  623. // Calculate p_odd
  624. // A_2 + Rsq *A_3
  625. // A_1 + Rsq * (A_2 + Rsq *A_3)
  626. // podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
  627. { .mfi
  628. nop.m 999
  629. fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2
  630. nop.i 999
  631. }
  632. ;;
  633. { .mfi
  634. nop.m 999
  635. fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0
  636. nop.i 999
  637. }
  638. ;;
  639. //
  640. // If TBL,
  641. // Calculate C_hi and C_lo, and S_hi
  642. // SC_hi_temp = sneg * Tmjhi
  643. // S_hi = spos * Tjhi - SC_hi_temp
  644. // S_hi = spos * Tjhi - (sneg * Tmjhi)
  645. // C_hi = spos * Tjhi + SC_hi_temp
  646. // C_hi = spos * Tjhi + (sneg * Tmjhi)
  647. { .mfi
  648. nop.m 999
  649. (p6) fma.s1 cosh_FR_SC_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0
  650. nop.i 999
  651. }
  652. ;;
  653. // If TBL,
  654. // C_lo_temp3 = sneg * Tmjlo
  655. // C_lo_temp4 = spos * Tjlo + C_lo_temp3
  656. // C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo)
  657. { .mfi
  658. nop.m 999
  659. (p6) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_sneg, cosh_FR_Tmjlo, f0
  660. nop.i 999
  661. }
  662. ;;
  663. { .mfi
  664. nop.m 999
  665. fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1
  666. nop.i 999
  667. }
  668. { .mfi
  669. nop.m 999
  670. fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1
  671. nop.i 999
  672. }
  673. ;;
  674. // If EXP,
  675. // Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo
  676. { .mfi
  677. nop.m 999
  678. (p7) fma.s1 cosh_FR_Tjhi_spos = cosh_FR_Tjhi, cosh_FR_spos, f0
  679. nop.i 999
  680. }
  681. { .mfi
  682. nop.m 999
  683. (p7) fma.s1 cosh_FR_Tjlo_spos = cosh_FR_Tjlo, cosh_FR_spos, f0
  684. nop.i 999
  685. }
  686. ;;
  687. { .mfi
  688. nop.m 999
  689. (p6) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_SC_hi_temp
  690. nop.i 999
  691. }
  692. ;;
  693. { .mfi
  694. nop.m 999
  695. (p6) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_SC_hi_temp
  696. nop.i 999
  697. }
  698. { .mfi
  699. nop.m 999
  700. (p6) fma.s1 cosh_FR_C_lo_temp4 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp3
  701. nop.i 999
  702. }
  703. ;;
  704. { .mfi
  705. nop.m 999
  706. fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0
  707. nop.i 999
  708. }
  709. { .mfi
  710. nop.m 999
  711. fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R
  712. nop.i 999
  713. }
  714. ;;
  715. // If TBL,
  716. // C_lo_temp1 = spos * Tjhi - C_hi
  717. // C_lo_temp2 = sneg * Tmjhi + C_lo_temp1
  718. // C_lo_temp2 = sneg * Tmjhi + (spos * Tjhi - C_hi)
  // i.e. the part of spos*Tjhi + sneg*Tmjhi that was rounded away in C_hi
  719. { .mfi
  720. nop.m 999
  721. (p6) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi
  722. nop.i 999
  723. }
  724. ;;
  725. { .mfi
  726. nop.m 999
  727. (p6) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1
  728. nop.i 999
  729. }
  730. ;;
  731. // If EXP,
  732. // Y_hi = 2^(N-1) * Tjhi
  733. // Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo
  734. { .mfi
  735. nop.m 999
  736. (p7) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd
  737. nop.i 999
  738. }
  739. ;;
  740. // If TBL,
  741. // C_lo = C_lo_temp4 + C_lo_temp2
  742. { .mfi
  743. nop.m 999
  744. (p6) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp4, f1, cosh_FR_C_lo_temp2
  745. nop.i 999
  746. }
  747. ;;
  748. // If TBL,
  749. // Y_hi = C_hi
  750. // Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo)
  751. { .mfi
  752. nop.m 999
  753. (p6) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo
  754. nop.i 999
  755. }
  756. ;;
  757. { .mfi
  758. nop.m 999
  759. (p7) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi_spos, cosh_FR_Y_lo_temp, cosh_FR_Tjlo_spos
  760. nop.i 999
  761. }
  762. ;;
  763. // Dummy multiply to generate inexact
  764. { .mfi
  765. nop.m 999
  766. fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
  767. nop.i 999
  768. }
  769. { .mfi
  770. nop.m 999
  771. (p6) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp
  772. nop.i 999
  773. }
  774. ;;
  775. // f8 = answer = Y_hi + Y_lo  (EXP: Y_hi = 2^(N-1) * Tjhi)
  776. { .mfi
  777. nop.m 999
  778. (p7) fma.d.s0 f8 = cosh_FR_Y_lo, f1, cosh_FR_Tjhi_spos
  779. nop.i 999
  780. }
  781. ;;
  782. // f8 = answer = Y_hi + Y_lo  (TBL: Y_hi = C_hi)
  783. { .mfb
  784. nop.m 999
  785. (p6) fma.d.s0 f8 = cosh_FR_Y_lo, f1, cosh_FR_C_hi
  786. br.ret.sptk b0 // Exit for COSH_BY_TBL and COSH_BY_EXP
  787. }
  788. ;;
  789. // Here if x denorm or unorm
  // Reached from the entry code when fclass flags x as denormal/unnormal.
  // A true denormal returns 1 + x^2 straight away.  An unnormal goes back to
  // COSH_COMMON with cosh_FR_ABS_X and cosh_GR_signexp_x recomputed from the
  // normalized copy of x.
  790. COSH_DENORM:
  791. // Determine if x really a denorm and not a unorm
  792. { .mmf
  793. getf.exp cosh_GR_signexp_x = cosh_FR_NORM_X
  794. mov cosh_GR_exp_denorm = 0x0fc01 // Real denorms will have exp < this
  795. fmerge.s cosh_FR_ABS_X = f0, cosh_FR_NORM_X // |x| taken from the normalized value
  796. }
  797. ;;
  798. { .mfi
  799. nop.m 999
  800. fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag
  801. nop.i 999
  802. }
  803. ;;
  804. // Set p8 if really a denorm
  805. { .mmi
  806. and cosh_GR_exp_x = cosh_GR_exp_mask, cosh_GR_signexp_x ;;
  807. cmp.lt p8,p9 = cosh_GR_exp_x, cosh_GR_exp_denorm // p8 = denorm, p9 = unorm
  808. nop.i 999
  809. }
  810. ;;
  811. { .mfb
  812. nop.m 999
  813. (p8) fma.d.s0 f8 = f8,f8,f1 // If x denorm, result=1+x^2
  814. (p9) br.cond.sptk COSH_COMMON // Return to main path if x unorm
  815. }
  816. ;;
  817. { .mfb
  818. nop.m 999
  819. nop.f 999
  820. br.ret.sptk b0 // Exit if x denorm
  821. }
  822. ;;
  823. // Here if |x| >= overflow limit
  // Builds an overflowing double result in cosh_FR_pre_result, sets the error
  // tag, and then falls through past .endp into __libm_error_region, which
  // calls __libm_error_support.  There is no return instruction in this block.
  824. COSH_HUGE:
  825. // for COSH_HUGE, put 24000 in exponent
  826. { .mmi
  827. mov cosh_GR_exp_huge = 0x15dbf ;; // 0xffff + 24000
  828. setf.exp cosh_FR_huge = cosh_GR_exp_huge
  829. nop.i 999
  830. }
  831. ;;
  832. { .mfi
  833. nop.m 999
  834. fma.s1 cosh_FR_signed_hi_lo = cosh_FR_huge, f1, f1 // huge + 1
  835. nop.i 999
  836. }
  837. ;;
  838. { .mfi
  839. nop.m 999
  840. fma.d.s0 cosh_FR_pre_result = cosh_FR_signed_hi_lo, cosh_FR_huge, f0 // (huge + 1) * huge, rounded to double
  841. mov GR_Parameter_TAG = 64 // Error tag for __libm_error_support; NOTE(review): meaning of 64 is defined by that routine -- confirm
  842. }
  843. ;;
  844. .endp cosh
  845. // Stack operations when calling error support.
  846. // (1) (2) (3) (call) (4)
  847. // sp -> + psp -> + psp -> + sp -> +
  848. // | | | |
  849. // | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
  850. // | | | |
  851. // | <-GR_Y Y2->| Y2 ->| <- GR_Y |
  852. // | | | |
  853. // | | <- GR_X X1 ->| |
  854. // | | | |
  855. // sp-64 -> + sp -> + sp -> + +
  856. // save ar.pfs save b0 restore gp
  857. // save gp restore ar.pfs
  // Entered by fall-through from COSH_HUGE.  Creates a 64-byte frame, stores
  // the argument at sp+16, a zero second parameter at sp+32 and the provisional
  // result at sp+48, passes their addresses in GR_Parameter_X/Y/RESULT (with
  // the tag in GR_Parameter_TAG) to __libm_error_support, then reloads f8 from
  // sp+48 and returns to cosh's caller.
  858. .proc __libm_error_region
  859. __libm_error_region:
  860. COSH_ERROR_SUPPORT:
  861. .prologue
  862. // (1)
  863. { .mfi
  864. add GR_Parameter_Y=-32,sp // Parameter 2 value
  865. nop.f 0
  866. .save ar.pfs,GR_SAVE_PFS
  867. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  868. }
  869. { .mfi
  870. .fframe 64
  871. add sp=-64,sp // Create new stack
  872. nop.f 0
  873. mov GR_SAVE_GP=gp // Save gp
  874. };;
  875. // (2)
  876. { .mmi
  877. stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
  878. add GR_Parameter_X = 16,sp // Parameter 1 address
  879. .save b0, GR_SAVE_B0
  880. mov GR_SAVE_B0=b0 // Save b0
  881. };;
  882. .body
  883. // (3)
  884. { .mib
  885. stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
  886. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
  887. nop.b 0
  888. }
  889. { .mib
  890. stfd [GR_Parameter_Y] = cosh_FR_pre_result // STORE Parameter 3 on stack
  891. add GR_Parameter_Y = -16,GR_Parameter_Y // Step back to the Parameter 2 slot
  892. br.call.sptk b0=__libm_error_support# // Call error handling function
  893. };;
  894. { .mmi
  895. nop.m 0
  896. nop.m 0
  897. add GR_Parameter_RESULT = 48,sp // Recompute the result address after the call
  898. };;
  899. // (4)
  900. { .mmi
  901. ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
  902. .restore
  903. add sp = 64,sp // Restore stack pointer
  904. mov b0 = GR_SAVE_B0 // Restore return address
  905. };;
  906. { .mib
  907. mov gp = GR_SAVE_GP // Restore gp
  908. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  909. br.ret.sptk b0 // Return
  910. };;
  911. .endp __libm_error_region
  912. .type __libm_error_support#,@function
  913. .global __libm_error_support#