Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1029 lines
30 KiB

  1. .file "coshf.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00 Initial version
  29. // 2/16/00 The error tag for coshf overflow changed to 65 (from 64).
  30. // 4/04/00 Unwind support added
  31. // 8/15/00 Bundle added after call to __libm_error_support to properly
  32. // set [the previously overwritten] GR_Parameter_RESULT.
  33. // 5/07/01 Reworked to improve speed of all paths
  34. //
  35. // API
  36. //==============================================================
  37. // float = coshf(float)
  38. // input floating point f8
  39. // output floating point f8
  40. //
  41. // Registers used
  42. //==============================================================
  43. // general registers:
  44. // r32 -> r47
  45. // predicate registers used:
  46. // p6 -> p11
  47. // floating-point registers used:
  48. // f9 -> f15; f32 -> f90;
  49. // f8 has input, then output
  50. //
  51. // Overview of operation
  52. //==============================================================
  53. // There are seven paths
  54. // 1. 0 < |x| < 0.25 COSH_BY_POLY
  55. // 2. 0.25 <=|x| < 32 COSH_BY_TBL
  56. // 3. 32 <= |x| < 89.415986 COSH_BY_EXP (merged path with COSH_BY_TBL)
  57. // 4. |x| >= 89.415986 COSH_HUGE
  58. // 5. x=0 Done with early exit
  59. // 6. x=inf,nan Done with early exit
  60. // 7. x=denormal COSH_DENORM
  61. //
  62. // For float we get overflow for x >= 4005 b2d4 fc27 c173 18a0
  63. // >= 89.415986
  64. //
  65. //
  66. // 1. COSH_BY_POLY 0 < |x| < 0.25
  67. // ===============
  68. // Evaluate cosh(x) by a 12th order polynomial
  69. // Care is taken with the order of multiplication; and P2 is not exactly 1/4!,
  70. // P3 is not exactly 1/6!, etc.
  71. // cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12)
  72. //
  73. // 2. COSH_BY_TBL 0.25 <= |x| < 32.0
  74. // =============
  75. // cosh(x) = cosh(B+R)
  76. // = cosh(B)cosh(R) + sinh(B)sinh(R)
  77. //
  78. // ax = |x| = M*log2/64 + R
  79. // B = M*log2/64
  80. // M = 64*N + j
  81. // We will calculate M and get N as (M-j)/64
  82. // The division is a shift.
  83. // exp(B) = exp(N*log2 + j*log2/64)
  84. // = 2^N * 2^(j*log2/64)
  85. // cosh(B) = 1/2(e^B + e^-B)
  86. // = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64))
  87. // cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
  88. // sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
  89. // 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
  90. // Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
  91. //
  92. // R = ax - M*log2/64
  93. // R = ax - M*log2_by_64_hi - M*log2_by_64_lo
  94. // exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
  95. // = 1 + p_odd + p_even
  96. // where p_odd uses the A coefficients and p_even uses
  97. // the B coefficients
  98. //
  99. // So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd
  100. // cosh(R) = 1 + p_even
  101. // cosh(B) = C_hi + C_lo
  102. // sinh(B) = S_hi
  103. // cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R)
  104. //
  105. // 3. COSH_BY_EXP 32.0 <= |x| < 89.415986 ( 4005 b2d4 fc27 c173 18a0 )
  106. // ==============
  107. // Can approximate result by exp(x)/2 in this region.
  108. // Y_hi = Tjhi
  109. // Y_lo = Tjhi * (p_odd + p_even) + Tjlo
  110. // cosh(x) = Y_hi + Y_lo
  111. //
  112. // 4. COSH_HUGE |x| >= 89.415986 ( 4005 b2d4 fc27 c173 18a0 )
  113. // ============
  114. // Set error tag and call error support
  115. //
  116. //
  117. // Assembly macros: symbolic names for the general and FP registers used below
  118. //==============================================================
  119. cosh_GR_ad1 = r34
  120. cosh_GR_Mmj = r34
  121. cosh_GR_jshf = r36
  122. cosh_GR_M = r35
  123. cosh_GR_N = r35
  124. cosh_GR_exp_2tom57 = r36
  125. cosh_GR_j = r36
  126. cosh_GR_joff = r36
  127. cosh_GR_exp_mask = r37
  128. cosh_GR_mJ = r38
  129. cosh_AD_mJ = r38
  130. cosh_GR_signexp_x = r38
  131. cosh_GR_signexp_0_5 = r38
  132. cosh_GR_exp_0_25 = r39
  133. cosh_GR_J = r39
  134. cosh_AD_J = r39
  135. cosh_GR_sig_inv_ln2 = r40
  136. cosh_GR_exp_32 = r40
  137. cosh_GR_exp_huge = r40
  138. cosh_GR_all_ones = r40
  139. cosh_GR_ad2e = r41
  140. cosh_GR_ad3 = r42
  141. cosh_GR_ad4 = r43
  142. cosh_GR_rshf = r44
  143. cosh_GR_ad2o = r45
  144. cosh_GR_rshf_2to57 = r46
  145. cosh_GR_exp_denorm = r46
  146. cosh_GR_exp_x = r47
  147. GR_SAVE_PFS = r41 // GR_SAVE_* hold ar.pfs, b0 and gp across the error-support call
  148. GR_SAVE_B0 = r42
  149. GR_SAVE_GP = r43
  150. GR_Parameter_X = r44 // GR_Parameter_* are the 4 output registers r44-r47 (alloc 0,12,4,0)
  151. GR_Parameter_Y = r45
  152. GR_Parameter_RESULT = r46
  153. GR_Parameter_TAG = r47
  154. cosh_FR_ABS_X = f9
  155. cosh_FR_X2 = f10
  156. cosh_FR_X4 = f11
  157. cosh_FR_all_ones = f13
  158. cosh_FR_tmp = f14
  159. cosh_FR_RSHF = f15
  160. cosh_FR_Inv_log2by64 = f32
  161. cosh_FR_log2by64_lo = f33
  162. cosh_FR_log2by64_hi = f34
  163. cosh_FR_A1 = f35
  164. cosh_FR_A2 = f36
  165. cosh_FR_A3 = f37
  166. cosh_FR_Rcub = f38
  167. cosh_FR_M_temp = f39
  168. cosh_FR_R_temp = f40
  169. cosh_FR_Rsq = f41
  170. cosh_FR_R = f42
  171. cosh_FR_M = f43
  172. cosh_FR_B1 = f44
  173. cosh_FR_B2 = f45
  174. cosh_FR_B3 = f46
  175. cosh_FR_peven_temp1 = f47
  176. cosh_FR_peven_temp2 = f48
  177. cosh_FR_peven = f49
  178. cosh_FR_podd_temp1 = f50
  179. cosh_FR_podd_temp2 = f51
  180. cosh_FR_podd = f52
  181. cosh_FR_poly65 = f53
  182. cosh_FR_poly6543 = f53
  183. cosh_FR_poly6to1 = f53
  184. cosh_FR_poly43 = f54
  185. cosh_FR_poly21 = f55
  186. cosh_FR_INV_LN2_2TO63= f57
  187. cosh_FR_RSHF_2TO57 = f58
  188. cosh_FR_2TOM57 = f59
  189. cosh_FR_smlst_oflow_input = f60
  190. cosh_FR_pre_result = f61
  192. cosh_FR_spos = f63
  193. cosh_FR_sneg = f64
  194. cosh_FR_Tjhi = f65
  195. cosh_FR_Tjlo = f66
  196. cosh_FR_Tmjhi = f67
  197. cosh_FR_Tmjlo = f68
  198. cosh_FR_S_hi = f69
  199. cosh_FR_SC_hi_temp = f70
  200. cosh_FR_C_lo_temp1 = f71
  201. cosh_FR_C_lo_temp2 = f72
  202. cosh_FR_C_lo_temp3 = f73
  203. cosh_FR_C_lo_temp4 = f73
  204. cosh_FR_C_lo = f74
  205. cosh_FR_C_hi = f75
  206. cosh_FR_C_hi_temp1 = f76
  207. cosh_FR_Y_hi = f77
  208. cosh_FR_Y_lo_temp = f78
  209. cosh_FR_Y_lo = f79
  210. cosh_FR_NORM_X = f80
  211. cosh_FR_P1 = f81
  212. cosh_FR_P2 = f82
  213. cosh_FR_P3 = f83
  214. cosh_FR_P4 = f84
  215. cosh_FR_P5 = f85
  216. cosh_FR_P6 = f86
  217. cosh_FR_Tjhi_spos = f87
  218. cosh_FR_Tjlo_spos = f88
  219. cosh_FR_huge = f89 // Large scale factor built in COSH_HUGE; single definition of this name
  220. cosh_FR_signed_hi_lo = f90
  221. // Data tables -- constants for argument reduction, the polynomials and 2^(j/64)
  222. //==============================================================
  223. // DO NOT CHANGE ORDER OF THESE TABLES: the code reaches them by fixed byte offsets from double_cosh_arg_reduction
  224. .data
  225. .align 16
  226. double_cosh_arg_reduction: // Base address; each entry below is a 16-byte (ldfe) slot
  227. // data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf
  228. data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part
  229. data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part
  230. double_cosh_p_table: // Base + 0x20; even loads walk P6,P4,P2 and odd loads walk P5,P3,P1
  231. data8 0xb2d4fc27c17318a0, 0x00004005 // Smallest x to overflow (89.415986)
  232. data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6
  233. data8 0xD00D00D1021D7370, 0x00003FEF // P4
  234. data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2
  235. data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5
  236. data8 0xB60B60B60B4FE884, 0x00003FF5 // P3
  237. data8 0x8000000000000000, 0x00003FFE // P1
  238. double_cosh_ab_table: // Base + 0x90; read sequentially as A1,A2,A3,B1,B2,B3
  239. data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1
  240. data8 0x88888888884ECDD5, 0x00003FF8 // A2
  241. data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3
  242. data8 0x8000000000000002, 0x00003FFE // B1
  243. data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2
  244. data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3
  245. double_cosh_j_table: // 32-byte entries: Tjhi (ldfe) at +0, Tjlo (ldfs) at +16; midpoint is base + 0x4f0
  246. data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 // j = -32
  247. data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
  248. data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
  249. data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
  250. data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
  251. data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
  252. data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
  253. data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
  254. data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
  255. data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
  256. data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
  257. data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
  258. data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
  259. data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
  260. data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
  261. data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
  262. data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
  263. data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
  264. data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
  265. data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
  266. data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
  267. data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
  268. data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
  269. data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
  270. data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
  271. data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
  272. data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
  273. data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
  274. data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
  275. data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
  276. data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
  277. data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
  278. data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 // j = 0 (table midpoint): exactly 1.0
  279. data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
  280. data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
  281. data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
  282. data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
  283. data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
  284. data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
  285. data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
  286. data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
  287. data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
  288. data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
  289. data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
  290. data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
  291. data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
  292. data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
  293. data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
  294. data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
  295. data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
  296. data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
  297. data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
  298. data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
  299. data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
  300. data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
  301. data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
  302. data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
  303. data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
  304. data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
  305. data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
  306. data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
  307. data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
  308. data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
  309. data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
  310. data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 // j = +32 (needed for the -j lookup when j = -32)
  311. .align 32
  312. .global coshf#
  313. .section .text
  314. .proc coshf#
  315. .align 32
  316. coshf:
  317. { .mlx
  318. alloc r32 = ar.pfs,0,12,4,0 // 0 inputs, 12 locals (r32-r43), 4 outputs (r44-r47)
  319. movl cosh_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
  320. }
  321. { .mlx
  322. addl cosh_GR_ad1 = @ltoff(double_cosh_arg_reduction), gp // Linkage-table slot for the data tables
  323. movl cosh_GR_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
  324. }
  325. ;;
  326. { .mfi
  327. ld8 cosh_GR_ad1 = [cosh_GR_ad1] // Address of double_cosh_arg_reduction
  328. fmerge.s cosh_FR_ABS_X = f0,f8 // ax = |x| (sign taken from f0)
  329. mov cosh_GR_exp_0_25 = 0x0fffd // Form exponent for 0.25
  330. }
  331. { .mfi
  332. nop.m 999
  333. fnorm.s1 cosh_FR_NORM_X = f8 // Normalized copy of x, consumed by COSH_DENORM
  334. mov cosh_GR_exp_2tom57 = 0xffff-57 // Biased exponent of 2^-57
  335. }
  336. ;;
  337. { .mfi
  338. setf.d cosh_FR_RSHF_2TO57 = cosh_GR_rshf_2to57 // Form const 1.100 * 2^120
  339. fclass.m p10,p0 = f8, 0x0b // Test for denorm
  340. mov cosh_GR_exp_mask = 0x1ffff // Mask that keeps the exponent bits of a getf.exp result
  341. }
  342. { .mlx
  343. setf.sig cosh_FR_INV_LN2_2TO63 = cosh_GR_sig_inv_ln2 // Form 1/ln2 * 2^63
  344. movl cosh_GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
  345. }
  346. ;;
  347. { .mfi
  348. getf.exp cosh_GR_signexp_x = f8 // Extract signexp of x
  349. fclass.m p7,p0 = f8, 0x07 // Test if x=0
  350. nop.i 999
  351. }
  352. { .mfi
  353. setf.exp cosh_FR_2TOM57 = cosh_GR_exp_2tom57 // Form 2^-57 for scaling
  354. nop.f 999
  355. add cosh_GR_ad3 = 0x90, cosh_GR_ad1 // Point to ab_table
  356. }
  357. ;;
  358. { .mfi
  359. setf.d cosh_FR_RSHF = cosh_GR_rshf // Form right shift const 1.100 * 2^63
  360. fclass.m p6,p0 = f8, 0xc3 // Test if x nan
  361. add cosh_GR_ad4 = 0x4f0, cosh_GR_ad1 // Point to j_table midpoint
  362. }
  363. { .mib
  364. add cosh_GR_ad2e = 0x20, cosh_GR_ad1 // Point to p_table
  365. mov cosh_GR_all_ones = -1 // Bits for the inexact-raising constant formed in COSH_COMMON
  366. (p10) br.cond.spnt COSH_DENORM // Branch if x denorm
  367. }
  368. ;;
  369. // Common path -- return here from COSH_DENORM if x is unnorm; handles x=0, nan and picks POLY vs TBL
  370. COSH_COMMON:
  371. { .mfi
  372. ldfe cosh_FR_smlst_oflow_input = [cosh_GR_ad2e],16 // Overflow threshold; ad2e advances to P6
  373. fclass.m p10,p0 = f8, 0x23 // Test if x inf
  374. and cosh_GR_exp_x = cosh_GR_exp_mask, cosh_GR_signexp_x // Exponent of x with the sign bit masked off
  375. }
  376. { .mfb
  377. ldfe cosh_FR_log2by64_hi = [cosh_GR_ad1],16
  378. (p7) fma.s.s0 f8 = f1,f1,f0 // If x=0, result is 1.0
  379. (p7) br.ret.spnt b0 // Exit if x=0
  380. }
  381. ;;
  382. { .mfi
  383. // Make constant that will generate inexact when squared
  384. setf.sig cosh_FR_all_ones = cosh_GR_all_ones
  385. nop.f 999
  386. cmp.ge p7,p0 = cosh_GR_exp_x, cosh_GR_exp_0_25 // p7 set if |x| >= 0.25 (take COSH_BY_TBL)
  387. }
  388. { .mfb
  389. add cosh_GR_ad2o = 0x30, cosh_GR_ad2e // Point to p_table odd coeffs
  390. (p6) fma.s.s0 f8 = f8,f1,f8 // If x nan, return quietized nan
  391. (p6) br.ret.spnt b0 // Exit for x nan
  392. }
  393. ;;
  394. // Get the A coefficients for COSH_BY_TBL
  395. // Calculate X2 = ax*ax for COSH_BY_POLY
  396. { .mfi
  397. ldfe cosh_FR_log2by64_lo = [cosh_GR_ad1],16
  398. (p10) fmerge.s f8 = f0, f8 // If x inf, result is +inf
  399. nop.i 999
  400. }
  401. { .mfb
  402. ldfe cosh_FR_A1 = [cosh_GR_ad3],16
  403. fma.s1 cosh_FR_X2 = cosh_FR_ABS_X, cosh_FR_ABS_X, f0
  404. (p7) br.cond.sptk COSH_BY_TBL // |x| >= 0.25 (inf included; it exits at the top of COSH_BY_TBL)
  405. }
  406. ;;
  407. // Here if 0 < |x| < 0.25: result = 1 + X2*(P1 + X2*P2 + ... + X2^5*P6)
  408. COSH_BY_POLY:
  409. { .mmf
  410. ldfe cosh_FR_P6 = [cosh_GR_ad2e],16
  411. ldfe cosh_FR_P5 = [cosh_GR_ad2o],16
  412. nop.f 999
  413. }
  414. ;;
  415. { .mmi
  416. ldfe cosh_FR_P4 = [cosh_GR_ad2e],16
  417. ldfe cosh_FR_P3 = [cosh_GR_ad2o],16
  418. nop.i 999
  419. }
  420. ;;
  421. { .mmi
  422. ldfe cosh_FR_P2 = [cosh_GR_ad2e],16
  423. ldfe cosh_FR_P1 = [cosh_GR_ad2o],16
  424. nop.i 999
  425. }
  426. ;;
  427. { .mfi
  428. nop.m 999
  429. fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0 // X4 = x^4
  430. nop.i 999
  431. }
  432. ;;
  433. { .mfi
  434. nop.m 999
  435. fma.s1 cosh_FR_poly65 = cosh_FR_X2, cosh_FR_P6, cosh_FR_P5 // P5 + X2*P6
  436. nop.i 999
  437. }
  438. { .mfi
  439. nop.m 999
  440. fma.s1 cosh_FR_poly43 = cosh_FR_X2, cosh_FR_P4, cosh_FR_P3 // P3 + X2*P4
  441. nop.i 999
  442. }
  443. ;;
  444. { .mfi
  445. nop.m 999
  446. fma.s1 cosh_FR_poly21 = cosh_FR_X2, cosh_FR_P2, cosh_FR_P1 // P1 + X2*P2
  447. nop.i 999
  448. }
  449. ;;
  450. { .mfi
  451. nop.m 999
  452. fma.s1 cosh_FR_poly6543 = cosh_FR_X4, cosh_FR_poly65, cosh_FR_poly43 // poly43 + X4*poly65
  453. nop.i 999
  454. }
  455. ;;
  456. { .mfi
  457. nop.m 999
  458. fma.s1 cosh_FR_poly6to1 = cosh_FR_X4, cosh_FR_poly6543, cosh_FR_poly21 // poly21 + X4*poly6543
  459. nop.i 999
  460. }
  461. ;;
  462. // Dummy multiply to generate inexact
  463. { .mfi
  464. nop.m 999
  465. fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
  466. nop.i 999
  467. }
  468. { .mfb
  469. nop.m 999
  470. fma.s.s0 f8 = cosh_FR_poly6to1, cosh_FR_X2, f1 // result = 1 + X2*poly6to1, rounded to single
  471. br.ret.sptk b0 // Exit COSH_BY_POLY
  472. }
  473. ;;
  474. // Here if |x| >= 0.25 (merged COSH_BY_TBL / COSH_BY_EXP path; inf exits first, overflow goes to COSH_HUGE)
  475. COSH_BY_TBL:
  476. // ******************************************************
  477. // STEP 1 (TBL and EXP) - Argument reduction
  478. // ******************************************************
  479. // Get the following constants.
  480. // Inv_log2by64
  481. // log2by64_hi
  482. // log2by64_lo
  483. // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
  484. // put them in an exponent.
  485. // cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1)
  486. // 0xffff + (N-1) = 0xffff +N -1
  487. // 0xffff - (N +1) = 0xffff -N -1
  488. // Calculate M and keep it as integer and floating point.
  489. // M = round-to-integer(ax*Inv_log2by64)
  490. // cosh_FR_M = M = round-to-integer(ax/(log2/64))
  491. // Put the integer representation of M in cosh_GR_M
  492. // and the floating point representation of M in cosh_FR_M
  493. // Get the remaining A,B coefficients
  494. { .mfb
  495. ldfe cosh_FR_A2 = [cosh_GR_ad3],16
  496. nop.f 999
  497. (p10) br.ret.spnt b0 // Exit if x inf
  498. }
  499. ;;
  500. { .mmi
  501. ldfe cosh_FR_A3 = [cosh_GR_ad3],16 ;;
  502. ldfe cosh_FR_B1 = [cosh_GR_ad3],16
  503. nop.i 999
  504. }
  505. ;;
  506. // Use constant (1.100*2^(63+(63-6))) to get rounded M into rightmost significand
  507. // |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
  508. { .mfi
  509. nop.m 999
  510. fma.s1 cosh_FR_M_temp = cosh_FR_ABS_X, cosh_FR_INV_LN2_2TO63, cosh_FR_RSHF_2TO57
  511. mov cosh_GR_signexp_0_5 = 0x0fffe // signexp of +0.5
  512. }
  513. ;;
  514. // Test for |x| >= overflow limit
  515. { .mfi
  516. nop.m 999
  517. fcmp.ge.s1 p6,p0 = cosh_FR_ABS_X, cosh_FR_smlst_oflow_input
  518. nop.i 999
  519. }
  520. ;;
  521. { .mfi
  522. ldfe cosh_FR_B2 = [cosh_GR_ad3],16
  523. nop.f 999
  524. nop.i 999
  525. }
  526. ;;
  527. // Subtract RSHF constant to get rounded M as a floating point value
  528. // M_temp * 2^-(63-6) - 1.100 * 2^63
  529. { .mfb
  530. ldfe cosh_FR_B3 = [cosh_GR_ad3],16
  531. fms.s1 cosh_FR_M = cosh_FR_M_temp, cosh_FR_2TOM57, cosh_FR_RSHF
  532. (p6) br.cond.spnt COSH_HUGE // Branch if result will overflow
  533. }
  534. ;;
  535. { .mfi
  536. getf.sig cosh_GR_M = cosh_FR_M_temp // Integer M sits in the low bits of the significand
  537. nop.f 999
  538. nop.i 999
  539. }
  540. ;;
  541. // Calculate j. j is the signed extension of the six lsb of M. It
  542. // has a range of -32 thru 31.
  543. // Calculate R
  544. // ax - M*log2by64_hi
  545. // R = (ax - M*log2by64_hi) - M*log2by64_lo
  546. { .mfi
  547. nop.m 999
  548. fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_ABS_X
  549. and cosh_GR_j = 0x3f, cosh_GR_M // Low six bits of M
  550. }
  551. ;;
  552. { .mii
  553. nop.m 999
  554. shl cosh_GR_jshf = cosh_GR_j, 0x2 ;; // Shift j so can sign extend it
  555. sxt1 cosh_GR_jshf = cosh_GR_jshf // Sign extend from bit 7 (bit 5 of j before the shift)
  556. }
  557. ;;
  558. // N = (M-j)/64
  559. { .mii
  560. mov cosh_GR_exp_32 = 0x10004 // Biased exponent of 32 = 2^5
  561. shr cosh_GR_j = cosh_GR_jshf, 0x2 ;; // Now j has range -32 to 31
  562. sub cosh_GR_Mmj = cosh_GR_M, cosh_GR_j ;; // M-j
  563. }
  564. ;;
  565. // The TBL and EXP branches are merged and predicated
  566. // If TBL, p6 true, 0.25 <= |x| < 32
  567. // If EXP, p7 true, 32 <= |x| < overflow_limit
  568. //
  569. { .mfi
  570. cmp.ge p7,p6 = cosh_GR_exp_x, cosh_GR_exp_32 // Test if x >= 32
  571. fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp
  572. shr cosh_GR_N = cosh_GR_Mmj, 0x6 // N = (M-j)/64
  573. }
  574. ;;
  575. { .mmi
  576. sub r40 = cosh_GR_signexp_0_5, cosh_GR_N // signexp of 2^(-N-1)
  577. add r39 = cosh_GR_signexp_0_5, cosh_GR_N // signexp of 2^(N-1)
  578. shl cosh_GR_joff = cosh_GR_j,5 // Make j offset to j_table
  579. }
  580. ;;
  581. { .mfi
  582. setf.exp cosh_FR_spos = r39 // Form 2^(N-1)
  583. nop.f 999
  584. sub cosh_GR_mJ = r0, cosh_GR_joff // Table offset for -j
  585. }
  586. { .mfi
  587. setf.exp cosh_FR_sneg = r40 // Form 2^(-N-1)
  588. nop.f 999
  589. add cosh_GR_J = r0, cosh_GR_joff // Table offset for +j
  590. }
  591. ;;
  592. // Get the address of the J table midpoint, add the offset
  593. { .mmf
  594. add cosh_AD_mJ = cosh_GR_ad4, cosh_GR_mJ
  595. add cosh_AD_J = cosh_GR_ad4, cosh_GR_J
  596. nop.f 999
  597. }
  598. ;;
  599. { .mmf
  600. ldfe cosh_FR_Tmjhi = [cosh_AD_mJ],16 // High part of the -j entry; pointer advances to its low part
  601. ldfe cosh_FR_Tjhi = [cosh_AD_J],16 // High part of the +j entry; pointer advances to its low part
  602. nop.f 999
  603. }
  604. ;;
  605. // ******************************************************
  606. // STEP 2 (TBL and EXP)
  607. // ******************************************************
  608. // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
  609. { .mmf
  610. ldfs cosh_FR_Tmjlo = [cosh_AD_mJ],16
  611. ldfs cosh_FR_Tjlo = [cosh_AD_J],16
  612. fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0
  613. }
  614. ;;
  615. // Calculate p_even
  616. // B_2 + Rsq *B_3
  617. // B_1 + Rsq * (B_2 + Rsq *B_3)
  618. // p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
  619. { .mfi
  620. nop.m 999
  621. fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2
  622. nop.i 999
  623. }
  624. // Calculate p_odd
  625. // A_2 + Rsq *A_3
  626. // A_1 + Rsq * (A_2 + Rsq *A_3)
  627. // podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
  628. { .mfi
  629. nop.m 999
  630. fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2
  631. nop.i 999
  632. }
  633. ;;
  634. { .mfi
  635. nop.m 999
  636. fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0
  637. nop.i 999
  638. }
  639. ;;
  640. //
  641. // If TBL,
  642. // Calculate C_hi and C_lo, and S_hi
  643. // SC_hi_temp = sneg * Tmjhi
  644. // S_hi = spos * Tjhi - SC_hi_temp
  645. // S_hi = spos * Tjhi - (sneg * Tmjhi)
  646. // C_hi = spos * Tjhi + SC_hi_temp
  647. // C_hi = spos * Tjhi + (sneg * Tmjhi)
  648. { .mfi
  649. nop.m 999
  650. (p6) fma.s1 cosh_FR_SC_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0
  651. nop.i 999
  652. }
  653. ;;
  654. // If TBL,
  655. // C_lo_temp3 = sneg * Tmjlo
  656. // C_lo_temp4 = spos * Tjlo + C_lo_temp3
  657. // C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo)
  658. { .mfi
  659. nop.m 999
  660. (p6) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_sneg, cosh_FR_Tmjlo, f0
  661. nop.i 999
  662. }
  663. ;;
  664. { .mfi
  665. nop.m 999
  666. fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1
  667. nop.i 999
  668. }
  669. { .mfi
  670. nop.m 999
  671. fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1
  672. nop.i 999
  673. }
  674. ;;
  675. // If EXP,
  676. // Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo
  677. { .mfi
  678. nop.m 999
  679. (p7) fma.s1 cosh_FR_Tjhi_spos = cosh_FR_Tjhi, cosh_FR_spos, f0
  680. nop.i 999
  681. }
  682. { .mfi
  683. nop.m 999
  684. (p7) fma.s1 cosh_FR_Tjlo_spos = cosh_FR_Tjlo, cosh_FR_spos, f0
  685. nop.i 999
  686. }
  687. ;;
  688. { .mfi
  689. nop.m 999
  690. (p6) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_SC_hi_temp
  691. nop.i 999
  692. }
  693. ;;
  694. { .mfi
  695. nop.m 999
  696. (p6) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_SC_hi_temp
  697. nop.i 999
  698. }
  699. { .mfi
  700. nop.m 999
  701. (p6) fma.s1 cosh_FR_C_lo_temp4 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp3
  702. nop.i 999
  703. }
  704. ;;
  705. { .mfi
  706. nop.m 999
  707. fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0
  708. nop.i 999
  709. }
  710. { .mfi
  711. nop.m 999
  712. fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R
  713. nop.i 999
  714. }
  715. ;;
  716. // If TBL,
  717. // C_lo_temp1 = spos * Tjhi - C_hi
  718. // C_lo_temp2 = sneg * Tmjhi + C_lo_temp1
  719. // C_lo_temp2 = sneg * Tmjhi + (spos * Tjhi - C_hi)
  720. { .mfi
  721. nop.m 999
  722. (p6) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi
  723. nop.i 999
  724. }
  725. ;;
  726. { .mfi
  727. nop.m 999
  728. (p6) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1
  729. nop.i 999
  730. }
  731. ;;
  732. // If EXP,
  733. // Y_hi = 2^(N-1) * Tjhi
  734. // Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo
  735. { .mfi
  736. nop.m 999
  737. (p7) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd
  738. nop.i 999
  739. }
  740. ;;
  741. // If TBL,
  742. // C_lo = C_lo_temp4 + C_lo_temp2
  743. { .mfi
  744. nop.m 999
  745. (p6) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp4, f1, cosh_FR_C_lo_temp2
  746. nop.i 999
  747. }
  748. ;;
  749. // If TBL,
  750. // Y_hi = C_hi
  751. // Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo)
  752. { .mfi
  753. nop.m 999
  754. (p6) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo
  755. nop.i 999
  756. }
  757. ;;
  758. { .mfi
  759. nop.m 999
  760. (p7) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi_spos, cosh_FR_Y_lo_temp, cosh_FR_Tjlo_spos
  761. nop.i 999
  762. }
  763. ;;
  764. // Dummy multiply to generate inexact
  765. { .mfi
  766. nop.m 999
  767. fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
  768. nop.i 999
  769. }
  770. { .mfi
  771. nop.m 999
  772. (p6) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp
  773. nop.i 999
  774. }
  775. ;;
  776. // EXP case: f8 = answer = Y_hi + Y_lo, with Y_hi = 2^(N-1) * Tjhi
  777. { .mfi
  778. nop.m 999
  779. (p7) fma.s.s0 f8 = cosh_FR_Y_lo, f1, cosh_FR_Tjhi_spos
  780. nop.i 999
  781. }
  782. ;;
  783. // TBL case: f8 = answer = Y_hi + Y_lo, with Y_hi = C_hi
  784. { .mfb
  785. nop.m 999
  786. (p6) fma.s.s0 f8 = cosh_FR_Y_lo, f1, cosh_FR_C_hi
  787. br.ret.sptk b0 // Exit for COSH_BY_TBL and COSH_BY_EXP
  788. }
  789. ;;
  790. // Here if x denorm or unorm: true denormals return 1+x^2, unnormals rejoin COSH_COMMON
  791. COSH_DENORM:
  792. // Determine if x really a denorm and not a unorm
  793. { .mmf
  794. getf.exp cosh_GR_signexp_x = cosh_FR_NORM_X // Re-read signexp from the normalized copy of x
  795. mov cosh_GR_exp_denorm = 0x0ff81 // Real denorms will have exp < this
  796. fmerge.s cosh_FR_ABS_X = f0, cosh_FR_NORM_X // Recompute |x| from the normalized copy
  797. }
  798. ;;
  799. { .mfi
  800. nop.m 999
  801. fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag
  802. nop.i 999
  803. }
  804. ;;
  805. // Set p8 if really a denorm
  806. { .mmi
  807. and cosh_GR_exp_x = cosh_GR_exp_mask, cosh_GR_signexp_x ;;
  808. cmp.lt p8,p9 = cosh_GR_exp_x, cosh_GR_exp_denorm // p8 = denorm, p9 = unnorm
  809. nop.i 999
  810. }
  811. ;;
  812. { .mfb
  813. nop.m 999
  814. (p8) fma.s.s0 f8 = f8,f8,f1 // If x denorm, result=1+x^2
  815. (p9) br.cond.sptk COSH_COMMON // Return to main path if x unorm
  816. }
  817. ;;
  818. { .mfb
  819. nop.m 999
  820. nop.f 999
  821. br.ret.sptk b0 // Exit if x denorm
  822. }
  823. ;;
  824. // Here if |x| >= overflow limit: build an overflowing result, set the error tag, then fall into the error region
  825. COSH_HUGE:
  826. // for COSH_HUGE, put 24000 in exponent
  827. { .mmi
  828. mov cosh_GR_exp_huge = 0x15dbf ;; // 0xffff + 24000
  829. setf.exp cosh_FR_huge = cosh_GR_exp_huge
  830. nop.i 999
  831. }
  832. ;;
  833. { .mfi
  834. nop.m 999
  835. fma.s1 cosh_FR_signed_hi_lo = cosh_FR_huge, f1, f1 // huge + 1
  836. nop.i 999
  837. }
  838. ;;
  839. { .mfi
  840. nop.m 999
  841. fma.s.s0 cosh_FR_pre_result = cosh_FR_signed_hi_lo, cosh_FR_huge, f0 // (huge + 1) * huge, rounded to single in s0
  842. mov GR_Parameter_TAG = 65 // Error tag for coshf overflow (see History)
  843. }
  844. ;;
  845. .endp coshf // No branch precedes this: execution continues into __libm_error_region
  846. // Stack operations when calling error support (reached by fall-through from COSH_HUGE).
  847. // (1) (2) (3) (call) (4)
  848. // sp -> + psp -> + psp -> + sp -> +
  849. // | | | |
  850. // | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
  851. // | | | |
  852. // | <-GR_Y Y2->| Y2 ->| <- GR_Y |
  853. // | | | |
  854. // | | <- GR_X X1 ->| |
  855. // | | | |
  856. // sp-64 -> + sp -> + sp -> + +
  857. // save ar.pfs save b0 restore gp
  858. // save gp restore ar.pfs
  859. .proc __libm_error_region
  860. __libm_error_region:
  861. COSH_ERROR_SUPPORT:
  862. .prologue
  863. // (1)
  864. { .mfi
  865. add GR_Parameter_Y=-32,sp // Parameter 2 value
  866. nop.f 0
  867. .save ar.pfs,GR_SAVE_PFS
  868. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  869. }
  870. { .mfi
  871. .fframe 64
  872. add sp=-64,sp // Create new stack
  873. nop.f 0
  874. mov GR_SAVE_GP=gp // Save gp
  875. };;
  876. // (2)
  877. { .mmi
  878. stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
  879. add GR_Parameter_X = 16,sp // Parameter 1 address
  880. .save b0, GR_SAVE_B0
  881. mov GR_SAVE_B0=b0 // Save b0
  882. };;
  883. .body
  884. // (3)
  885. { .mib
  886. stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
  887. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
  888. nop.b 0
  889. }
  890. { .mib
  891. stfs [GR_Parameter_Y] = cosh_FR_pre_result // STORE Parameter 3 on stack
  892. add GR_Parameter_Y = -16,GR_Parameter_Y
  893. br.call.sptk b0=__libm_error_support# // Call error handling function
  894. };;
  895. { .mmi
  896. nop.m 0
  897. nop.m 0
  898. add GR_Parameter_RESULT = 48,sp // Re-form the result address after the call (see History 8/15/00)
  899. };;
  900. // (4)
  901. { .mmi
  902. ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
  903. .restore
  904. add sp = 64,sp // Restore stack pointer
  905. mov b0 = GR_SAVE_B0 // Restore return address
  906. };;
  907. { .mib
  908. mov gp = GR_SAVE_GP // Restore gp
  909. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  910. br.ret.sptk b0 // Return
  911. };;
  912. .endp __libm_error_region
  913. .type __libm_error_support#,@function // External error handler called above
  914. .global __libm_error_support#