Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1121 lines
31 KiB

  1. .file "cosh.s"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00 Initial version
  29. // 4/04/00 Unwind support added
  30. // 8/15/00 Bundle added after call to __libm_error_support to properly
  31. // set [the previously overwritten] GR_Parameter_RESULT.
  32. //
  33. // API
  34. //==============================================================
  35. // double = cosh(double)
  36. // input floating point f8
  37. // output floating point f8
  38. // Overview of operation
  39. //==============================================================
  40. // There are four paths
  41. // 1. |x| < 0.25 COSH_BY_POLY
  42. // 2. |x| < 32 COSH_BY_TBL
  43. // 3. |x| < 2^10 COSH_BY_EXP (this double version compares against exponent 0x10009 = 2^10)
  44. // 4. |x| >= 2^10 COSH_HUGE
  45. // For paths 1, and 2 SAFE is always 1.
  46. // For path 4, Safe is always 0.
  47. // SAFE = 1 means we cannot overflow.
  48. // Assembly macros
  49. //==============================================================
  50. cosh_FR_X = f44 // |x| (set by fmerge.s f0,f8 at entry); f44 also receives the EXP/HUGE result
  51. cosh_FR_SGNX = f40 // written once at entry; not read in the code visible in this listing
  52. cosh_FR_Inv_log2by64 = f9 // f9-f11: argument-reduction constants (TBL/EXP paths)
  53. cosh_FR_log2by64_lo = f11
  54. cosh_FR_log2by64_hi = f10
  55. cosh_FR_A1 = f9 // f9-f11 are reused for the p_odd coefficients A1..A3
  56. cosh_FR_A2 = f10
  57. cosh_FR_A3 = f11
  58. cosh_FR_Rcub = f12 // R*R*R
  59. cosh_FR_M_temp = f13 // f13 holds M_temp, then R_temp, then Rsq, one after another
  60. cosh_FR_R_temp = f13
  61. cosh_FR_Rsq = f13
  62. cosh_FR_R = f14 // reduced argument R
  63. cosh_FR_M = f38
  64. cosh_FR_B1 = f15 // f15, f32, f33: p_even coefficients B1..B3
  65. cosh_FR_B2 = f32
  66. cosh_FR_B3 = f33
  67. cosh_FR_peven_temp1 = f34 // f34/f35 are shared by the p_even and p_odd temporaries
  68. cosh_FR_peven_temp2 = f35
  69. cosh_FR_peven = f36
  70. cosh_FR_podd_temp1 = f34
  71. cosh_FR_podd_temp2 = f35
  72. cosh_FR_podd = f37
  73. cosh_FR_J_temp = f9 // J_temp, J, Mmj: not referenced in the code visible in this listing
  74. cosh_FR_J = f10
  75. cosh_FR_Mmj = f39
  76. cosh_FR_N_temp1 = f11 // setf.exp target holding the biased exponent for spos
  77. cosh_FR_N_temp2 = f12 // setf.exp target holding the biased exponent for sneg
  78. cosh_FR_N = f13
  79. cosh_FR_spos = f14 // 2^(N-1); same register as cosh_FR_R
  80. cosh_FR_sneg = f15 // 2^(-N-1); same register as cosh_FR_B1
  81. cosh_FR_Tjhi = f32 // table values for +j and -j: hi parts loaded with ldfe, lo parts with ldfs
  82. cosh_FR_Tjlo = f33
  83. cosh_FR_Tmjhi = f34
  84. cosh_FR_Tmjlo = f35
  85. GR_mJ = r35 // table index, then byte offset, for -j
  86. GR_J = r36 // table index, then byte offset, for +j
  87. AD_mJ = r38 // address of the table entry for -j
  88. AD_J = r39 // address of the table entry for +j
  89. cosh_FR_C_hi = f9 // C_* / S_hi: hi/lo pieces combined in the TBL path
  90. cosh_FR_C_hi_temp = f10
  91. cosh_FR_C_lo_temp1 = f11
  92. cosh_FR_C_lo_temp2 = f12
  93. cosh_FR_C_lo_temp3 = f13
  94. cosh_FR_C_lo = f38
  95. cosh_FR_S_hi = f39
  96. cosh_FR_S_hi_temp1 = f10 // same register as cosh_FR_C_hi_temp
  97. cosh_FR_Y_hi = f11
  98. cosh_FR_Y_lo_temp = f12
  99. cosh_FR_Y_lo = f13
  100. cosh_FR_COSH = f9
  101. cosh_FR_X2 = f9 // POLY path: |x|^2
  102. cosh_FR_X4 = f10 // POLY path: |x|^4
  103. cosh_FR_P1 = f14 // POLY path coefficients P1..P6
  104. cosh_FR_P2 = f15
  105. cosh_FR_P3 = f32
  106. cosh_FR_P4 = f33
  107. cosh_FR_P5 = f34
  108. cosh_FR_P6 = f35
  109. cosh_FR_TINY_THRESH = f9
  110. cosh_FR_COSH_temp = f10 // EXP path: unscaled result before multiplying by spos
  111. cosh_FR_SCALE = f11
  112. cosh_FR_hi_lo = f10 // HUGE path temporary
  113. cosh_FR_poly_podd_temp1 = f11 // POLY path temporaries (f11, f13 shared between odd and even parts)
  114. cosh_FR_poly_podd_temp2 = f13
  115. cosh_FR_poly_peven_temp1 = f11
  116. cosh_FR_poly_peven_temp2 = f13
  117. GR_SAVE_PFS = r41 // r41-r43: ar.pfs, b0 and gp saved around the __libm_error_support call
  118. GR_SAVE_B0 = r42
  119. GR_SAVE_GP = r43
  120. GR_Parameter_X = r44 // r44-r46: stack addresses set up for the __libm_error_support call
  121. GR_Parameter_Y = r45
  122. GR_Parameter_RESULT = r46
  123. // Data tables
  124. //==============================================================
  125. .data
  126. .align 16
  127. double_cosh_arg_reduction: // three 16-byte entries (significand, sign/exponent) read with ldfe in this order
  128. data8 0xB8AA3B295C17F0BC, 0x00004005 // Inv_log2by64 (about 64/ln 2)
  129. data8 0xB17217F7D1000000, 0x00003FF8 // log2by64_hi (leading part of ln 2 / 64)
  130. data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2by64_lo (low-order correction)
  131. double_cosh_p_table: // P1..P6 for the |x| < 0.25 path; P_k multiplies x^(2k) and is loaded in this order with ldfe
  132. data8 0x8000000000000000, 0x00003FFE // P1 = 0.5
  133. data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2 (about 1/24)
  134. data8 0xB60B60B60B4FE884, 0x00003FF5 // P3 (about 1/720)
  135. data8 0xD00D00D1021D7370, 0x00003FEF // P4
  136. data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5
  137. data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6
  138. double_cosh_ab_table: // A1..A3 (p_odd) followed by B1..B3 (p_even), loaded in this order with ldfe
  139. data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 (about 1/6)
  140. data8 0x88888888884ECDD5, 0x00003FF8 // A2 (about 1/120)
  141. data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3
  142. data8 0x8000000000000002, 0x00003FFE // B1 (about 0.5)
  143. data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 (about 1/24)
  144. data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3
  145. double_cosh_j_table: // 65 entries of 32 bytes, index = j + 32: hi part (ldfe), then lo part as single-precision bits (ldfs), then padding
  146. data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 // j = -32 (about 2^(-32/64))
  147. data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
  148. data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
  149. data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
  150. data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
  151. data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
  152. data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
  153. data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
  154. data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
  155. data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
  156. data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
  157. data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
  158. data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
  159. data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
  160. data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
  161. data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
  162. data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
  163. data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
  164. data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
  165. data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
  166. data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
  167. data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
  168. data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
  169. data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
  170. data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
  171. data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
  172. data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
  173. data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
  174. data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
  175. data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
  176. data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
  177. data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
  178. data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 // j = 0: exactly 1.0, zero low part
  179. data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
  180. data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
  181. data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
  182. data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
  183. data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
  184. data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
  185. data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
  186. data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
  187. data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
  188. data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
  189. data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
  190. data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
  191. data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
  192. data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
  193. data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
  194. data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
  195. data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
  196. data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
  197. data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
  198. data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
  199. data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
  200. data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
  201. data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
  202. data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
  203. data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
  204. data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
  205. data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
  206. data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
  207. data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
  208. data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
  209. data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
  210. data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 // j = +32 (about 2^(32/64)); reached as the -j entry when j = -32
  211. .align 32
  212. .global cosh#
  213. .section .text
  214. .proc cosh#
  215. .align 32
  216. cosh:
  217. // Entry of double cosh(double): argument and result are both in f8. First test: X NAN?
  218. { .mfi
  219. alloc r32 = ar.pfs,0,12,4,0 // 0 inputs, 12 locals (r32-r43), 4 outputs (r44-r47)
  220. (p0) fclass.m.unc p6,p7 = f8, 0xc3 //@snan | @qnan
  221. nop.i 999
  222. }
  223. ;;
  224. { .mfb
  225. nop.m 999
  226. (p6) fma.d.s0 f8 = f8,f1,f8 // NaN input: return x*1 + x computed in s0
  227. (p6) br.ret.spnt b0 ;;
  228. }
  229. // X infinity: return the input with the sign of f0 (fmerge.s f0,f8)
  230. { .mfi
  231. nop.m 999
  232. (p0) fclass.m.unc p6,p0 = f8, 0x23 //@inf
  233. nop.i 999 ;;
  234. }
  235. { .mfb
  236. nop.m 999
  237. (p6) fmerge.s f8 = f0,f8
  238. (p6) br.ret.spnt b0 ;;
  239. }
  240. // Put 0.25 in f9 (0xfffd is the biased exponent of 2^-2); p7 true if |x| >= 0.25
  241. { .mlx
  242. nop.m 999
  243. (p0) movl r32 = 0x000000000000fffd ;;
  244. }
  245. { .mfi
  246. (p0) setf.exp f9 = r32
  247. nop.f 999
  248. nop.i 999 ;;
  249. }
  250. { .mfi
  251. nop.m 999
  252. (p0) fmerge.s cosh_FR_X = f0,f8 // cosh_FR_X = |x|; every later path works on |x|
  253. nop.i 999
  254. }
  255. { .mfi
  256. nop.m 999
  257. (p0) fmerge.s cosh_FR_SGNX = f8,f1 // sign of x applied to 1.0; not read in the code visible in this listing
  258. nop.i 999 ;;
  259. }
  260. { .mfi
  261. nop.m 999
  262. (p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9 // only the complement predicate p7 is kept
  263. nop.i 999 ;;
  264. }
  265. { .mib
  266. nop.m 999
  267. nop.i 999
  268. (p7) br.cond.sptk COSH_BY_TBL // |x| >= 0.25; otherwise fall through to the polynomial path
  269. }
  270. ;;
  271. // COSH_BY_POLY: |x| < 0.25, result = 1 + x2*p_odd + p_even with x2 = |x|^2, x4 = |x|^4
  272. // POLY cannot overflow so there is no need to call __libm_error_support
  273. // Get the values of P_x from the table (P1..P6 in double_cosh_p_table)
  274. { .mmi
  275. nop.m 999
  276. (p0) addl r34 = @ltoff(double_cosh_p_table), gp // r34 = address of the linkage-table slot for the table
  277. nop.i 999
  278. }
  279. ;;
  280. { .mmi
  281. ld8 r34 = [r34] // r34 = address of double_cosh_p_table
  282. nop.m 999
  283. nop.i 999
  284. }
  285. ;;
  286. // Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax
  287. { .mmf
  288. nop.m 999
  289. (p0) ldfe cosh_FR_P1 = [r34],16
  290. (p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;;
  291. }
  292. { .mmi
  293. (p0) ldfe cosh_FR_P2 = [r34],16 ;;
  294. (p0) ldfe cosh_FR_P3 = [r34],16
  295. nop.i 999 ;;
  296. }
  297. { .mmi
  298. (p0) ldfe cosh_FR_P4 = [r34],16 ;;
  299. (p0) ldfe cosh_FR_P5 = [r34],16
  300. nop.i 999 ;;
  301. }
  302. { .mfi
  303. (p0) ldfe cosh_FR_P6 = [r34],16
  304. (p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0
  305. nop.i 999 ;;
  306. }
  307. // Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1
  308. { .mfi
  309. nop.m 999
  310. (p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3
  311. nop.i 999 ;;
  312. }
  313. { .mfi
  314. nop.m 999
  315. (p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1
  316. nop.i 999
  317. }
  318. // Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2)
  319. { .mfi
  320. nop.m 999
  321. (p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4 // writes f11 in the same instruction group as its read just above
  322. nop.i 999 ;;
  323. }
  324. { .mfi
  325. nop.m 999
  326. (p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2
  327. nop.i 999 ;;
  328. }
  329. { .mfi
  330. nop.m 999
  331. (p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0
  332. nop.i 999 ;;
  333. }
  334. // Y_lo = x2*p_odd + p_even
  335. // Calculate f8 = Y_hi + Y_lo, where Y_hi = 1 is formed as f1*f1 in the final fma
  336. { .mfi
  337. nop.m 999
  338. (p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven
  339. nop.i 999 ;;
  340. }
  341. { .mfb
  342. nop.m 999
  343. (p0) fma.d.s0 f8 = f1, f1, cosh_FR_Y_lo // only this last operation uses double precision and status field s0
  344. (p0) br.ret.sptk b0 ;;
  345. }
  346. COSH_BY_TBL:
  347. // Now that we are at TBL; so far all we know is that |x| >= 0.25.
  348. // The first two steps are the same for TBL and EXP, but if we are HUGE
  349. // we want to leave now. This is the double version:
  350. // Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true).
  351. // Other thresholds mentioned by the original comments, not used by this code:
  352. // single: |x| >= 2^7, 10006 (register-biased) is e = 7 (true);
  353. // 2^14: 1000d (register-biased) is e = 14 (true).
  354. // The constant loaded below is 0x10009, i.e. the 2^10 threshold.
  355. { .mlx
  356. nop.m 999
  357. (p0) movl r32 = 0x0000000000010009 ;;
  358. }
  359. { .mfi
  360. (p0) setf.exp f9 = r32
  361. nop.f 999
  362. nop.i 999 ;;
  363. }
  364. { .mfi
  365. nop.m 999
  366. (p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9
  367. nop.i 999 ;;
  368. }
  369. { .mib
  370. nop.m 999
  371. nop.i 999
  372. (p6) br.cond.spnt COSH_HUGE ;;
  373. }
  374. // r32 = 1
  375. // r34 = N-1
  376. // r35 = N
  377. // r36 = j
  378. // r37 = N+1 (then overwritten with the address of double_cosh_j_table before being read)
  379. // TBL can never overflow
  380. // cosh(x) = cosh(B+R)
  381. // = cosh(B) cosh(R) + sinh(B) sinh(R)
  382. // cosh(R) can be approximated by 1 + p_even
  383. // sinh(R) can be approximated by p_odd
  384. // ******************************************************
  385. // STEP 1 (TBL and EXP)
  386. // ******************************************************
  387. // Get the following constants.
  388. // f9 = Inv_log2by64
  389. // f10 = log2by64_hi
  390. // f11 = log2by64_lo
  391. { .mmi
  392. (p0) adds r32 = 0x1,r0 // r32 = 1, used further down to form N-1
  393. (p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp
  394. nop.i 999
  395. }
  396. ;;
  397. // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
  398. // put them in an exponent.
  399. // cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1)
  400. // r39 = 0xffff + (N-1) = 0xffff +N -1
  401. // r40 = 0xffff - (N +1) = 0xffff -N -1
  402. { .mlx
  403. ld8 r34 = [r34]
  404. (p0) movl r38 = 0x000000000000fffe ;; // r38 = 0xffff - 1, the common part of both biased exponents
  405. }
  406. { .mmi
  407. (p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;;
  408. (p0) ldfe cosh_FR_log2by64_hi = [r34],16
  409. nop.i 999 ;;
  410. }
  411. { .mbb
  412. (p0) ldfe cosh_FR_log2by64_lo = [r34],16
  413. nop.b 999
  414. nop.b 999 ;;
  415. }
  416. // Get the A coefficients. Each ldfe below overwrites one reduction constant and sits
  417. // f9 = A_1 (in the same instruction group as the last read of that constant;
  418. // f10 = A_2 the loads must therefore stay where they are relative to the
  419. // f11 = A_3 fma/fnma instructions that consume f9, f10 and f11)
  420. { .mmi
  421. nop.m 999
  422. (p0) addl r34 = @ltoff(double_cosh_ab_table), gp
  423. nop.i 999
  424. }
  425. ;;
  426. { .mmi
  427. ld8 r34 = [r34]
  428. nop.m 999
  429. nop.i 999
  430. }
  431. ;;
  432. // Calculate M and keep it as integer and floating point.
  433. // M = round-to-integer(x*Inv_log2by64)
  434. // cosh_FR_M = M, converted with fcvt.fx (not fcvt.fx.trunc) from ax*Inv_log2by64
  435. // Put the significand of M in r35
  436. // and the floating point representation of M in cosh_FR_M
  437. { .mfi
  438. nop.m 999
  439. (p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0 // last read of f9 as Inv_log2by64
  440. nop.i 999
  441. }
  442. { .mfi
  443. (p0) ldfe cosh_FR_A1 = [r34],16
  444. nop.f 999
  445. nop.i 999 ;;
  446. }
  447. { .mfi
  448. nop.m 999
  449. (p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M
  450. nop.i 999 ;;
  451. }
  452. { .mfi
  453. nop.m 999
  454. (p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp // integer in the significand -> floating-point value of M
  455. nop.i 999 ;;
  456. }
  457. { .mfi
  458. (p0) getf.sig r35 = cosh_FR_M_temp
  459. nop.f 999
  460. nop.i 999 ;;
  461. }
  462. // M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
  463. // has a range of -32 thru 31.
  464. // r35 = M
  465. // r36 = j
  466. { .mii
  467. nop.m 999
  468. nop.i 999 ;;
  469. (p0) and r36 = 0x3f, r35 ;; // low six bits of M, not yet sign-extended
  470. }
  471. // Calculate R
  472. // f13 = f44 - f38*f10 = ax - M*log2by64_hi
  473. // f14 = f13 - f38*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo
  474. { .mfi
  475. nop.m 999
  476. (p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X // last read of f10 as log2by64_hi
  477. nop.i 999
  478. }
  479. { .mfi
  480. (p0) ldfe cosh_FR_A2 = [r34],16
  481. nop.f 999
  482. nop.i 999 ;;
  483. }
  484. { .mfi
  485. nop.m 999
  486. (p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp // last read of f11 as log2by64_lo
  487. nop.i 999
  488. }
  489. // Get the B coefficients
  490. // f15 = B_1
  491. // f32 = B_2
  492. // f33 = B_3
  493. { .mmi
  494. (p0) ldfe cosh_FR_A3 = [r34],16 ;;
  495. (p0) ldfe cosh_FR_B1 = [r34],16
  496. nop.i 999 ;;
  497. }
  498. { .mmi
  499. (p0) ldfe cosh_FR_B2 = [r34],16 ;;
  500. (p0) ldfe cosh_FR_B3 = [r34],16
  501. nop.i 999 ;;
  502. }
  503. { .mii
  504. nop.m 999
  505. (p0) shl r34 = r36, 0x2 ;; // move bit 5 of the six-bit value up to bit 7
  506. (p0) sxt1 r37 = r34 ;; // sign-extend from the low byte
  507. }
  508. // ******************************************************
  509. // STEP 2 (TBL and EXP)
  510. // ******************************************************
  511. // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
  512. // f12 = R*R*R
  513. // f13 = R*R
  514. // f14 = R <== from above
  515. { .mfi
  516. nop.m 999
  517. (p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0
  518. (p0) shr r36 = r37, 0x2 ;; // signed shift back: r36 = j in -32..31
  519. }
  520. // r34 = M-j = r35 - r36
  521. // r35 = N = (M-j)/64
  522. { .mii
  523. (p0) sub r34 = r35, r36
  524. nop.i 999 ;;
  525. (p0) shr r35 = r34, 0x6 ;;
  526. }
  527. { .mii
  528. (p0) sub r40 = r38, r35 // r40 = 0xfffe - N
  529. (p0) adds r37 = 0x1, r35 // r37 = N+1; replaced by the j-table address below before any read
  530. (p0) add r39 = r38, r35 ;; // r39 = 0xfffe + N
  531. }
  532. // Get the address of the J table, add the offset,
  533. // addresses are AD_mJ and AD_J, get the T value
  534. // f32 = T(j)_hi
  535. // f33 = T(j)_lo
  536. // f34 = T(-j)_hi
  537. // f35 = T(-j)_lo
  538. { .mmi
  539. (p0) sub r34 = r35, r32 // r34 = N-1, consumed by the overflow check in COSH_BY_EXP
  540. (p0) addl r37 = @ltoff(double_cosh_j_table), gp
  541. nop.i 999
  542. }
  543. ;;
  544. { .mfi
  545. ld8 r37 = [r37] // r37 = address of double_cosh_j_table
  546. (p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0
  547. nop.i 999
  548. }
  549. // ******************************************************
  550. // STEP 3 Now decide if we need to branch to EXP
  551. // ******************************************************
  552. // Put 32 in f9 (0x10004 is the biased exponent of 2^5); p6 true if |x| < 32 (compare is further down)
  553. { .mlx
  554. nop.m 999
  555. (p0) movl r32 = 0x0000000000010004 ;;
  556. }
  557. // Calculate p_even
  558. // f34 = B_2 + Rsq *B_3
  559. // f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
  560. // f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
  561. { .mfi
  562. nop.m 999
  563. (p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2
  564. nop.i 999 ;;
  565. }
  566. { .mfi
  567. nop.m 999
  568. (p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1 // last read of f34 as peven_temp1
  569. nop.i 999
  570. }
  571. // Calculate p_odd
  572. // f34 = A_2 + Rsq *A_3
  573. // f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
  574. // f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
  575. { .mfi
  576. nop.m 999
  577. (p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2 // rewrites f34 in the same group as the read above
  578. nop.i 999 ;;
  579. }
  580. { .mfi
  581. (p0) setf.exp cosh_FR_N_temp1 = r39 // f11 (A3 no longer needed) = value with exponent 0xfffe + N
  582. nop.f 999
  583. nop.i 999 ;;
  584. }
  585. { .mfi
  586. nop.m 999
  587. (p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0 // last read of f35 as peven_temp2
  588. nop.i 999
  589. }
  590. { .mfi
  591. nop.m 999
  592. (p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1 // rewrites f35 in the same group
  593. nop.i 999 ;;
  594. }
  595. { .mfi
  596. (p0) setf.exp f9 = r32 // f9 = 32.0; A1 in f9 was last read in the previous group
  597. nop.f 999
  598. nop.i 999 ;;
  599. }
  600. { .mfi
  601. nop.m 999
  602. (p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R // last read of Rcub (f12) and R (f14)
  603. nop.i 999
  604. }
  605. // GR_mJ contains the table offset for -j
  606. // GR_J contains the table offset for +j
  607. // Index is 32 -/+ j; it is shifted left by 5 because each table entry is 32 bytes
  608. { .mlx
  609. (p0) setf.exp cosh_FR_N_temp2 = r40 // reads r40 = 0xfffe - N; r40 is reloaded with 0x20 in this same bundle
  610. (p0) movl r40 = 0x0000000000000020 ;;
  611. }
  612. { .mfi
  613. (p0) sub GR_mJ = r40, r36 // 32 - j
  614. (p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1 // sign/exponent from N_temp1, significand of 1.0 => 2^(N-1)
  615. (p0) adds GR_J = 0x20, r36 ;; // 32 + j
  616. }
  617. { .mii
  618. nop.m 999
  619. (p0) shl GR_mJ = GR_mJ, 5 ;;
  620. (p0) add AD_mJ = r37, GR_mJ ;;
  621. }
  622. { .mmi
  623. nop.m 999
  624. (p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16
  625. (p0) shl GR_J = GR_J, 5 ;;
  626. }
  627. { .mfi
  628. (p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16 // low part is stored as single-precision bits at offset 16 of the entry
  629. (p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9 // p6 = (|x| < 32), p7 = (|x| >= 32)
  630. (p0) add AD_J = r37, GR_J ;;
  631. }
  632. { .mmi
  633. (p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;;
  634. (p0) ldfs cosh_FR_Tjlo = [AD_J],16
  635. nop.i 999 ;;
  636. }
  637. { .mfb
  638. nop.m 999
  639. (p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1 // 2^(-N-1)
  640. (p7) br.cond.spnt COSH_BY_EXP ;; // |x| >= 32
  641. }
  642. // ******************************************************
  643. // If NOT branch to EXP
  644. // ******************************************************
  645. // Calculate C_hi
  646. // ******************************************************
  647. // cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi
  648. // cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi)
  649. { .mfi
  650. nop.m 999
  651. (p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0
  652. nop.i 999 ;;
  653. }
  654. { .mfi
  655. nop.m 999
  656. (p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp
  657. nop.i 999
  658. }
  659. // ******************************************************
  660. // Calculate S_hi
  661. // ******************************************************
  662. // cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi
  663. // cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_S_hi_temp1
  664. { .mfi
  665. nop.m 999
  666. (p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0 // same register (f10) and same product as C_hi_temp
  667. nop.i 999 ;;
  668. }
  669. // ******************************************************
  670. // Calculate C_lo
  671. // ******************************************************
  672. // cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi
  673. // cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjhi + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi)
  674. // cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo
  675. // cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo)
  676. // cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2
  677. { .mfi
  678. nop.m 999
  679. (p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi
  680. nop.i 999
  681. }
  682. { .mfi
  683. nop.m 999
  684. (p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1
  685. nop.i 999 ;;
  686. }
  687. { .mfi
  688. nop.m 999
  689. (p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1 // last read of the first C_lo_temp1 value
  690. nop.i 999
  691. }
  692. { .mfi
  693. nop.m 999
  694. (p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, cosh_FR_Tmjlo, f0 // C_lo_temp1 (f11) reused in the same group
  695. nop.i 999 ;;
  696. }
  697. { .mfi
  698. nop.m 999
  699. (p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1
  700. nop.i 999 ;;
  701. }
  702. { .mfi
  703. nop.m 999
  704. (p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2
  705. nop.i 999 ;;
  706. }
  707. // ******************************************************
  708. // cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo
  709. // cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp
  710. // cosh_FR_COSH = Y_hi + Y_lo (Y_hi is C_hi; the sum is written straight to f8)
  711. { .mfi
  712. nop.m 999
  713. (p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo
  714. nop.i 999 ;;
  715. }
  716. { .mfi
  717. nop.m 999
  718. (p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp
  719. nop.i 999 ;;
  720. }
  721. { .mfb
  722. nop.m 999
  723. (p0) fma.d.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo // only the final sum uses double precision and status field s0
  724. (p0) br.ret.sptk b0 ;;
  725. }
  726. COSH_BY_EXP:
  727. // When p7 is true, we know that an overflow is not going to happen
  728. // When p7 is false, we must check for possible overflow
  729. // p7 is the over_SAFE flag (recomputed below from N-1; its value on entry is not used)
  730. // f44 = Scale * (Y_hi + Y_lo)
  731. // = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo)
  732. { .mfi
  733. nop.m 999
  734. (p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd // p_even + p_odd
  735. nop.i 999
  736. }
  737. // Now we are in EXP. This is the only path where an overflow is possible
  738. // but not for certain. So this is the only path where over_SAFE has any use.
  739. // r34 still has N-1
  740. // There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
  741. // There is a danger of double overflow if N-1 > 0x3fe = 1022
  742. { .mlx
  743. nop.m 999
  744. (p0) movl r32 = 0x00000000000003fe ;;
  745. }
  746. { .mfi
  747. (p0) cmp.gt.unc p0,p7 = r34, r32 // p7 = (N-1 <= 0x3fe): result cannot overflow a double
  748. nop.f 999
  749. nop.i 999 ;;
  750. }
  751. { .mfi
  752. nop.m 999
  753. (p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo // Tjhi*(p_even + p_odd) + Tjlo
  754. nop.i 999 ;;
  755. }
  756. { .mfi
  757. nop.m 999
  758. (p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi // unscaled result; the sneg/T(-j) terms of the TBL path are not added here
  759. nop.i 999 ;;
  760. }
  761. { .mfi
  762. nop.m 999
  763. (p0) fma.d.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0 // scale by 2^(N-1), rounding to double in s0
  764. nop.i 999 ;;
  765. }
  766. // If over_SAFE is set, return
  767. { .mfb
  768. nop.m 999
  769. (p7) fmerge.s f8 = f44,f44
  770. (p7) br.ret.sptk b0 ;;
  771. }
  772. // Else see if we overflowed
  773. // S0 user supplied status
  774. // S2 user supplied status + WRE + TD (Overflows)
  775. // If WRE is set then an overflow will not occur in EXP.
  776. // The input value that would cause a register (WRE) value to overflow is about 2^15
  777. // and this input would go into the HUGE path.
  778. // Answer with WRE is in f43.
  779. { .mfi
  780. nop.m 999
  781. (p0) fsetc.s2 0x7F,0x42 // s2 controls: keep all bits (mask 0x7F) and OR in 0x42 (WRE + TD per the note above)
  782. nop.i 999;;
  783. }
  784. { .mfi
  785. nop.m 999
  786. (p0) fma.d.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0 // same product as f44, recomputed under s2
  787. nop.i 999 ;;
  788. }
  789. // 103FF => 103FF -FFFF = 400(true)
  790. // 400 + 3FF = 7FF, which is 1 more than the exponent of the largest
  791. // double (7FE). So 0 103FF 8000000000000000 is one ulp more than
  792. // largest double in register bias
  793. // Now set p8 if the answer with WRE is greater than or equal this value
  794. // Also set p9 if the answer with WRE is less than or equal to negative this value
  795. { .mlx
  796. nop.m 999
  797. (p0) movl r32 = 0x00000000000103ff ;;
  798. }
  799. { .mmf
  800. nop.m 999
  801. (p0) setf.exp f41 = r32 // f41 = overflow threshold described above
  802. (p0) fsetc.s2 0x7F,0x40 ;; // s2 controls again, this time OR-ing in 0x40 only
  803. }
  804. { .mfi
  805. nop.m 999
  806. (p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
  807. nop.i 999
  808. }
  809. { .mfi
  810. nop.m 999
  811. (p0) fmerge.ns f42 = f41, f41 // f42 = -f41
  812. nop.i 999 ;;
  813. }
  814. // The error tag for overflow is 64 (placed in r47, the fourth output register of the alloc frame)
  815. { .mii
  816. nop.m 999
  817. nop.i 999 ;;
  818. (p8) mov r47 = 64 ;;
  819. }
  820. { .mfb
  821. nop.m 999
  822. (p0) fcmp.le.unc.s1 p9, p0 = f43, f42
  823. (p8) br.cond.spnt __libm_error_region ;;
  824. }
  825. { .mii
  826. nop.m 999
  827. nop.i 999 ;;
  828. (p9) mov r47 = 64
  829. }
  830. { .mib
  831. nop.m 999
  832. nop.i 999
  833. (p9) br.cond.spnt __libm_error_region ;;
  834. }
  835. { .mfb
  836. nop.m 999
  837. (p0) fmerge.s f8 = f44,f44 // no overflow detected: return the s0 result
  838. (p0) br.ret.sptk b0 ;;
  839. }
  840. // for COSH_HUGE, put 24000 in exponent (0x15dbf - 0xffff = 0x5dc0 = 24000); add 1; multiply in double precision
  841. // SAFE: SAFE is always 0 for HUGE. No branch ends this path: execution continues into __libm_error_region below.
  842. COSH_HUGE:
  843. { .mlx
  844. nop.m 999
  845. (p0) movl r32 = 0x0000000000015dbf ;;
  846. }
  847. { .mfi
  848. (p0) setf.exp f9 = r32 // f9 = 2^24000
  849. nop.f 999
  850. nop.i 999 ;;
  851. }
  852. { .mfi
  853. nop.m 999
  854. (p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1 // f9 + 1
  855. nop.i 999 ;;
  856. }
  857. { .mfi
  858. nop.m 999
  859. (p0) fma.d.s0 f44 = f9, cosh_FR_hi_lo, f0 // product far outside the double range, computed in s0 (presumably to raise overflow there)
  860. (p0) mov r47 = 64 // same error tag as the EXP overflow case
  861. }
  862. ;;
  863. .endp cosh#
  864. // Stack operations when calling error support.
  865. // (1) (2) (3) (call) (4)
  866. // sp -> + psp -> + psp -> + sp -> +
  867. // | | | |
  868. // | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
  869. // | | | |
  870. // | <-GR_Y Y2->| Y2 ->| <- GR_Y |
  871. // | | | |
  872. // | | <- GR_X X1 ->| |
  873. // | | | |
  874. // sp-64 -> + sp -> + sp -> + +
  875. // save ar.pfs save b0 restore gp
  876. // save gp restore ar.pfs
  877. .proc __libm_error_region // overflow exit: stores f8, a zero and f44 on a 64-byte frame, calls __libm_error_support, returns the value left in the result slot
  878. __libm_error_region: // reached by branch from COSH_BY_EXP and by fall-through from COSH_HUGE, with r47 = 64
  879. .prologue
  880. // (1)
  881. { .mfi
  882. add GR_Parameter_Y=-32,sp // Parameter 2 value
  883. nop.f 0
  884. .save ar.pfs,GR_SAVE_PFS
  885. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  886. }
  887. { .mfi
  888. .fframe 64
  889. add sp=-64,sp // Create new stack
  890. nop.f 0
  891. mov GR_SAVE_GP=gp // Save gp
  892. };;
  893. // (2)
  894. { .mmi
  895. stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack (zero at new sp+32); pointer advances to sp+48
  896. add GR_Parameter_X = 16,sp // Parameter 1 address
  897. .save b0, GR_SAVE_B0
  898. mov GR_SAVE_B0=b0 // Save b0
  899. };;
  900. .body
  901. // (3)
  902. { .mib
  903. stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack (f8 still holds the original argument x)
  904. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
  905. nop.b 0
  906. }
  907. { .mib
  908. stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack (the overflowed result, at sp+48)
  909. add GR_Parameter_Y = -16,GR_Parameter_Y // back to the Parameter 2 address (sp+32)
  910. br.call.sptk b0=__libm_error_support# // Call error handling function
  911. };;
  912. { .mmi
  913. nop.m 0
  914. nop.m 0
  915. add GR_Parameter_RESULT = 48,sp // recompute the result address: the register may have been overwritten by the call
  916. };;
  917. // (4)
  918. { .mmi
  919. ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
  920. .restore
  921. add sp = 64,sp // Restore stack pointer
  922. mov b0 = GR_SAVE_B0 // Restore return address
  923. };;
  924. { .mib
  925. mov gp = GR_SAVE_GP // Restore gp
  926. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  927. br.ret.sptk b0 // Return
  928. };;
  929. .endp __libm_error_region
  930. .type __libm_error_support#,@function // external handler called above; defined outside this file
  931. .global __libm_error_support#