Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1112 lines
30 KiB

  1. .file "coshf.s"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 2/02/00 Initial version
  29. // 2/16/00 The error tag for coshf overflow changed to 65 (from 64).
  30. // 4/04/00 Unwind support added
  31. // 8/15/00 Bundle added after call to __libm_error_support to properly
  32. // set [the previously overwritten] GR_Parameter_RESULT.
  33. //
  34. // API
  35. //==============================================================
  36. // float = coshf(float)
  37. // input floating point f8
  38. // output floating point f8
  39. // Overview of operation
  40. //==============================================================
  41. // There are four paths
  42. // 1. |x| < 0.25 COSH_BY_POLY
  43. // 2. |x| < 32 COSH_BY_TBL
  44. // 3. |x| < 2^7 COSH_BY_EXP
  45. // 4. |x| >= 2^7 COSH_HUGE
  46. // For paths 1 and 2, SAFE is always 1.
  47. // For path 4, SAFE is always 0.
  48. // SAFE = 1 means we cannot overflow.
  49. // Assembly macros: symbolic register names (names that share a register are used at different points of the code)
  50. //==============================================================
  51. coshf_FR_X = f44 // |x| (input with the sign cleared)
  52. coshf_FR_SGNX = f40 // sign of x on magnitude 1.0; written once, not read in this file
  53. coshf_FR_Inv_log2by64 = f9 // 1st entry of single_coshf_arg_reduction
  54. coshf_FR_log2by64_lo = f11 // 3rd entry of single_coshf_arg_reduction
  55. coshf_FR_log2by64_hi = f10 // 2nd entry of single_coshf_arg_reduction
  56. coshf_FR_A1 = f9 // A1..A3: first three entries of single_coshf_ab_table (used for p_odd)
  57. coshf_FR_A2 = f10
  58. coshf_FR_A3 = f11
  59. coshf_FR_Rcub = f12 // R*R*R
  60. coshf_FR_M_temp = f13 // M as an integer (result of fcvt.fx)
  61. coshf_FR_R_temp = f13 // |x| - M*log2by64_hi
  62. coshf_FR_Rsq = f13 // R*R
  63. coshf_FR_R = f14 // reduced argument R
  64. coshf_FR_M = f38 // M as a floating-point value
  65. coshf_FR_B1 = f15 // B1..B3: last three entries of single_coshf_ab_table (used for p_even)
  66. coshf_FR_B2 = f32
  67. coshf_FR_B3 = f33
  68. coshf_FR_peven_temp1 = f34
  69. coshf_FR_peven_temp2 = f35
  70. coshf_FR_peven = f36 // even polynomial part (both the POLY and the TBL/EXP paths)
  71. coshf_FR_podd_temp1 = f34
  72. coshf_FR_podd_temp2 = f35
  73. coshf_FR_podd = f37 // odd polynomial part (both the POLY and the TBL/EXP paths)
  74. coshf_FR_J_temp = f9 // not referenced in the code below
  75. coshf_FR_J = f10 // not referenced in the code below
  76. coshf_FR_Mmj = f39 // not referenced in the code below
  77. coshf_FR_N_temp1 = f11 // receives biased exponent 0xfffe + N via setf.exp
  78. coshf_FR_N_temp2 = f12 // receives biased exponent 0xfffe - N via setf.exp
  79. coshf_FR_N = f13 // not referenced in the code below
  80. coshf_FR_spos = f14 // 2^(N-1)
  81. coshf_FR_sneg = f15 // 2^(-N-1)
  82. coshf_FR_Tjhi = f32 // table entry for +j: high part (loaded with ldfe)
  83. coshf_FR_Tjlo = f33 // table entry for +j: low part (loaded with ldfs)
  84. coshf_FR_Tmjhi = f34 // table entry for -j: high part (loaded with ldfe)
  85. coshf_FR_Tmjlo = f35 // table entry for -j: low part (loaded with ldfs)
  86. GR_mJ = r35 // byte offset of the -j entry in single_coshf_j_table
  87. GR_J = r36 // byte offset of the +j entry in single_coshf_j_table
  88. AD_mJ = r38 // address of the -j entry
  89. AD_J = r39 // address of the +j entry
  90. GR_SAVE_B0 = r42 // b0 saved across the call to __libm_error_support
  91. GR_SAVE_PFS = r41 // ar.pfs saved across the call to __libm_error_support
  92. GR_SAVE_GP = r43 // gp saved across the call to __libm_error_support
  93. GR_Parameter_X = r44 // r44..r47 are the four output registers declared by alloc
  94. GR_Parameter_Y = r45
  95. GR_Parameter_RESULT = r46
  96. GR_Parameter_TAG = r47 // error tag passed to __libm_error_support
  97. FR_X = f8 // original argument, stored as parameter 1 for the error handler
  98. FR_Y = f0 // parameter 2 slot is filled from f0
  99. FR_RESULT = f44 // value stored as parameter 3 (result) for the error handler; same register as coshf_FR_X
  100. coshf_FR_C_hi = f9
  101. coshf_FR_C_hi_temp = f10
  102. coshf_FR_C_lo_temp1 = f11
  103. coshf_FR_C_lo_temp2 = f12
  104. coshf_FR_C_lo_temp3 = f13
  105. coshf_FR_C_lo = f38
  106. coshf_FR_S_hi = f39
  107. coshf_FR_S_hi_temp1 = f10 // same register as coshf_FR_C_hi_temp
  108. coshf_FR_Y_hi = f11 // not referenced in the code below
  109. coshf_FR_Y_lo_temp = f12
  110. coshf_FR_Y_lo = f13
  111. coshf_FR_COSH = f9 // not referenced in the code below
  112. coshf_FR_X2 = f9 // |x|^2 in the POLY path
  113. coshf_FR_X4 = f10 // |x|^4 in the POLY path
  114. coshf_FR_P1 = f14 // P1..P6: the six entries of single_coshf_p_table
  115. coshf_FR_P2 = f15
  116. coshf_FR_P3 = f32
  117. coshf_FR_P4 = f33
  118. coshf_FR_P5 = f34
  119. coshf_FR_P6 = f35
  120. coshf_FR_TINY_THRESH = f9 // not referenced in the code below
  121. coshf_FR_COSH_temp = f10 // unscaled result in the EXP path
  122. coshf_FR_SCALE = f11 // not referenced in the code below
  123. coshf_FR_hi_lo = f10 // 2^24000 + 1 in the HUGE path
  124. coshf_FR_poly_podd_temp1 = f11
  125. coshf_FR_poly_podd_temp2 = f13 // not referenced in the code below
  126. coshf_FR_poly_peven_temp1 = f11
  127. coshf_FR_poly_peven_temp2 = f13
  128. // Data tables (16-byte entries read with ldfe: 64-bit significand first, then the sign/exponent word)
  129. //==============================================================
  130. .data
  131. .align 16
  132. single_coshf_arg_reduction: // loaded in order as Inv_log2by64 (~64/ln 2), log2by64_hi, log2by64_lo (hi + lo ~ ln(2)/64)
  133. data8 0xB8AA3B295C17F0BC, 0x00004005
  134. data8 0xB17217F7D1000000, 0x00003FF8
  135. data8 0xCF79ABC9E3B39804, 0x00003FD0
  136. single_coshf_p_table: // P1..P6 for the |x| < 0.25 polynomial; P1 = 1/2, P2 ~ 1/24
  137. data8 0x8000000000000000, 0x00003FFE
  138. data8 0xAAAAAAAAAAAAAB80, 0x00003FFA
  139. data8 0xB60B60B60B4FE884, 0x00003FF5
  140. data8 0xD00D00D1021D7370, 0x00003FEF
  141. data8 0x93F27740C0C2F1CC, 0x00003FE9
  142. data8 0x8FA02AC65BCBD5BC, 0x00003FE2
  143. single_coshf_ab_table: // A1..A3 (A1 ~ 1/6) followed by B1..B3 (B1 ~ 1/2), used by the TBL and EXP paths
  144. data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
  145. data8 0x88888888884ECDD5, 0x00003FF8
  146. data8 0xD00D0C6DCC26A86B, 0x00003FF2
  147. data8 0x8000000000000002, 0x00003FFE
  148. data8 0xAAAAAAAAAA402C77, 0x00003FFA
  149. data8 0xB60B6CC96BDB144D, 0x00003FF5
  150. single_coshf_j_table: // 65 entries of 32 bytes, indexed by j+32: T_hi (ldfe) at +0, T_lo (single precision, ldfs) at +16
  151. data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000 // index 0 (j = -32): T_hi = 2^(-1/2)
  152. data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
  153. data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
  154. data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
  155. data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
  156. data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
  157. data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
  158. data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
  159. data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
  160. data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
  161. data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
  162. data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
  163. data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
  164. data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
  165. data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
  166. data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
  167. data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
  168. data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
  169. data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
  170. data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
  171. data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
  172. data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
  173. data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
  174. data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
  175. data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
  176. data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
  177. data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
  178. data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
  179. data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
  180. data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
  181. data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
  182. data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
  183. data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000 // index 32 (j = 0): T_hi = 1.0, T_lo = 0
  184. data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
  185. data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
  186. data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
  187. data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
  188. data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
  189. data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
  190. data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
  191. data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
  192. data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
  193. data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
  194. data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
  195. data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
  196. data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
  197. data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
  198. data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
  199. data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
  200. data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
  201. data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
  202. data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
  203. data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
  204. data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
  205. data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
  206. data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
  207. data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
  208. data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
  209. data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
  210. data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
  211. data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
  212. data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
  213. data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
  214. data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
  215. data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000 // index 64 (j = +32): T_hi = 2^(1/2)
  216. .align 32
  217. .global coshf#
  218. .section .text
  219. .proc coshf#
  220. .align 32
  221. coshf:
  222. // Entry: x arrives in f8 and the result is returned in f8. First check: is x a NaN? (p6 set if so)
  223. { .mfi
  224. alloc r32 = ar.pfs,0,12,4,0
  225. (p0) fclass.m.unc p6,p7 = f8, 0xc3
  226. nop.i 999 ;;
  227. }
  228. { .mfb
  229. nop.m 999
  230. (p6) fma.s.s0 f8 = f8,f1,f8 // NaN input: return x*1 + x
  231. (p6) br.ret.spnt b0 ;;
  232. }
  233. { .mfi
  234. nop.m 999
  235. nop.f 999
  236. nop.i 999 ;;
  237. }
  238. // Is x +/- infinity? (p6 set if so)
  239. { .mfi
  240. nop.m 999
  241. (p0) fclass.m.unc p6,p0 = f8, 0x23
  242. nop.i 999 ;;
  243. }
  244. { .mfb
  245. nop.m 999
  246. (p6) fmerge.s f8 = f0,f8 // infinite input: return x with the sign of f0, i.e. +infinity
  247. (p6) br.ret.spnt b0 ;;
  248. }
  249. // Put 0.25 in f9 (0xfffd is the biased exponent of 2^-2); p7 true if |x| >= 0.25
  250. { .mlx
  251. nop.m 999
  252. (p0) movl r32 = 0x000000000000fffd ;; // r32 becomes scratch; the ar.pfs copy written by alloc is overwritten
  253. }
  254. { .mfi
  255. (p0) setf.exp f9 = r32
  256. nop.f 999
  257. nop.i 999 ;;
  258. }
  259. { .mfi
  260. nop.m 999
  261. (p0) fmerge.s coshf_FR_X = f0,f8 // coshf_FR_X = |x|
  262. nop.i 999
  263. }
  264. { .mfi
  265. nop.m 999
  266. (p0) fmerge.s coshf_FR_SGNX = f8,f1 // sign of x on magnitude 1.0; not read again in this file
  267. nop.i 999 ;;
  268. }
  269. { .mfi
  270. nop.m 999
  271. (p0) fcmp.lt.unc p0,p7 = coshf_FR_X,f9
  272. nop.i 999 ;;
  273. }
  274. { .mib
  275. nop.m 999
  276. nop.i 999
  277. (p7) br.cond.sptk COSH_BY_TBL ;;
  278. }
  279. // COSH_BY_POLY: reached by fall-through when |x| < 0.25
  280. // POLY cannot overflow so there is no need to call __libm_error_support
  281. // Get the address of single_coshf_p_table (through its @ltoff entry relative to gp)
  282. { .mmi
  283. nop.m 999
  284. (p0) addl r34 = @ltoff(single_coshf_p_table), gp
  285. nop.i 999
  286. }
  287. ;;
  288. { .mmi
  289. ld8 r34 = [r34]
  290. nop.m 999
  291. nop.i 999
  292. }
  293. ;;
  294. // Load P1..P6 while calculating coshf_FR_X2 = ax*ax and coshf_FR_X4 = ax*ax*ax*ax
  295. { .mmf
  296. nop.m 999
  297. (p0) ldfe coshf_FR_P1 = [r34],16
  298. (p0) fma.s1 coshf_FR_X2 = coshf_FR_X, coshf_FR_X, f0 ;;
  299. }
  300. { .mmi
  301. (p0) ldfe coshf_FR_P2 = [r34],16 ;;
  302. (p0) ldfe coshf_FR_P3 = [r34],16
  303. nop.i 999 ;;
  304. }
  305. { .mmi
  306. (p0) ldfe coshf_FR_P4 = [r34],16 ;;
  307. (p0) ldfe coshf_FR_P5 = [r34],16
  308. nop.i 999 ;;
  309. }
  310. { .mfi
  311. (p0) ldfe coshf_FR_P6 = [r34],16
  312. (p0) fma.s1 coshf_FR_X4 = coshf_FR_X2, coshf_FR_X2, f0
  313. nop.i 999 ;;
  314. }
  315. // Calculate coshf_FR_podd = x4 *(x4 * P_5 + P_3) + P_1
  316. { .mfi
  317. nop.m 999
  318. (p0) fma.s1 coshf_FR_poly_podd_temp1 = coshf_FR_X4, coshf_FR_P5, coshf_FR_P3
  319. nop.i 999 ;;
  320. }
  321. { .mfi
  322. nop.m 999
  323. (p0) fma.s1 coshf_FR_podd = coshf_FR_X4, coshf_FR_poly_podd_temp1, coshf_FR_P1
  324. nop.i 999
  325. }
  326. // Calculate coshf_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2)
  327. { .mfi
  328. nop.m 999
  329. (p0) fma.s1 coshf_FR_poly_peven_temp1 = coshf_FR_X4, coshf_FR_P6, coshf_FR_P4
  330. nop.i 999 ;;
  331. }
  332. { .mfi
  333. nop.m 999
  334. (p0) fma.s1 coshf_FR_poly_peven_temp2 = coshf_FR_X4, coshf_FR_poly_peven_temp1, coshf_FR_P2
  335. nop.i 999 ;;
  336. }
  337. { .mfi
  338. nop.m 999
  339. (p0) fma.s1 coshf_FR_peven = coshf_FR_X4, coshf_FR_poly_peven_temp2, f0
  340. nop.i 999 ;;
  341. }
  342. // Y_lo = x2*p_odd + p_even
  343. // Calculate f8 = 1 + Y_lo, rounded to single precision, and return
  344. { .mfi
  345. nop.m 999
  346. (p0) fma.s1 coshf_FR_Y_lo = coshf_FR_X2, coshf_FR_podd, coshf_FR_peven
  347. nop.i 999 ;;
  348. }
  349. { .mfb
  350. nop.m 999
  351. (p0) fma.s.s0 f8 = f1, f1, coshf_FR_Y_lo
  352. (p0) br.ret.sptk b0 ;;
  353. }
  354. COSH_BY_TBL: // |x| >= 0.25: range check, argument reduction and table lookup shared with COSH_BY_EXP, then the |x| < 32 result
  355. // Now that we are at TBL; so far all we know is that |x| >= 0.25.
  356. // The first two steps are the same for TBL and EXP, but if we are HUGE
  357. // we want to leave now.
  358. // (For comparison, double: go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true))
  359. // Single (this file):
  360. // Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
  361. // NOTE(review): a 2^14 threshold (1000d register-biased, e = 14) was also mentioned here;
  362. // it is not used by this code and is presumably left over from another precision variant.
  363. { .mlx
  364. nop.m 999
  365. (p0) movl r32 = 0x0000000000010006 ;;
  366. }
  367. { .mfi
  368. (p0) setf.exp f9 = r32
  369. nop.f 999
  370. nop.i 999 ;;
  371. }
  372. { .mfi
  373. nop.m 999
  374. (p0) fcmp.ge.unc p6,p7 = coshf_FR_X,f9
  375. nop.i 999 ;;
  376. }
  377. { .mib
  378. nop.m 999
  379. nop.i 999
  380. (p6) br.cond.spnt COSH_HUGE ;;
  381. }
  382. // r32 = 1
  383. // r34 = N-1
  384. // r35 = N
  385. // r36 = j
  386. // r37 = N+1 (computed, then overwritten with the address of single_coshf_j_table)
  387. // TBL can never overflow
  388. // coshf(x) = coshf(B+R)
  389. // = coshf(B) coshf(R) + sinh(B) sinh(R)
  390. // coshf(R) can be approximated by 1 + p_even
  391. // sinh(R) can be approximated by p_odd
  392. // ******************************************************
  393. // STEP 1 (TBL and EXP)
  394. // ******************************************************
  395. // Get the following constants.
  396. // f9 = Inv_log2by64
  397. // f10 = log2by64_hi
  398. // f11 = log2by64_lo
  399. { .mmi
  400. (p0) adds r32 = 0x1,r0
  401. (p0) addl r34 = @ltoff(single_coshf_arg_reduction), gp
  402. nop.i 999
  403. }
  404. ;;
  405. // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
  406. // put them in an exponent.
  407. // coshf_FR_spos = 2^(N-1) and coshf_FR_sneg = 2^(-N-1)
  408. // r39 = 0xffff + (N-1) = 0xffff +N -1
  409. // r40 = 0xffff - (N +1) = 0xffff -N -1 (r38 holds 0xfffe = 0xffff - 1 for both)
  410. { .mlx
  411. ld8 r34 = [r34]
  412. (p0) movl r38 = 0x000000000000fffe ;;
  413. }
  414. { .mmi
  415. (p0) ldfe coshf_FR_Inv_log2by64 = [r34],16 ;;
  416. (p0) ldfe coshf_FR_log2by64_hi = [r34],16
  417. nop.i 999 ;;
  418. }
  419. { .mbb
  420. (p0) ldfe coshf_FR_log2by64_lo = [r34],16
  421. nop.b 999
  422. nop.b 999 ;;
  423. }
  424. // Get the A coefficients (they reuse f9..f11 once the reduction constants have been consumed)
  425. // f9 = A_1
  426. // f10 = A_2
  427. // f11 = A_3
  428. { .mmi
  429. nop.m 999
  430. (p0) addl r34 = @ltoff(single_coshf_ab_table), gp
  431. nop.i 999
  432. }
  433. ;;
  434. { .mmi
  435. ld8 r34 = [r34]
  436. nop.m 999
  437. nop.i 999
  438. }
  439. ;;
  440. // Calculate M and keep it as integer and floating point.
  441. // M = round-to-integer(ax*Inv_log2by64), i.e. ax/(log2/64) converted to an integer
  442. // by fcvt.fx.s1 (the non-.trunc form, so it uses the rounding mode of status field s1)
  443. // Put the significand of M in r35
  444. // and the floating point representation of M in coshf_FR_M
  445. { .mfi
  446. nop.m 999
  447. (p0) fma.s1 coshf_FR_M = coshf_FR_X, coshf_FR_Inv_log2by64, f0
  448. nop.i 999
  449. }
  450. { .mfi
  451. (p0) ldfe coshf_FR_A1 = [r34],16 // overwrites f9 in the same instruction group that reads it as Inv_log2by64 above
  452. nop.f 999
  453. nop.i 999 ;;
  454. }
  455. { .mfi
  456. nop.m 999
  457. (p0) fcvt.fx.s1 coshf_FR_M_temp = coshf_FR_M
  458. nop.i 999 ;;
  459. }
  460. { .mfi
  461. nop.m 999
  462. (p0) fnorm.s1 coshf_FR_M = coshf_FR_M_temp
  463. nop.i 999 ;;
  464. }
  465. { .mfi
  466. (p0) getf.sig r35 = coshf_FR_M_temp
  467. nop.f 999
  468. nop.i 999 ;;
  469. }
  470. // M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
  471. // has a range of -32 thru 31. (The sign extension itself is done further down with shl 2 / sxt1 / shr 2.)
  472. // r35 = M
  473. // r36 = j
  474. { .mii
  475. nop.m 999
  476. nop.i 999 ;;
  477. (p0) and r36 = 0x3f, r35 ;;
  478. }
  479. // Calculate R
  480. // f13 = f44 - f38*f10 = ax - M*log2by64_hi
  481. // f14 = f13 - f38*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo
  482. { .mfi
  483. nop.m 999
  484. (p0) fnma.s1 coshf_FR_R_temp = coshf_FR_M, coshf_FR_log2by64_hi, coshf_FR_X
  485. nop.i 999
  486. }
  487. { .mfi
  488. (p0) ldfe coshf_FR_A2 = [r34],16
  489. nop.f 999
  490. nop.i 999 ;;
  491. }
  492. { .mfi
  493. nop.m 999
  494. (p0) fnma.s1 coshf_FR_R = coshf_FR_M, coshf_FR_log2by64_lo, coshf_FR_R_temp
  495. nop.i 999
  496. }
  497. // Get A_3 and the B coefficients
  498. // f15 = B_1
  499. // f32 = B_2
  500. // f33 = B_3
  501. { .mmi
  502. (p0) ldfe coshf_FR_A3 = [r34],16 ;;
  503. (p0) ldfe coshf_FR_B1 = [r34],16
  504. nop.i 999 ;;
  505. }
  506. { .mmi
  507. (p0) ldfe coshf_FR_B2 = [r34],16 ;;
  508. (p0) ldfe coshf_FR_B3 = [r34],16
  509. nop.i 999 ;;
  510. }
  511. { .mii
  512. nop.m 999
  513. (p0) shl r34 = r36, 0x2 ;; // move the 6-bit field to bits 7..2
  514. (p0) sxt1 r37 = r34 ;; // sign-extend from bit 7
  515. }
  516. // ******************************************************
  517. // STEP 2 (TBL and EXP)
  518. // ******************************************************
  519. // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
  520. // f12 = R*R*R
  521. // f13 = R*R
  522. // f14 = R <== from above
  523. { .mfi
  524. nop.m 999
  525. (p0) fma.s1 coshf_FR_Rsq = coshf_FR_R, coshf_FR_R, f0
  526. (p0) shr r36 = r37, 0x2 ;; // r36 = j, now sign-extended
  527. }
  528. // r34 = M-j = r35 - r36
  529. // r35 = N = (M-j)/64
  530. { .mii
  531. (p0) sub r34 = r35, r36
  532. nop.i 999 ;;
  533. (p0) shr r35 = r34, 0x6 ;;
  534. }
  535. { .mii
  536. (p0) sub r40 = r38, r35 // r40 = 0xfffe - N
  537. (p0) adds r37 = 0x1, r35 // r37 = N+1
  538. (p0) add r39 = r38, r35 ;; // r39 = 0xfffe + N
  539. }
  540. // Get the address of the J table, add the offset,
  541. // addresses are AD_mJ and AD_J, get the T value
  542. // f32 = T(j)_hi
  543. // f33 = T(j)_lo
  544. // f34 = T(-j)_hi
  545. // f35 = T(-j)_lo
  546. { .mmi
  547. (p0) sub r34 = r35, r32 // r34 = N-1 (r32 is still 1 here); used by COSH_BY_EXP
  548. (p0) addl r37 = @ltoff(single_coshf_j_table), gp
  549. nop.i 999
  550. }
  551. ;;
  552. { .mfi
  553. ld8 r37 = [r37]
  554. (p0) fma.s1 coshf_FR_Rcub = coshf_FR_Rsq, coshf_FR_R, f0
  555. nop.i 999
  556. }
  557. // ******************************************************
  558. // STEP 3 Now decide if we need to branch to EXP
  559. // ******************************************************
  560. // Put 32 in f9 (0x10004 is the biased exponent of 2^5); p6 true if ax < 32, p7 true otherwise
  561. { .mlx
  562. nop.m 999
  563. (p0) movl r32 = 0x0000000000010004 ;;
  564. }
  565. // Calculate p_even
  566. // f34 = B_2 + Rsq *B_3
  567. // f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
  568. // f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
  569. { .mfi
  570. nop.m 999
  571. (p0) fma.s1 coshf_FR_peven_temp1 = coshf_FR_Rsq, coshf_FR_B3, coshf_FR_B2
  572. nop.i 999 ;;
  573. }
  574. { .mfi
  575. nop.m 999
  576. (p0) fma.s1 coshf_FR_peven_temp2 = coshf_FR_Rsq, coshf_FR_peven_temp1, coshf_FR_B1
  577. nop.i 999
  578. }
  579. // Calculate p_odd
  580. // f34 = A_2 + Rsq *A_3
  581. // f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
  582. // f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
  583. { .mfi
  584. nop.m 999
  585. (p0) fma.s1 coshf_FR_podd_temp1 = coshf_FR_Rsq, coshf_FR_A3, coshf_FR_A2
  586. nop.i 999 ;;
  587. }
  588. { .mfi
  589. (p0) setf.exp coshf_FR_N_temp1 = r39
  590. nop.f 999
  591. nop.i 999 ;;
  592. }
  593. { .mfi
  594. nop.m 999
  595. (p0) fma.s1 coshf_FR_peven = coshf_FR_Rsq, coshf_FR_peven_temp2, f0
  596. nop.i 999
  597. }
  598. { .mfi
  599. nop.m 999
  600. (p0) fma.s1 coshf_FR_podd_temp2 = coshf_FR_Rsq, coshf_FR_podd_temp1, coshf_FR_A1
  601. nop.i 999 ;;
  602. }
  603. { .mfi
  604. (p0) setf.exp f9 = r32
  605. nop.f 999
  606. nop.i 999 ;;
  607. }
  608. { .mfi
  609. nop.m 999
  610. (p0) fma.s1 coshf_FR_podd = coshf_FR_podd_temp2, coshf_FR_Rcub, coshf_FR_R
  611. nop.i 999
  612. }
  613. // GR_mJ contains the table offset for -j: (32 - j) * 32
  614. // GR_J contains the table offset for +j: (32 + j) * 32
  615. // (each table entry is 32 bytes, hence the shift left by 5)
  616. { .mlx
  617. (p0) setf.exp coshf_FR_N_temp2 = r40
  618. (p0) movl r40 = 0x0000000000000020 ;;
  619. }
  620. { .mfi
  621. (p0) sub GR_mJ = r40, r36
  622. (p0) fmerge.se coshf_FR_spos = coshf_FR_N_temp1, f1
  623. (p0) adds GR_J = 0x20, r36 ;;
  624. }
  625. { .mii
  626. nop.m 999
  627. (p0) shl GR_mJ = GR_mJ, 5 ;;
  628. (p0) add AD_mJ = r37, GR_mJ ;;
  629. }
  630. { .mmi
  631. nop.m 999
  632. (p0) ldfe coshf_FR_Tmjhi = [AD_mJ],16
  633. (p0) shl GR_J = GR_J, 5 ;;
  634. }
  635. { .mfi
  636. (p0) ldfs coshf_FR_Tmjlo = [AD_mJ],16
  637. (p0) fcmp.lt.unc.s1 p6,p7 = coshf_FR_X,f9
  638. (p0) add AD_J = r37, GR_J ;;
  639. }
  640. { .mmi
  641. (p0) ldfe coshf_FR_Tjhi = [AD_J],16 ;;
  642. (p0) ldfs coshf_FR_Tjlo = [AD_J],16
  643. nop.i 999 ;;
  644. }
  645. { .mfb
  646. nop.m 999
  647. (p0) fmerge.se coshf_FR_sneg = coshf_FR_N_temp2, f1
  648. (p7) br.cond.spnt COSH_BY_EXP ;;
  649. }
  650. // ******************************************************
  651. // If NOT branch to EXP
  652. // ******************************************************
  653. // Calculate C_hi
  654. // ******************************************************
  655. // coshf_FR_C_hi_temp = coshf_FR_sneg * coshf_FR_Tmjhi
  656. // coshf_FR_C_hi = coshf_FR_spos * coshf_FR_Tjhi + (coshf_FR_sneg * coshf_FR_Tmjhi)
  657. { .mfi
  658. nop.m 999
  659. (p0) fma.s1 coshf_FR_C_hi_temp = coshf_FR_sneg, coshf_FR_Tmjhi, f0
  660. nop.i 999 ;;
  661. }
  662. { .mfi
  663. nop.m 999
  664. (p0) fma.s1 coshf_FR_C_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi_temp
  665. nop.i 999
  666. }
  667. // ******************************************************
  668. // Calculate S_hi
  669. // ******************************************************
  670. // coshf_FR_S_hi_temp1 = coshf_FR_sneg * coshf_FR_Tmjhi
  671. // (the same product as coshf_FR_C_hi_temp, written to the same register f10)
  672. // coshf_FR_S_hi = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_S_hi_temp1
  673. { .mfi
  674. nop.m 999
  675. (p0) fma.s1 coshf_FR_S_hi_temp1 = coshf_FR_sneg, coshf_FR_Tmjhi, f0
  676. nop.i 999 ;;
  677. }
  678. // ******************************************************
  679. // Calculate C_lo
  680. // ******************************************************
  681. // coshf_FR_C_lo_temp1 = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi
  682. // coshf_FR_C_lo_temp2 = coshf_FR_sneg * coshf_FR_Tmjhi + (coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi)
  683. // coshf_FR_C_lo_temp1 = coshf_FR_sneg * coshf_FR_Tmjlo
  684. // coshf_FR_C_lo_temp3 = coshf_FR_spos * coshf_FR_Tjlo + (coshf_FR_sneg * coshf_FR_Tmjlo)
  685. // coshf_FR_C_lo = coshf_FR_C_lo_temp3 + coshf_FR_C_lo_temp2
  686. { .mfi
  687. nop.m 999
  688. (p0) fms.s1 coshf_FR_C_lo_temp1 = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi
  689. nop.i 999
  690. }
  691. { .mfi
  692. nop.m 999
  693. (p0) fms.s1 coshf_FR_S_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_S_hi_temp1
  694. nop.i 999 ;;
  695. }
  696. { .mfi
  697. nop.m 999
  698. (p0) fma.s1 coshf_FR_C_lo_temp2 = coshf_FR_sneg, coshf_FR_Tmjhi, coshf_FR_C_lo_temp1
  699. nop.i 999
  700. }
  701. { .mfi
  702. nop.m 999
  703. (p0) fma.s1 coshf_FR_C_lo_temp1 = coshf_FR_sneg, coshf_FR_Tmjlo, f0
  704. nop.i 999 ;;
  705. }
  706. { .mfi
  707. nop.m 999
  708. (p0) fma.s1 coshf_FR_C_lo_temp3 = coshf_FR_spos, coshf_FR_Tjlo, coshf_FR_C_lo_temp1
  709. nop.i 999 ;;
  710. }
  711. { .mfi
  712. nop.m 999
  713. (p0) fma.s1 coshf_FR_C_lo = coshf_FR_C_lo_temp3, f1, coshf_FR_C_lo_temp2
  714. nop.i 999 ;;
  715. }
  716. // ******************************************************
  717. // coshf_FR_Y_lo_temp = coshf_FR_C_hi * coshf_FR_peven + coshf_FR_C_lo
  718. // coshf_FR_Y_lo = coshf_FR_S_hi * coshf_FR_podd + coshf_FR_Y_lo_temp
  719. // f8 = coshf_FR_C_hi + coshf_FR_Y_lo, rounded to single precision
  720. { .mfi
  721. nop.m 999
  722. (p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_C_hi, coshf_FR_peven, coshf_FR_C_lo
  723. nop.i 999 ;;
  724. }
  725. { .mfi
  726. nop.m 999
  727. (p0) fma.s1 coshf_FR_Y_lo = coshf_FR_S_hi, coshf_FR_podd, coshf_FR_Y_lo_temp
  728. nop.i 999 ;;
  729. }
  730. { .mfb
  731. nop.m 999
  732. (p0) fma.s.s0 f8 = coshf_FR_C_hi, f1, coshf_FR_Y_lo
  733. (p0) br.ret.sptk b0 ;;
  734. }
  734. COSH_BY_EXP: // 32 <= |x| < 2^7: result is spos * (T(j) scaled by the polynomial), with an overflow check
  735. // p7 is recomputed below as the over_SAFE flag:
  736. // when p7 is true, we know that an overflow is not going to happen;
  737. // when p7 is false, we must check for possible overflow
  738. // f44 = Scale * (Y_hi + Y_lo)
  739. // = coshf_FR_spos * (coshf_FR_Tjhi + coshf_FR_Y_lo)
  740. { .mfi
  741. nop.m 999
  742. (p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_peven, f1, coshf_FR_podd
  743. nop.i 999
  744. }
  745. // Now we are in EXP. This is the only path where an overflow is possible
  746. // but not for certain. So this is the only path where over_SAFE has any use.
  747. // r34 still has N-1
  748. // There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
  749. // There is a danger of double overflow if N-1 > 0x3fe = 1022
  750. // There is a danger of single overflow if N-1 > 0x7e = 126 (the limit tested here)
  751. { .mlx
  752. nop.m 999
  753. (p0) movl r32 = 0x000000000000007e ;;
  754. }
  755. { .mfi
  756. (p0) cmp.gt.unc p0,p7 = r34, r32 // p7 = !(N-1 > 0x7e)
  757. nop.f 999
  758. nop.i 999 ;;
  759. }
  760. { .mfi
  761. nop.m 999
  762. (p0) fma.s1 coshf_FR_Y_lo = coshf_FR_Tjhi, coshf_FR_Y_lo_temp, coshf_FR_Tjlo
  763. nop.i 999 ;;
  764. }
  765. { .mfi
  766. nop.m 999
  767. (p0) fma.s1 coshf_FR_COSH_temp = coshf_FR_Y_lo, f1, coshf_FR_Tjhi
  768. nop.i 999 ;;
  769. }
  770. { .mfi
  771. nop.m 999
  772. (p0) fma.s.s0 f44 = coshf_FR_spos, coshf_FR_COSH_temp, f0 // overwrites coshf_FR_X (|x|); f44 doubles as FR_RESULT
  773. nop.i 999 ;;
  774. }
  775. // If over_SAFE is set, return
  776. { .mfb
  777. nop.m 999
  778. (p7) fmerge.s f8 = f44,f44
  779. (p7) br.ret.sptk b0 ;;
  780. }
  781. // Else see if we overflowed
  782. // S0 user supplied status
  783. // S2 user supplied status + WRE + TD (Overflows)
  784. // If WRE is set then an overflow will not occur in EXP.
  785. // The input value that would cause a register (WRE) value to overflow is about 2^15
  786. // and this input would go into the HUGE path.
  787. // Answer with WRE is in f43.
  788. { .mfi
  789. nop.m 999
  790. (p0) fsetc.s2 0x7F,0x42
  791. nop.i 999;;
  792. }
  793. { .mfi
  794. nop.m 999
  795. (p0) fma.s.s2 f43 = coshf_FR_spos, coshf_FR_COSH_temp, f0
  796. nop.i 999 ;;
  797. }
  798. // 1 more that the exponent of the largest double (7FE) = 7FF
  799. // 7FF - 3FF = 400 (true); 400 + FFFF = 103FF (register-biased)
  800. // So 0 103FF 8000000000000000 is one ulp more than
  801. // largest double in register bias
  802. // 1 more that the exponent of the largest single (FE) = FF
  803. // FF - 7F = 80 (true); 80 + FFFF = 1007F (register-biased); this single-precision value is the one used
  804. // Now set p8 if the answer with WRE is greater than or equal this value
  805. // Also set p9 if the answer with WRE is less than or equal to negative this value
  806. { .mlx
  807. nop.m 999
  808. (p0) movl r32 = 0x000000000001007f ;;
  809. }
  810. { .mmf
  811. nop.m 999
  812. (p0) setf.exp f41 = r32
  813. (p0) fsetc.s2 0x7F,0x40 ;; // second fsetc on s2, undoing the 0x42 setting made for the f43 computation
  814. }
  815. { .mfi
  816. nop.m 999
  817. (p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
  818. nop.i 999
  819. }
  820. { .mfi
  821. nop.m 999
  822. (p0) fmerge.ns f42 = f41, f41 // f42 = -f41
  823. nop.i 999 ;;
  824. }
  825. // The error tag for overflow is 65
  826. { .mii
  827. nop.m 999
  828. nop.i 999 ;;
  829. (p8) mov GR_Parameter_TAG = 65 ;;
  830. }
  831. { .mfb
  832. nop.m 999
  833. (p0) fcmp.le.unc.s1 p9, p0 = f43, f42
  834. (p8) br.cond.spnt __libm_error_region ;;
  835. }
  836. { .mii
  837. nop.m 999
  838. nop.i 999 ;;
  839. (p9) mov GR_Parameter_TAG = 64 // NOTE(review): the result looks non-negative here, so this p9 path appears unreachable - confirm
  840. }
  841. { .mib
  842. nop.m 999
  843. nop.i 999
  844. (p9) br.cond.spnt __libm_error_region ;;
  845. }
  846. { .mfb
  847. nop.m 999
  848. (p0) fmerge.s f8 = f44,f44 // no overflow detected: return the s0 result
  849. (p0) br.ret.sptk b0 ;;
  850. }
  851. COSH_HUGE: // |x| >= 2^7: force an overflowing single-precision result, then report it with tag 65
  852. // for COSH_HUGE, put 24000 in exponent (0x15dbf - 0xffff = 24000); add 1; multiply by 2^24000 again
  853. // SAFE: SAFE is always 0 for HUGE
  854. { .mlx
  855. nop.m 999
  856. (p0) movl r32 = 0x0000000000015dbf ;;
  857. }
  858. { .mfi
  859. (p0) setf.exp f9 = r32
  860. nop.f 999
  861. nop.i 999 ;;
  862. }
  863. { .mfi
  864. nop.m 999
  865. (p0) fma.s1 coshf_FR_hi_lo = f1, f9, f1 // hi_lo = 2^24000 + 1
  866. nop.i 999 ;;
  867. }
  868. { .mfi
  869. nop.m 999
  870. (p0) fma.s.s0 f44 = f9, coshf_FR_hi_lo, f0 // FR_RESULT = 2^24000 * hi_lo rounded to single in s0
  871. (p0) mov GR_Parameter_TAG = 65 // no branch follows: execution continues into __libm_error_region below
  872. }
  873. .endp coshf
  874. .proc __libm_error_region
  875. __libm_error_region: // stores x, f0 and the result on a new 64-byte frame, calls __libm_error_support, returns the reloaded result in f8
  876. .prologue
  877. { .mfi
  878. add GR_Parameter_Y=-32,sp // Parameter 2 value (address: old sp - 32 = new sp + 32)
  879. nop.f 0
  880. .save ar.pfs,GR_SAVE_PFS
  881. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  882. }
  883. { .mfi
  884. .fframe 64
  885. add sp=-64,sp // Create new stack
  886. nop.f 0
  887. mov GR_SAVE_GP=gp // Save gp
  888. };;
  889. { .mmi
  890. stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack; GR_Parameter_Y then advances to sp + 48
  891. add GR_Parameter_X = 16,sp // Parameter 1 address
  892. .save b0, GR_SAVE_B0
  893. mov GR_SAVE_B0=b0 // Save b0
  894. };;
  895. .body
  896. { .mib
  897. stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
  898. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp + 48)
  899. nop.b 0
  900. }
  901. { .mib
  902. stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
  903. add GR_Parameter_Y = -16,GR_Parameter_Y // back to the Parameter 2 address (sp + 32)
  904. br.call.sptk.many b0=__libm_error_support# // Call error handling function
  905. };;
  906. { .mmi
  907. nop.m 0
  908. nop.m 0
  909. add GR_Parameter_RESULT = 48,sp // recompute the result address after the call (see the 8/15/00 history note)
  910. };;
  911. { .mmi
  912. ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
  913. .restore
  914. add sp = 64,sp // Restore stack pointer
  915. mov b0 = GR_SAVE_B0 // Restore return address
  916. };;
  917. { .mib
  918. mov gp = GR_SAVE_GP // Restore gp
  919. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  920. br.ret.sptk b0 // Return
  921. };;
  922. .endp __libm_error_region
  923. .type __libm_error_support#,@function
  924. .global __libm_error_support#