Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

572 lines
16 KiB

  1. .file "tanhf.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. // History
  27. //==============================================================
  28. // 5/30/01 Initial version
  29. //
  30. // API
  31. //==============================================================
  32. // float tanhf(float)
  33. //
  34. // Overview of operation
  35. //==============================================================
  36. // Background
  37. //
  38. //
  39. // There are 9 paths:
  40. // 1. x = +/-0.0
  41. // Return tanhf(x) = +/-0.0
  42. //
  43. // 2. 0.0 < |x| < 0.3125
  44. // Return tanhf(x) = x + x^3*Pol3(x^2),
  45. // where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
  46. //
  47. // 3. 0.3125 <= |x| < 8.0
  48. // Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
  49. // where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
  50. // PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
  51. // PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
  52. //
  53. // Actually range 0.3125<=|x|< 8.0 is split to 5 subranges.
  54. // For each subrange there is particular set of coefficients.
  55. // Below is the list of subranges:
  56. // 3.1 0.3125 <= |x| < 0.5
  57. // 3.2 0.5 <= |x| < 1.0
  58. // 3.3 1.0 <= |x| < 2.0
  59. // 3.4 2.0 <= |x| < 4.0
  60. // 3.5 4.0 <= |x| < 8.0
  61. //
  62. // 4. 8.0 <= |x| < 9.125
  63. // Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
  64. //
  65. // 5. 9.125 <= |x| < +INF
  66. // Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
  67. //
  68. // 6. |x| = INF
  69. // Return tanhf(x) = sign(x) * 1.0
  70. //
  71. // 7. x = [S,Q]NaN
  72. // Return tanhf(x) = QNaN
  73. //
  74. // 8. x is positive denormal
  75. // Return tanhf(x) = x - x^2
  76. //
  77. // 9. x is negative denormal
  78. // Return tanhf(x) = x + x^2
  79. //
  80. // Registers used
  81. //==============================================================
  82. // Floating Point registers used:
  83. // f8, input
  84. // f32 -> f59
  85. // General registers used:
  86. // r32 -> r46, r2, r3
  87. // Predicate registers used:
  88. // p0, p6 -> p15
  89. // p6 to filter out case when x = [Q,S]NaN or +/-0
  90. // p7 to filter out case when x = denormal
  91. // p8 set if |x| >= 0.3125, used also to process denormal input
  92. // p9 to filter out case when |x| = inf
  93. // p10 to filter out case when |x| < 0.3125
  94. // p11 to filter out case when |x| < 9.125 (with p8 set: 0.3125 <= |x| < 9.125)
  95. // p12 to filter out case when |x| >= 9.125
  96. // p13 to filter out case when 8.0 <= |x| < 9.125
  97. // p14 set to 1 for positive x
  98. // p15 set to 1 for negative x
  99. // Assembly macros: symbolic names for the general and floating-point registers used by tanhf
  100. //==============================================================
  101. rDataPtr = r2 // address of tanhf_data (later advanced by 544 on the saturation path)
  102. rDataPtr1 = r3 // not referenced by the code in this file
  103. rBias = r33 // coefficient-set index: rOffset2 - rBias2, or 0x14 when |x| < 0.3125
  104. rCoeffAddr3 = r34 // address used to load C2,C3 and then D0,D1
  105. rNearSaturation = r35 // constant 0x14, compared with rBias to select the 8.0 <= |x| < 9.125 path
  106. rCoeffAddr1 = r36 // address used to load C0,C1 and then D2,B0
  107. rCoeffAddr2 = r37 // address used to load A0,A1 and then A2,A3
  108. rOffset2 = r38 // bits of x with rMask cleared, shifted right by 21
  109. rBias2 = r39 // constant 0x1F4 subtracted from rOffset2
  110. rMask = r40 // 0x806 << 20: sign bit and 2 top significand bits
  111. rArg = r41 // single precision bit pattern of x
  112. rBound = r42 // 0x3EA << 20, the bit pattern |x| is compared with for the 0.3125 threshold
  113. rSignBit = r43 // 1 << 31, mask for the sign bit
  114. rAbsArg = r44 // bit pattern of |x| (rArg with the sign bit cleared)
  115. rDataPtr2 = r45 // not referenced by the code in this file
  116. rSaturation = r46 // 0x4112 << 16, the bit pattern |x| is compared with for the 9.125 threshold
  117. //==============================================================
  118. fA0 = f32 // A0..A3: coefficients of PolA (fA0 also receives the 1.0 - epsilon constant)
  119. fA1 = f33
  120. fA2 = f34
  121. fA3 = f35
  122. fC0 = f36 // C0..C3: coefficients of PolC, or of the |x| < 0.3125 polynomial
  123. fC1 = f37
  124. fC2 = f38
  125. fC3 = f39
  126. fD0 = f40 // D0..D2: coefficients of PolD
  127. fD1 = f41
  128. fD2 = f42
  129. fB0 = f43 // B0 coefficient, later overwritten with B0*x^4
  130. fArgSqr = f44 // x^2
  131. fAbsArg = f45 // |x|
  132. fSignumX = f46 // 1.0 carrying the sign of x
  133. fArg4 = f47 // x^4
  134. fArg4Sgn = f48 // sign(x)*x^4
  135. fArg3 = f49 // |x|^3
  136. fArg3Sgn = f50 // sign(x)*|x|^3
  137. fArg7Sgn = f51 // sign(x)*|x|^7
  138. fArg6Sgn = f52 // sign(x)*x^6
  139. fPolC = f53 // PolC accumulator
  140. fPolCTmp = f54 // C1*|x| + C0 (or C1*x^2 + C0 near zero)
  141. fPolA = f55 // PolA accumulator
  142. fPolATmp = f56 // partial sum of PolA
  143. fPolD = f57 // PolD accumulator
  144. fPolDTmp = f58 // sign(x)*(|x|^7 + D2*x^6)
  145. fArgSqrSgn = f59 // sign(x)*x^2
  146. // Data tables: double precision coefficients; byte offsets from tanhf_data are given per set
  147. //==============================================================
  148. .data
  149. .align 16
  150. tanhf_data:
  151. // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (C/D/B set, byte offset 0)
  152. data8 0x3F9BEEDFDD177D7B // C0
  153. data8 0x3F970D10C7F32458 // C1
  154. data8 0x3F766D6B051F3A38 // C2
  155. data8 0xBF732F2001B23402 // C3
  156. data8 0xBF854BE1CE1ED499 // D0
  157. data8 0x4013C944F3999A16 // D1
  158. data8 0xC01106C6975222C0 // D2
  159. data8 0x3F783D5ACCF9EBE8 // B0
  160. // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 (C/D/B set, byte offset 64)
  161. data8 0xBF5D631440786869 // C0
  162. data8 0xBF575D79A0D52069 // C1
  163. data8 0xBF7E2237B7EFC705 // C2
  164. data8 0x3F6A7ACBC273041F // C3
  165. data8 0xC040E32EA52D91EB // D0
  166. data8 0x403D19463E5DB4D7 // D1
  167. data8 0xC02216F61F759F39 // D2
  168. data8 0xBF55B4EA0B844BE7 // B0
  169. // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 (C/D/B set, byte offset 128)
  170. data8 0x3F8637DBE5B3E690 // C0
  171. data8 0xBF7F7FEC158C07F5 // C1
  172. data8 0x3F711C586706838A // C2
  173. data8 0xBF50EF7EF605554E // C3
  174. data8 0xC054D45448354E25 // D0
  175. data8 0x404ADFEEA282E730 // D1
  176. data8 0xC028AEE456D59549 // D2
  177. data8 0x3F25232D1BED59A8 // B0
  178. // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0 (C/D/B set, byte offset 192)
  179. data8 0xBF52602285F2D06C // C0
  180. data8 0x3F2E57C298FFE1E0 // C1
  181. data8 0xBF15ED575DB3C811 // C2
  182. data8 0x3EE428878A08525C // C3
  183. data8 0xC0895A26849039C1 // D0
  184. data8 0x406E3C60BBFBB575 // D1
  185. data8 0xC03A06F62867C75A // D2
  186. data8 0xBEB114C70F1C723E // B0
  187. // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0 (C/D/B set, byte offset 256)
  188. data8 0x3EF4B22BD17039A3 // C0
  189. data8 0xBEB704ADC040C57F // C1
  190. data8 0x3E937A98288AFE1A // C2
  191. data8 0xBE4F33B2C9FFE7E7 // C3
  192. data8 0xC0BE48CFADE2431E // D0
  193. data8 0x4090E74249760FDD // D1
  194. data8 0xC04B6F537FCF2F1E // D2
  195. data8 0x3E0DCD879C91ADEA // B0
  196. // Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125 (byte offset 320; the code loads these into fC0..fC3)
  197. data8 0xBFD555551E8245B7 // A0
  198. data8 0x3FC110E63F52E689 // A1
  199. data8 0xBFAB8CD6A5B7BAFA // A2
  200. data8 0x3F945D467FCEB553 // A3
  201. // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (A set, byte offset 352)
  202. data8 0xBE3DCC92FCAECBB6 // A0
  203. data8 0x3FF0000043B7D267 // A1
  204. data8 0xBED18BF28ACFC4B1 // A2
  205. data8 0xBFD554A56F82837E // A3
  206. // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 (A set, byte offset 384)
  207. data8 0x3EFD6054758539F9 // A0
  208. data8 0x3FEFFBFC77198EBE // A1
  209. data8 0x3F700327CA98D237 // A2
  210. data8 0xBFD68955F5BB2FA1 // A3
  211. // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 (A set, byte offset 416)
  212. data8 0xBF71A53F229DF01B // A0
  213. data8 0x3FF0AECFD730DE50 // A1
  214. data8 0xBFC882F88E5DF3BA // A2
  215. data8 0x3FC6EDF212CA2A8D // A3
  216. // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0 (A set, byte offset 448)
  217. data8 0xBFAF0B712E9EDA47 // A0
  218. data8 0x3FF1C208080BEA64 // A1
  219. data8 0x3FC3D29B20C8946E // A2
  220. data8 0xBFF04514ED900A6A // A3
  221. // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0 (A set, byte offset 480)
  222. data8 0xBFB1DEA49A831CBC // A0
  223. data8 0x3FFA729FC7085674 // A1
  224. data8 0xBFF2F44D923A8FA4 // A2
  225. data8 0x3FE092FC5712227E // A3
  226. // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 9.125 (A set, byte offset 512)
  227. data8 0x3FEFFF5769EE3041 // A0
  228. data8 0x3EFBBF148D850891 // A1
  229. data8 0xBEC86BCEF0F5C2FE // A2
  230. data8 0x3E7CBA4F3A885A5C // A3
  231. data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon: largest double below 1.0 (byte offset 544)
  232. .align 64
  233. .global tanhf#
  234. .section .text
  235. .proc tanhf#
  236. .align 64
  237. tanhf: // entry: x is read from f8 and the result is written back to f8
  238. { .mfi
  239. alloc r32 = ar.pfs, 1, 14, 0, 0 // 1 input + 14 locals: r32..r46
  240. fmerge.s fAbsArg = f1, f8 // |x|
  241. addl rMask = 0x806, r0 // shifted left by 20 below
  242. }
  243. { .mfi
  244. addl rDataPtr = @ltoff(tanhf_data), gp // address of the linkage table entry for tanhf_data
  245. fma.s1 fArgSqr = f8, f8, f0 // x^2
  246. adds rSignBit = 0x1, r0 // shifted left by 31 below
  247. }
  248. ;;
  249. { .mfi
  250. getf.s rArg = f8 // x in GR
  251. fclass.m p7,p0 = f8, 0x0b // is x denormal ?
  252. // rMask = 0x80600000: sign bit and 2 most significant bits of the significand
  253. shl rMask = rMask, 20
  254. }
  255. { .mfi
  256. ld8 rDataPtr = [rDataPtr] // rDataPtr = address of tanhf_data
  257. nop.f 0
  258. adds rBias2 = 0x1F4, r0 // 0x1F4 = 4*0x7D; 0x7D is the exponent field of 0.3125f (0x3EA00000)
  259. }
  260. ;;
  261. { .mfi
  262. adds rNearSaturation = 0x14, r0 // rBias value that selects the 8.0 <= |x| < 9.125 path
  263. fmerge.s fSignumX = f8, f1 // signum(x)
  264. shl rSignBit = rSignBit, 31 // mask for sign bit
  265. }
  266. { .mfi
  267. adds rBound = 0x3EA, r0 // shifted left by 20 below
  268. nop.f 0
  269. addl rSaturation = 0x4112, r0 // shifted left by 16 below
  270. }
  271. ;;
  272. { .mfi
  273. andcm rOffset2 = rArg, rMask // drop sign bit and top 2 significand bits
  274. fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
  275. shl rBound = rBound, 20 // 0x3EA00000 = 0.3125f in GR
  276. }
  277. { .mfb
  278. andcm rAbsArg = rArg, rSignBit // |x| in GR
  279. nop.f 0
  280. (p7) br.cond.spnt tanhf_denormal // branch out if x is denormal
  281. }
  282. ;;
  283. { .mfi
  284. adds rCoeffAddr2 = 352, rDataPtr // first A-coefficient set for |x| >= 0.3125
  285. fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
  286. shr rOffset2 = rOffset2, 21 // 4 * (exponent field of x)
  287. }
  288. { .mfi
  289. cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
  290. nop.f 0
  291. adds rCoeffAddr3 = 16, rDataPtr // address of C2 in the first C/D/B set
  292. }
  293. ;;
  294. { .mfi
  295. (p8) sub rBias = rOffset2, rBias2 // 4 * (exponent field - 0x7D)
  296. fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4
  297. shl rSaturation = rSaturation, 16 // 0x41120000 = 9.125f in GR
  298. }
  299. { .mfb
  300. (p10) adds rBias = 0x14, r0 // |x| < 0.3125: 16*0x14 = 320 selects the near-zero coefficients
  301. (p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0
  302. (p6) br.ret.spnt b0 // exit for x = NaN or +/-0
  303. }
  304. ;;
  305. { .mfi
  306. shladd rCoeffAddr1 = rBias, 4, rDataPtr // rDataPtr + 16*rBias: C0 of the selected set
  307. fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
  308. // is |x| < 9.125?
  309. cmp.lt p11, p12 = rAbsArg, rSaturation
  310. }
  311. { .mfi
  312. shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3 // C2 of the selected set
  313. fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
  314. shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2 // rDataPtr + 352 + 8*rBias: A0 of the selected set
  315. }
  316. ;;
  317. { .mfi
  318. (p11) ldfpd fC0, fC1 = [rCoeffAddr1]
  319. (p9) fmerge.s f8 = f8,f1 // +/- inf
  320. (p12) adds rDataPtr = 544, rDataPtr // address of the 1.0 - epsilon constant
  321. }
  322. { .mfb
  323. (p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16 // post-increment to D0
  324. nop.f 0
  325. (p9) br.ret.spnt b0 // exit for x = +/- inf
  326. }
  327. ;;
  328. { .mfi
  329. (p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16 // post-increment to A2
  330. nop.f 0
  331. (p8) cmp.eq.unc p13, p0 = rBias, rNearSaturation // p13 = p8 and rBias == 0x14
  332. }
  333. { .mfi
  334. add rCoeffAddr1 = 48, rCoeffAddr1 // advance from C0 to D2
  335. nop.f 0
  336. nop.i 0
  337. }
  338. ;;
  339. { .mfi
  340. (p11) ldfpd fD0, fD1 = [rCoeffAddr3]
  341. nop.f 0
  342. nop.i 0
  343. }
  344. { .mfb
  345. (p11) ldfpd fD2, fB0 = [rCoeffAddr1]
  346. // sign(x)*|x|^2
  347. fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0
  348. (p10) br.cond.spnt tanhf_near_zero // |x| < 0.3125
  349. }
  350. ;;
  351. { .mfi
  352. (p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16
  353. fcmp.lt.s1 p15, p14 = f8,f0 // p15: x < 0, p14: otherwise
  354. nop.i 0
  355. }
  356. { .mfb
  357. (p12) ldfd fA0 = [rDataPtr] // fA0 = 1.0 - epsilon for the saturation path
  358. fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
  359. (p12) br.cond.spnt tanhf_saturation // |x| >= 9.125
  360. }
  361. ;;
  362. { .mfi
  363. nop.m 0
  364. fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
  365. nop.i 0
  366. }
  367. { .mfb
  368. nop.m 0
  369. fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
  370. (p13) br.cond.spnt tanhf_close_to_saturation // 8.0 <= |x| < 9.125
  371. }
  372. ;;
  373. { .mfi
  374. nop.m 0
  375. fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2
  376. nop.i 0
  377. }
  378. { .mfi
  379. nop.m 0
  380. fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
  381. nop.i 0
  382. };;
  383. { .mfi
  384. nop.m 0
  385. fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0
  386. nop.i 0
  387. }
  388. ;;
  389. { .mfi
  390. nop.m 0
  391. fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0
  392. nop.i 0
  393. }
  394. { .mfi
  395. nop.m 0
  396. // sign(x)*(|x|^7 + D2*x^6)
  397. fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
  398. nop.i 0
  399. };;
  400. { .mfi
  401. nop.m 0
  402. fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
  403. nop.i 0
  404. }
  405. { .mfi
  406. nop.m 0
  407. fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4
  408. nop.i 0
  409. };;
  410. { .mfi
  411. nop.m 0
  412. // C3*|x|^3 + C2*x^2 + C1*|x| + C0
  413. fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
  414. nop.i 0
  415. }
  416. ;;
  417. { .mfi
  418. nop.m 0
  419. // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
  420. fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
  421. nop.i 0
  422. }
  423. ;;
  424. { .mfi
  425. nop.m 0
  426. // PolA = A3*|x|^3 + A2*x^2 + A1*|x| + A0
  427. fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
  428. nop.i 0
  429. }
  430. ;;
  431. { .mfi
  432. nop.m 0
  433. // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
  434. fma.d.s1 fPolC = fPolC, f1, fB0
  435. nop.i 0
  436. }
  437. ;;
  438. { .mfi
  439. nop.m 0
  440. (p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x: PolC*PolD + PolA
  441. nop.i 0
  442. }
  443. { .mfb
  444. nop.m 0
  445. (p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x: PolC*PolD - PolA (PolD carries the sign of x)
  446. br.ret.sptk b0 // Exit for 0.3125 <=|x|< 8.0
  447. };;
  448. // Here if |x| < 0.3125: fC0..fC3 hold the near-zero coefficients, the result is x + x^3*Pol3(x^2)
  449. tanhf_near_zero:
  450. { .mfi
  451. nop.m 0
  452. fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2
  453. nop.i 0
  454. }
  455. { .mfi
  456. nop.m 0
  457. fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
  458. nop.i 0
  459. };;
  460. { .mfi
  461. nop.m 0
  462. fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
  463. nop.i 0
  464. };;
  465. { .mfb
  466. nop.m 0
  467. // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0); fArg3Sgn holds x^3 with the sign of x
  468. fma.s.s0 f8 = fPolC, fArg3Sgn, f8
  469. br.ret.sptk b0 // Exit for |x| < 0.3125
  470. };;
  471. // Here if 9.125 <= |x| < +inf: fA0 holds the 1.0 - epsilon constant loaded from tanhf_data+544
  472. tanhf_saturation:
  473. { .mfb
  474. nop.m 0
  475. fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0 - epsilon), delivered as a single precision result
  476. // fSignumX is 1.0 carrying the sign of x
  477. br.ret.sptk b0 // Exit for 9.125 <=|x|< +inf
  478. }
  479. ;;
  480. // Here if 8.0 <= |x| < 9.125: result is sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0)
  481. tanhf_close_to_saturation:
  482. { .mfi
  483. nop.m 0
  484. fma.s1 fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
  485. nop.i 0
  486. }
  487. { .mfi
  488. nop.m 0
  489. fma.s1 fPolA = fA3, fAbsArg, fA2 // A3*|x| + A2
  490. nop.i 0
  491. }
  492. ;;
  493. .pred.rel "mutex", p14, p15
  494. { .mfi
  495. nop.m 0
  496. // for positive x: (A3*|x| + A2)*x^2 + (A1*|x| + A0)
  497. (p14) fma.s.s0 f8 = fPolA, fArgSqr, fPolATmp
  498. nop.i 0
  499. }
  500. { .mfb
  501. nop.m 0
  502. // for negative x: (A3*|x| + A2)*(-x^2) - (A1*|x| + A0)
  503. (p15) fms.s.s0 f8 = fPolA, fArgSqrSgn, fPolATmp
  504. br.ret.sptk b0 // Exit for 8.0 <=|x|< 9.125
  505. };;
  506. // Here if x is single precision denormal: return x + x^2 for negative x, x - x^2 for positive x
  507. tanhf_denormal: // NOTE(review): the x^2 term is presumably there only to raise the IEEE flags - confirm
  508. { .mfi
  509. nop.m 0
  510. fclass.m p7,p8 = f8, 0x0a // is x -denormal ? p7 if yes, p8 otherwise
  511. nop.i 0
  512. }
  513. ;;
  514. { .mfi
  515. nop.m 0
  516. (p7) fma.s.s0 f8 = f8,f8,f8 // -denormal: x*x + x
  517. nop.i 0
  518. }
  519. { .mfb
  520. nop.m 0
  521. (p8) fnma.s.s0 f8 = f8,f8,f8 // +denormal: -(x*x) + x
  522. br.ret.sptk b0 // Exit for denormal
  523. }
  524. ;;
  525. .endp tanhf