Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

428 lines
8.7 KiB

  1. .file "hypot.asm"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
  6. // Bob Norin, Shane Story, and Ping Tak Peter Tang of the
  7. // Computational Software Lab, Intel Corporation.
  8. //
  9. // WARRANTY DISCLAIMER
  10. //
  11. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  12. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  13. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  14. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  15. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  16. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  17. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  18. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  19. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  20. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  21. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  22. //
  23. // Intel Corporation is the author of this code, and requests that all
  24. // problem reports or change requests be submitted to it directly at
  25. // http://developer.intel.com/opensource.
  26. //
  27. //*********************************************************************
  28. //
  29. // History:
  30. // 2/02/00 hand-optimized
  31. // 4/04/00 Unwind support added
  32. // 6/20/00 new version
  33. // 8/15/00 Bundle added after call to __libm_error_support to properly
  34. // set [the previously overwritten] GR_Parameter_RESULT.
  35. //
  36. //*********************************************************************
  37. //
  38. // Function: hypot(x,y) = sqrt(x^2 + y^2) for double precision values
  39. // x and y
  40. // Also provides cabs functionality.
  41. //
  42. //*********************************************************************
  43. //
  44. // Resources Used:
  45. //
  46. // Floating-Point Registers: f8 (Input and Return Value)
  47. // f9 (Input)
  48. // f6-f15, f32-f34
  49. //
  50. // General Purpose Registers:
  51. // r2,r3,r29 (Scratch)
  52. // r32-r35 (Locals)
  53. // r36-r39 (Used to pass arguments to error handling routine)
  54. //
  55. // Predicate Registers: p6 - p10
  56. //
  57. //*********************************************************************
  58. //
  59. // IEEE Special Conditions:
  60. //
  61. // All faults and exceptions should be raised correctly.
  62. // Overflow can occur.
  63. // hypot(Infinity and anything) = +Infinity
  64. // hypot(QNaN and anything) = QNaN
  65. // hypot(SNaN and anything ) = QNaN
  66. //
  67. //*********************************************************************
  68. //
  69. // Implementation:
  70. // x2 = x * x in double-extended
  71. // y2 = y * y in double-extended
  72. // temp = x2 + y2 in double-extended
  73. // sqrt(temp) rounded to double
  74. //
  75. //*********************************************************************
  76. GR_SAVE_PFS = r33 // local register that holds ar.pfs while the error handler is called
  77. GR_SAVE_B0 = r34 // local register that holds the return address b0 across that call
  78. GR_SAVE_GP = r35 // local register that holds gp across that call
  79. GR_Parameter_X = r36 // output register: address of the x value stored on the stack
  80. GR_Parameter_Y = r37 // output register: address of the y value stored on the stack
  81. GR_Parameter_RESULT = r38 // output register: address of the result stored on the stack
  82. GR_Parameter_TAG = r39 // output register: error tag handed to the error handler
  83. FR_X = f32 // copy of the x input made at the start of _hypot
  84. FR_Y = f33 // copy of the y input made at the start of _hypot
  85. FR_RESULT = f8 // floating-point return value register
  86. .section .text
  87. .proc _cabs# // _cabs is an alias entry point: it has no instructions of its own
  88. .global _cabs#
  89. _cabs: // control continues into _hypot, which follows. NOTE(review): confirm that the .align 64 padding ahead of _hypot is filled with nops
  90. .endp _cabs
  91. .proc _hypot# // hypot(x,y) for doubles: x arrives in f8, y in f9, result is returned in f8
  92. .global _hypot#
  93. .align 64
  94. _hypot:
  95. {.mfi
  96. alloc r32= ar.pfs,0,4,4,0 // frame: 0 inputs, 4 locals (r32-r35), 4 outputs (r36-r39)
  97. // x2 = x*x, kept in f10
  98. fma.s1 f10=f8,f8,f0
  99. // r2=bias-1, the exponent field of 0.5 (moved into f7 below)
  100. mov r2=0xfffe
  101. }
  102. {.mfi
  103. // r3 = high 16 bits of single-precision 63/8 (shifted into place below)
  104. mov r3=0x40fc //0000
  105. // y2 = y*y, kept in f11
  106. fma.s1 f11=f9,f9,f0
  107. // r29 = high 20 bits of single-precision 429/16 (shifted into place below)
  108. mov r29=0x41d68;; //000
  109. }
  110. { .mfi
  111. nop.m 0
  112. // Check if x is an Inf - if so return Inf even
  113. // if y is a NaN (C9X). p7 = x is Inf, p6 = x is not Inf
  114. fclass.m.unc p7, p6 = f8, 0x023
  115. shl r3=r3,16 // r3 = 0x40fc0000
  116. }
  117. {.mfi
  118. nop.m 0
  119. // f32 (FR_X) = copy of x, read by the early return below and by the error region.
  120. // The s0 operation also sets Denormal, if necessary.
  121. // (p8) NOTE(review): stale predicate note - the fma below is unpredicated
  122. fma.d.s0 f32=f8,f1,f0
  123. nop.i 0;;
  124. }
  125. { .mfi
  126. nop.m 0
  127. // Check if y is an Inf - if so return Inf even
  128. // if x is a NaN (C9X). p8 = y is Inf, p9 = y is not Inf
  129. fclass.m.unc p8, p9 = f9, 0x023
  130. shl r29=r29,12 // r29 = 0x41d68000
  131. }
  132. { .mfb
  133. // f7=0.5
  134. setf.exp f7=r2
  135. // For x=inf, multiply y by 1 to raise invalid on y an SNaN (left disabled):
  136. // (p7) fma.s0 f9=f9,f1,f0
  137. // copy f9 to f33 (FR_Y); set Denormal, if necessary
  138. fma.d.s0 f33=f9,f1,f0
  139. nop.b 0;;
  140. }
  141. {.mfb
  142. // f13=63/8
  143. setf.s f13=r3
  144. // is y Zero ? (only tested when x is not Inf, p6 stays clear otherwise)
  145. (p6) fclass.m p6,p0=f9,0x7
  146. nop.b 0
  147. }
  148. {.mlx
  149. nop.m 0
  150. movl r2=0x408c0000;; // r2 = single-precision 35/8, moved into f9 below
  151. }
  152. {.mfi
  153. // f34=429/16
  154. setf.s f34=r29
  155. // is x Zero ? (only tested when y is not Inf, p9 stays clear otherwise)
  156. (p9) fclass.m p9,p0=f8,0x7
  157. // r3 = high 16 bits of single-precision 231/16 (shifted into place below)
  158. mov r3=0x4167;; //0000
  159. }
  160. {.mfi
  161. nop.m 0
  162. // a=x2+y2, kept in f12
  163. fma.s1 f12=f10,f1,f11
  164. nop.i 0;;
  165. }
  166. {.mfi
  167. nop.m 0
  168. // y not NaN ? (x is Zero here) - if so p8 selects the early return of |y|
  169. (p9) fclass.m p8,p0=f9,0x3f
  170. shl r3=r3,16 // r3 = 0x41670000
  171. }
  172. {.mfi
  173. nop.m 0
  174. // f6=2
  175. fma.s1 f6=f1,f1,f1
  176. nop.i 0;;
  177. }
  178. {.mfi
  179. nop.m 0
  180. // x not NaN ? (y is Zero here) - if so p7 selects the early return of |x|
  181. (p6) fclass.m p7,p0=f8,0x3f
  182. nop.i 0;;
  183. }
  184. {.mfi
  185. // f9=35/8 (overwrites the y input, its copy stays in f33)
  186. setf.s f9=r2
  187. nop.f 0
  188. // r2 = bias + 2*emax-2, the exponent threshold compared against a further down
  189. mov r2=0x107fb;;
  190. }
  191. {.mfb
  192. nop.m 0
  193. // if f8=Infinity or f9=Zero, return |f8| (taken from the copy in f32)
  194. (p7) fmerge.s f8=f0,f32
  195. (p7) br.ret.spnt b0
  196. }
  197. {.mfb
  198. nop.m 0
  199. // if f9=Infinity or f8=Zero, return |f9| (taken from the copy in f33)
  200. (p8) fmerge.s f8=f0,f33
  201. (p8) br.ret.spnt b0;;
  202. }
  203. {.mfi
  204. // f10 =231/16 (x2 was already consumed when a was formed)
  205. setf.s f10=r3
  206. // z0=frsqrta(a), written to f8. p6 guards the refinement steps below
  207. frsqrta.s1 f8,p6=f12
  208. nop.i 0;;
  209. }
  210. { .mfi
  211. nop.m 0
  212. // Identify Natvals, Infs, NaNs, and Zeros in a
  213. // and return a itself as the result in those cases
  214. fclass.m.unc p7, p0 = f12, 0x1E7
  215. nop.i 0;;
  216. }
  217. {.mfb
  218. // get exponent of x^2+y^2 (used by the overflow pre-check below)
  219. getf.exp r3=f12
  220. // if special case, set f8
  221. (p7) mov f8=f12
  222. (p7) br.ret.spnt b0;;
  223. }
  224. {.mfi
  225. nop.m 0
  226. // S0=a*z0
  227. (p6) fma.s1 f14=f12,f8,f0
  228. nop.i 0
  229. }
  230. {.mfi
  231. nop.m 0
  232. // H0=0.5*z0
  233. (p6) fma.s1 f15=f8,f7,f0
  234. nop.i 0;;
  235. }
  236. {.mfi
  237. nop.m 0
  238. // f6=5/2 (0.5*1 + 2)
  239. fma.s1 f6=f7,f1,f6
  240. nop.i 0
  241. }
  242. {.mfi
  243. nop.m 0
  244. // f11=3/2 (0.5*1 + 1), y2 is no longer needed
  245. fma.s1 f11=f7,f1,f1
  246. nop.i 0;;
  247. }
  248. {.mfi
  249. nop.m 0
  250. // d=0.5-S0*H0 (f7 holds d from here on instead of 0.5)
  251. (p6) fnma.s1 f7=f14,f15,f7
  252. nop.i 0;;
  253. }
  254. {.mfi
  255. nop.m 0
  256. // P67=231/16+429/16*d
  257. (p6) fma.s1 f10=f34,f7,f10
  258. nop.i 0
  259. }
  260. {.mfi
  261. nop.m 0
  262. // P45=63/8*d+35/8
  263. (p6) fma.s1 f9=f13,f7,f9
  264. nop.i 0;;
  265. }
  266. {.mfi
  267. nop.m 0
  268. // P23=5/2*d+3/2
  269. (p6) fma.s1 f11=f6,f7,f11
  270. nop.i 0
  271. }
  272. {.mfi
  273. nop.m 0
  274. // d2=d*d (f13 is reused, 63/8 was consumed by P45)
  275. (p6) fma.s1 f13=f7,f7,f0
  276. nop.i 0;;
  277. }
  278. {.mfi
  279. nop.m 0
  280. // P47=d2*P67+P45
  281. (p6) fma.s1 f10=f10,f13,f9
  282. nop.i 0
  283. }
  284. {.mfi
  285. nop.m 0
  286. // P13=d*P23+1
  287. (p6) fma.s1 f11=f11,f7,f1
  288. nop.i 0;;
  289. }
  290. {.mfi
  291. nop.m 0
  292. // d3=d2*d
  293. (p6) fma.s1 f13=f13,f7,f0
  294. nop.i 0;;
  295. }
  296. {.mfi
  297. nop.m 0
  298. // T0=d*S0 (f15 is reused, H0 was consumed by d)
  299. (p6) fma.s1 f15=f7,f14,f0
  300. nop.i 0
  301. }
  302. {.mfi
  303. // Is x^2 + y^2 well less than the overflow
  304. // threshold? p7 = yes (safe), p8 = no (possible overflow)
  305. (p6) cmp.lt.unc p7, p8 = r3,r2
  306. // P=P13+d3*P47
  307. (p6) fma.s1 f10=f13,f10,f11
  308. nop.i 0;;
  309. }
  310. {.mfb
  311. nop.m 0
  312. // S=P*T0+S0, the final result rounded to double under status field s0
  313. fma.d.s0 f8=f10,f15,f14
  314. // No overflow in this case
  315. (p7) br.ret.sptk b0;;
  316. }
  317. { .mfi
  318. nop.m 0
  319. (p8) fsetc.s2 0x7F,0x42
  320. // Possible overflow path, must detect by
  321. // Setting widest range exponent with prevailing
  322. // rounding mode.
  323. nop.i 0 ;;
  324. }
  325. { .mfi
  326. // bias+0x400 (bias+EMAX+1)
  327. (p8) mov r2=0x103ff
  328. // S=P*T0+S0 recomputed under status field s2 into f12 for the overflow test
  329. (p8) fma.d.s2 f12=f10,f15,f14
  330. nop.i 0 ;;
  331. }
  332. { .mfi
  333. (p8) setf.exp f11 = r2 // f11 = the power of two with exponent field bias+EMAX+1
  334. (p8) fsetc.s2 0x7F,0x40
  335. // Restore Original Mode in S2
  336. nop.i 0 ;;
  337. }
  338. { .mfi
  339. nop.m 0
  340. (p8) fcmp.lt.unc.s1 p9, p10 = f12, f11 // p9 = wide-range result is below the overflow bound
  341. nop.i 0 ;;
  342. }
  343. { .mib
  344. nop.m 0
  345. mov GR_Parameter_TAG = 46 // error tag for the handler, only used if the branch below is not taken
  346. // No overflow
  347. (p9) br.ret.sptk b0;;
  348. }
  349. .endp // the overflow case falls through into __libm_error_region
  350. .proc __libm_error_region // reached by fall-through from _hypot when the result overflows
  351. __libm_error_region:
  352. .prologue
  353. { .mfi
  354. add GR_Parameter_Y=-32,sp // Parameter 2 slot: old sp-32, which is new sp+32 after the frame is created
  355. nop.f 0
  356. .save ar.pfs,GR_SAVE_PFS
  357. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  358. }
  359. { .mfi
  360. .fframe 64
  361. add sp=-64,sp // Create new 64-byte stack frame
  362. nop.f 0
  363. mov GR_SAVE_GP=gp // Save gp
  364. };;
  365. { .mmi
  366. stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 (y) at sp+32, then advance the pointer to sp+48
  367. add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
  368. .save b0, GR_SAVE_B0
  369. mov GR_SAVE_B0=b0 // Save b0
  370. };;
  371. .body
  372. { .mib
  373. stfd [GR_Parameter_X] = FR_X // Store Parameter 1 (x) on stack
  374. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
  375. nop.b 0
  376. }
  377. { .mib
  378. stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 (the overflowed result) at sp+48
  379. add GR_Parameter_Y = -16,GR_Parameter_Y // Point Parameter 2 back at the stored y (sp+32)
  380. br.call.sptk b0=__libm_error_support# // Call error handling function
  381. };;
  382. { .mmi
  383. nop.m 0
  384. nop.m 0
  385. add GR_Parameter_RESULT = 48,sp // Recompute the result address, the call may have overwritten this register
  386. };;
  387. { .mmi
  388. ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
  389. .restore
  390. add sp = 64,sp // Restore stack pointer
  391. mov b0 = GR_SAVE_B0 // Restore return address
  392. };;
  393. { .mib
  394. mov gp = GR_SAVE_GP // Restore gp
  395. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  396. br.ret.sptk b0 // Return to the caller of _hypot
  397. };;
  398. .endp
  399. .type __libm_error_support#,@function // the error handler called from __libm_error_region is a function symbol
  400. .global __libm_error_support# // it is not defined in the visible part of this file