Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

333 lines
8.8 KiB

  1. .file "sqrt.s"
  2. // Copyright (c) 2000, 2001, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. //********************************************************************
  27. // History
  28. //********************************************************************
  29. // 2/02/00 Initial version
  30. // 4/04/00 Unwind support added
  31. // 8/15/00 Bundle added after call to __libm_error_support to properly
  32. // set [the previously overwritten] GR_Parameter_RESULT.
  33. //
  34. //********************************************************************
  35. //
  36. // Function: Combined sqrt(x), where
  37. //                        _
  38. //             sqrt(x) = |x  (square root of x), for double precision x values
  39. //
  40. //********************************************************************
  41. //
  42. // Accuracy: Correctly Rounded
  43. //
  44. //********************************************************************
  45. //
  46. // Resources Used:
  47. //
  48. // Floating-Point Registers: f8 (Input and Return Value)
  49. // f6-f7, f9-f15 (Temporaries; f15 keeps a copy of the input for error reporting)
  50. //
  51. // General Purpose Registers:
  52. // r2-r3 (Used to stage floating-point constants), r32-r36 (Locals)
  53. // r37-r40 (Used to pass arguments to error handling routine)
  54. //
  55. // Predicate Registers: p6, p7, p8
  56. //
  57. //*********************************************************************
  58. //
  59. // IEEE Special Conditions:
  60. //
  61. // All faults and exceptions should be raised correctly.
  62. // sqrt(QNaN) = QNaN
  63. // sqrt(SNaN) = QNaN
  64. // sqrt(+/-0) = +/-0
  65. // sqrt(negative) = QNaN and error handling is called
  66. //
  67. //*********************************************************************
  68. //
  69. // Implementation:
  70. //
  71. // Modified Newton-Raphson Algorithm
  72. //
  73. //*********************************************************************
  74. GR_SAVE_PFS = r33 // local register: holds ar.pfs across the call to __libm_error_support
  75. GR_SAVE_B0 = r34 // local register: holds the return branch register b0 across that call
  76. GR_SAVE_GP = r35 // local register: holds gp across that call
  77. GR_Parameter_X = r37 // output register: address of the stack slot holding Parameter 1 (the argument)
  78. GR_Parameter_Y = r38 // output register: address of the stack slot holding Parameter 2 (stored as f0)
  79. GR_Parameter_RESULT = r39 // output register: address of the stack slot holding Parameter 3 (the result)
  80. .section .text
  81. .proc sqrt# // double-precision square root: argument arrives in f8, result is returned in f8
  82. .global sqrt#
  83. .align 64
  84. sqrt:
  85. { .mfi
  86. alloc r32= ar.pfs,0,5,4,0 // frame: 0 inputs, 5 locals (r32-r36), 4 outputs (r37-r40); previous ar.pfs -> r32
  87. frsqrta.s0 f7,p6=f8 // Step (1): y0 ~ 1/sqrt(a) in f7; p6 set => run the refinement below, p6 clear => f7 is returned as-is
  88. nop.i 0
  89. } { .mlx
  90. // BEGIN DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
  91. nop.m 0
  92. // r2 = exponent field encoding +1/2 (consumed by setf.exp below)
  93. movl r2 = 0x0fffe;;
  94. } { .mmi
  95. // +1/2 in f9
  96. setf.exp f9 = r2
  97. nop.m 0
  98. nop.i 0
  99. } { .mlx
  100. nop.m 0
  101. // r3 = single-precision bit pattern of 3/2
  102. movl r3=0x3fc00000;;
  103. } { .mfi
  104. setf.s f10=r3 // f10 = 3/2
  105. // Step (1) itself is the frsqrta in the first bundle:
  106. // y0 = 1/sqrt(a) in f7 (a is the input x in f8)
  107. fclass.m.unc p7,p8 = f8,0x3A // p7 = x matches class mask 0x3A, p8 = it does not; NOTE(review): 0x3A looks like neg|unnormal|normal|inf, i.e. negative non-zero x - confirm
  108. nop.i 0;;
  109. } { .mlx
  110. nop.m 0
  111. // r2 = single-precision bit pattern of 5/2
  112. movl r2 = 0x40200000
  113. } { .mlx
  114. nop.m 0
  115. // r3 = single-precision bit pattern of 63/8
  116. movl r3 = 0x40fc0000;;
  117. } { .mfi
  118. setf.s f11=r2 // f11 = 5/2
  119. // Step (2)
  120. // h = +1/2 * y0 in f6
  121. (p6) fma.s1 f6=f9,f7,f0
  122. nop.i 0
  123. } { .mfi
  124. setf.s f12=r3 // f12 = 63/8
  125. // Step (3)
  126. // g = a * y0 in f7
  127. (p6) fma.s1 f7=f8,f7,f0
  128. nop.i 0
  129. } { .mfi
  130. nop.m 0
  131. mov f15 = f8 // keep the original argument; __libm_error_region stores f15 as Parameter 1
  132. nop.i 0;;
  133. } { .mlx
  134. nop.m 0
  135. // r2 = single-precision bit pattern of 231/16
  136. movl r2 = 0x41670000;;
  137. } { .mfi
  138. setf.s f13=r2 // f13 = 231/16
  139. // Step (4)
  140. // e = 1/2 - g * h in f9
  141. (p6) fnma.s1 f9=f7,f6,f9
  142. nop.i 0
  143. } { .mlx
  144. nop.m 0
  145. // r3 = single-precision bit pattern of 35/8
  146. movl r3 = 0x408c0000;;
  147. } { .mfi
  148. setf.s f14=r3 // f14 = 35/8
  149. // Step (5)
  150. // S = 3/2 + 5/2 * e in f10
  151. (p6) fma.s1 f10=f11,f9,f10
  152. nop.i 0
  153. } { .mfi
  154. nop.m 0
  155. // Step (6)
  156. // e2 = e * e in f11 (overwrites the 5/2 constant, already consumed by Step (5))
  157. (p6) fma.s1 f11=f9,f9,f0
  158. nop.i 0;;
  159. } { .mfi
  160. nop.m 0
  161. // Step (7)
  162. // t = 63/8 + 231/16 * e in f12
  163. (p6) fma.s1 f12=f13,f9,f12
  164. nop.i 0;;
  165. } { .mfi
  166. nop.m 0
  167. // Step (8)
  168. // S1 = e + e2 * S in f10
  169. (p6) fma.s1 f10=f11,f10,f9
  170. nop.i 0
  171. } { .mfi
  172. nop.m 0
  173. // Step (9)
  174. // e4 = e2 * e2 in f11
  175. (p6) fma.s1 f11=f11,f11,f0
  176. nop.i 0;;
  177. } { .mfi
  178. nop.m 0
  179. // Step (10)
  180. // t1 = 35/8 + e * t in f9 (e is no longer needed after this step)
  181. (p6) fma.s1 f9=f9,f12,f14
  182. nop.i 0;;
  183. } { .mfi
  184. nop.m 0
  185. // Step (11)
  186. // G = g + S1 * g in f12
  187. (p6) fma.s1 f12=f10,f7,f7
  188. nop.i 0
  189. } { .mfi
  190. nop.m 0
  191. // Step (12)
  192. // E = g * e4 in f7
  193. (p6) fma.s1 f7=f7,f11,f0
  194. nop.i 0;;
  195. } { .mfi
  196. nop.m 0
  197. // Step (13)
  198. // u = S1 + e4 * t1 in f10
  199. (p6) fma.s1 f10=f11,f9,f10
  200. nop.i 0;;
  201. } { .mfi
  202. nop.m 0
  203. // Step (14)
  204. // g1 = G + t1 * E in f7 (computed with the .d completer, unlike the preceding steps)
  205. (p6) fma.d.s1 f7=f9,f7,f12
  206. nop.i 0;;
  207. } { .mfi
  208. nop.m 0
  209. // Step (15)
  210. // h1 = h + u * h in f6
  211. (p6) fma.s1 f6=f10,f6,f6
  212. nop.i 0;;
  213. } { .mfi
  214. nop.m 0
  215. // Step (16)
  216. // d = a - g1 * g1 in f9
  217. (p6) fnma.s1 f9=f7,f7,f8
  218. nop.i 0;;
  219. } { .mfb
  220. nop.m 0
  221. // Step (17)
  222. // g2 = g1 + d * h1 in f8 (the return value; only step of the refinement that uses status field s0)
  223. (p6) fma.d.s0 f8=f9,f6,f7
  224. (p6) br.ret.sptk b0 ;; // refinement path finished: return with sqrt(x) in f8
  225. }
  226. { .mfb
  227. nop.m 0
  228. mov f8 = f7 // reached only when p6 is clear: return the value frsqrta left in f7
  229. (p8) br.ret.sptk b0 ;; // x did not match class mask 0x3A: plain return, no error reporting
  230. }
  231. { .mfb
  232. (p7) mov r40 = 49 // r40 (fourth output register) = 49; presumably the sqrt error tag for __libm_error_support - confirm
  233. nop.f 0
  234. (p7) br.cond.sptk __libm_error_region ;; // x matched mask 0x3A: hand over to the error path, which returns to our caller
  235. }
  236. // END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
  237. .endp sqrt#
  238. // Stack operations when calling error support.
  239. //           (1)               (2)               (3) (call)                  (4)
  240. //    sp   -> +          psp -> +              psp -> +                 sp -> +
  241. //            |                 |                     |                       |
  242. //            |                 | <- GR_Y        R3 ->| <- GR_RESULT          | -> f8
  243. //            |                 |                     |                       |
  244. //            | <-GR_Y      Y2->|                Y2 ->| <- GR_Y               |
  245. //            |                 |                     |                       |
  246. //            |                 | <- GR_X        X1 ->|                       |
  247. //            |                 |                     |                       |
  248. //   sp-64 -> +           sp -> +               sp -> +                       +
  249. //     save ar.pfs          save b0                                     restore gp
  250. //     save gp                                                          restore ar.pfs
  251. .proc __libm_error_region
  252. __libm_error_region:
  253. // Error path of sqrt: reached only through the (p7) branch at the end of sqrt,
  254. // i.e. when the argument matched the fclass mask 0x3A. On entry f15 holds the original
  255. // argument, f8 the value produced by frsqrta, and r40 the tag value 49 set by sqrt.
  256. // The routine returns directly to sqrt's caller (b0 is unchanged on entry).
  257. .prologue
  258. // Build a 64-byte frame holding Parameter 1 (argument), Parameter 2 (f0) and Parameter 3 (result),
  259. // pass their addresses in r37-r39 to __libm_error_support, then reload f8 from the result slot.
  260. // (1)
  261. { .mfi
  262. add GR_Parameter_Y=-32,sp // Parameter 2 slot: 32 bytes below the incoming sp
  263. nop.f 0
  264. .save ar.pfs,GR_SAVE_PFS
  265. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  266. }
  267. { .mfi
  268. .fframe 64
  269. add sp=-64,sp // Create new 64-byte stack frame
  270. nop.f 0
  271. mov GR_SAVE_GP=gp // Save gp
  272. };;
  273. // (2)
  274. { .mmi
  275. stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 (f0) on stack; pointer post-increments by 16 to the result slot
  276. add GR_Parameter_X = 16,sp // Parameter 1 address: new sp + 16
  277. .save b0, GR_SAVE_B0
  278. mov GR_SAVE_B0=b0 // Save b0
  279. };;
  280. .body
  281. // (3)
  282. { .mib
  283. stfd [GR_Parameter_X] = f15 // STORE Parameter 1 on stack (original argument saved by sqrt)
  284. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address: the slot GR_Parameter_Y was advanced to in (2)
  285. nop.b 0
  286. }
  287. { .mib
  288. stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack (value frsqrta produced)
  289. add GR_Parameter_Y = -16,GR_Parameter_Y // point GR_Parameter_Y back at the Parameter 2 slot
  290. br.call.sptk b0=__libm_error_support# // Call error handling function
  291. };;
  292. { .mmi
  293. nop.m 0
  294. nop.m 0
  295. add GR_Parameter_RESULT = 48,sp // recompute the result slot address (new sp + 48) after the call instead of relying on r39
  296. };;
  297. // (4)
  298. { .mmi
  299. ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
  300. .restore
  301. add sp = 64,sp // Restore stack pointer
  302. mov b0 = GR_SAVE_B0 // Restore return address
  303. };;
  304. { .mib
  305. mov gp = GR_SAVE_GP // Restore gp
  306. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  307. br.ret.sptk b0 // Return to the caller of sqrt
  308. };;
  309. .endp __libm_error_region
  310. .type __libm_error_support#,@function // the error handler called from __libm_error_region is a function symbol
  311. .global __libm_error_support# // it is not defined in the visible part of this file