Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

251 lines
6.5 KiB

  1. .file "sqrtf.s"
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
  6. // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
  7. //
  8. // WARRANTY DISCLAIMER
  9. //
  10. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  11. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  12. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  13. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  14. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  15. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  16. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  17. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  18. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  19. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  20. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  21. //
  22. // Intel Corporation is the author of this code, and requests that all
  23. // problem reports or change requests be submitted to it directly at
  24. // http://developer.intel.com/opensource.
  25. //
  26. //*********************************************************************
  27. // History:
  28. //
  29. // 2/02/00 Initial version
  30. // 4/04/00 Unwind support added
  31. // 8/15/00 Bundle added after call to __libm_error_support to properly
  32. // set [the previously overwritten] GR_Parameter_RESULT.
  33. //
  34. //*********************************************************************
  35. //
  36. // Function: Combined sqrtf(x), where
  37. //
  38. // sqrtf(x) = sqrt(x), the square root of x, for single precision x values
  39. //
  40. //********************************************************************
  41. //
  42. // Accuracy: Correctly Rounded
  43. //
  44. //********************************************************************
  45. //
  46. // Resources Used:
  47. //
  48. // Floating-Point Registers: f8 (Input and Return Value)
  49. // f7, f9-f13 (scratch; this header originally listed f7 -f14, but f14 is not referenced in the code shown)
  50. //
  51. // General Purpose Registers:
  52. // r32-r36 (Locals)
  53. // r37-r40 (Used to pass arguments to error handling routine)
  54. //
  55. // Predicate Registers: p6, p7, p8
  56. //
  57. //********************************************************************
  58. //
  59. // IEEE Special Conditions:
  60. //
  61. // All faults and exceptions should be raised correctly.
  62. // sqrtf(QNaN) = QNaN
  63. // sqrtf(SNaN) = QNaN
  64. // sqrtf(+/-0) = +/-0
  65. // sqrtf(negative) = QNaN and error handling is called
  66. //
  67. //********************************************************************
  68. //
  69. // Implementation:
  70. //
  71. // Modified Newton-Raphson Algorithm
  72. //
  73. //********************************************************************
  74. GR_SAVE_B0 = r34 // local register: holds b0 (return address) across the error-support call
  75. GR_SAVE_PFS = r33 // local register: holds ar.pfs across the error-support call
  76. GR_SAVE_GP = r35 // local register: holds gp across the error-support call
  77. GR_Parameter_X = r37 // output register: address of the stored parameter 1 (x)
  78. GR_Parameter_Y = r38 // output register: address of the stored parameter 2
  79. GR_Parameter_RESULT = r39 // output register: address of the stored result
  80. GR_Parameter_TAG = r40 // output register: error tag passed to __libm_error_support
  81. FR_X = f13 // copy of the input x, made early in sqrtf
  82. FR_Y = f0 // value stored as parameter 2; presumably a placeholder since sqrtf takes one argument
  83. FR_RESULT = f8 // result register (also the input register)
  84. .section .text // sqrtf: single-precision square root; input x in f8, result returned in f8
  85. .proc sqrtf#
  86. .global sqrtf#
  87. .align 64
  88. sqrtf:
  89. { .mlx
  90. // BEGIN SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
  91. alloc r32= ar.pfs,0,5,4,0 // 0 inputs, 5 locals (r32-r36), 4 outputs (r37-r40), 0 rotating
  92. // exponent of +1/2 in r2
  93. movl r2 = 0x0fffe
  94. } { .mfi
  95. // +1/2 in f12 (written by the setf.exp in the next bundle)
  96. nop.m 0
  97. frsqrta.s0 f7,p6=f8 // f7 = initial approximation y0; p6 predicates every iteration step below
  98. nop.i 0;;
  99. } { .mfi
  100. setf.exp f12 = r2
  101. // Step (1)
  102. // y0 = 1/sqrt(a) in f7 (produced by the frsqrta above)
  103. fclass.m.unc p7,p8 = f8,0x3A // p7 = x is in class 0x3A (the negative inputs sent to error handling), p8 = otherwise
  104. nop.i 0
  105. } { .mfi
  106. nop.m 0
  107. // Keep a copy of x in f13 (FR_X); the error region stores it as parameter 1
  108. mov f13 = f8
  109. nop.i 0;;
  110. } { .mfi
  111. nop.m 0
  112. // Step (2)
  113. // H0 = 1/2 * y0 in f9
  114. (p6) fma.s1 f9=f12,f7,f0
  115. nop.i 0
  116. } { .mfi
  117. nop.m 0
  118. // Step (3)
  119. // S0 = a * y0 in f7
  120. (p6) fma.s1 f7=f8,f7,f0
  121. nop.i 0;;
  122. } { .mfi
  123. nop.m 0
  124. // Step (4)
  125. // d = 1/2 - S0 * H0 in f10
  126. (p6) fnma.s1 f10=f7,f9,f12
  127. nop.i 0
  128. } { .mfi
  129. nop.m 0
  130. // Step (0'')
  131. // 3/2 = 1 + 1/2 in f12 (same instruction group as Step (4), which still reads the old 1/2)
  132. (p6) fma.s1 f12=f12,f1,f1
  133. nop.i 0;;
  134. } { .mfi
  135. nop.m 0
  136. // Step (5)
  137. // e = 1 + 3/2 * d in f12
  138. (p6) fma.s1 f12=f12,f10,f1
  139. nop.i 0
  140. } { .mfi
  141. nop.m 0
  142. // Step (6)
  143. // T0 = d * S0 in f11
  144. (p6) fma.s1 f11=f10,f7,f0
  145. nop.i 0;;
  146. } { .mfi
  147. nop.m 0
  148. // Step (7)
  149. // G0 = d * H0 in f10
  150. (p6) fma.s1 f10=f10,f9,f0
  151. nop.i 0;;
  152. } { .mfi
  153. nop.m 0
  154. // Step (8)
  155. // S1 = S0 + e * T0 in f7 (computed with the .s single-precision completer)
  156. (p6) fma.s.s1 f7=f12,f11,f7
  157. nop.i 0;;
  158. } { .mfi
  159. nop.m 0
  160. // Step (9)
  161. // H1 = H0 + e * G0 in f12
  162. (p6) fma.s1 f12=f12,f10,f9
  163. nop.i 0;;
  164. } { .mfi
  165. nop.m 0
  166. // Step (10)
  167. // d1 = a - S1 * S1 in f9
  168. (p6) fnma.s1 f9=f7,f7,f8
  169. nop.i 0;;; // NOTE(review): three semicolons here, every other stop in this file is ';;' - confirm intended
  170. } { .mfb
  171. nop.m 0
  172. // Step (11)
  173. // S = S1 + d1 * H1 in f8 (final result; uses status field s0, earlier steps use s1)
  174. (p6) fma.s.s0 f8=f9,f12,f7
  175. (p6) br.ret.sptk b0 ;; // Normal return when the iteration ran (p6 set)
  176. // END SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
  177. } { .mfb
  178. nop.m 0
  179. (p0) mov f8 = f7 // Only reached when p6 is clear: the result is the frsqrta output in f7
  180. (p8) br.ret.sptk b0 ;; // Return unless x is in class 0x3A; otherwise fall through into __libm_error_region
  181. }
  182. //
  183. // This branch includes all those special values that are not negative,
  184. // with the result equal to frsqrta(x); negative inputs fall through to the error region below
  185. //
  186. .endp sqrtf
  187. .proc __libm_error_region // Error path: entered by fall-through from sqrtf; stores x, parameter 2 and the result on the stack, calls __libm_error_support, reloads f8
  188. __libm_error_region:
  189. .prologue
  190. { .mii
  191. add GR_Parameter_Y=-32,sp // Parameter 2 address (incoming sp - 32, i.e. new sp + 32)
  192. (p0) mov GR_Parameter_TAG = 50 // Error tag passed in r40; presumably the sqrtf negative-argument code - confirm against __libm_error_support
  193. .save ar.pfs,GR_SAVE_PFS
  194. mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
  195. }
  196. { .mfi
  197. .fframe 64
  198. add sp=-64,sp // Create new stack
  199. nop.f 0
  200. mov GR_SAVE_GP=gp // Save gp
  201. };;
  202. { .mmi
  203. stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack; post-increment leaves the pointer at sp + 48
  204. add GR_Parameter_X = 16,sp // Parameter 1 address
  205. .save b0, GR_SAVE_B0
  206. mov GR_SAVE_B0=b0 // Save b0
  207. };;
  208. .body
  209. { .mib
  210. stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
  211. add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp + 48)
  212. nop.b 0
  213. }
  214. { .mib
  215. stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
  216. add GR_Parameter_Y = -16,GR_Parameter_Y // Step back to the Parameter 2 address (sp + 32)
  217. br.call.sptk b0=__libm_error_support# // Call error handling function
  218. };;
  219. { .mmi
  220. nop.m 0
  221. nop.m 0
  222. add GR_Parameter_RESULT = 48,sp // Recompute the result address; the register is overwritten across the call (see History, 8/15/00)
  223. };;
  224. { .mmi
  225. ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
  226. .restore
  227. add sp = 64,sp // Restore stack pointer
  228. mov b0 = GR_SAVE_B0 // Restore return address
  229. };;
  230. { .mib
  231. mov gp = GR_SAVE_GP // Restore gp
  232. mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
  233. br.ret.sptk b0 // Return
  234. };;
  235. .endp __libm_error_region
  236. .type __libm_error_support#,@function
  237. .global __libm_error_support#