Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

503 lines
13 KiB

  1. ;//
  2. ;// INTEL CORPORATION PROPRIETARY INFORMATION
  3. ;// This software is supplied under the terms of a license agreement or
  4. ;// nondisclosure agreement with Intel Corporation and may not be copied
  5. ;// or disclosed except in accordance with the terms of that agreement.
  6. ;// Copyright (c) 2000 Intel Corporation. All Rights Reserved.
  7. ;//
  8. ;//
  9. ; exp_wmt.asm
  10. ;
  11. ; double exp(double);
  12. ;
  13. ; Initial version: 11/30/2000
  14. ;
  15. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  16. ;; This is a new version using just one table. Reduction by log2/64 ;;
  17. ;; A non-standard table is used. Normally, we store T,t where ;;
  18. ;; T+t = exp(j*log(2)/64) to high precision. This implementation ;;
  19. ;; stores T,d where d = t/T. This shortens the latency by 1 FP op. ;;
  20. ;; This version uses two tricks from Andrey. First, we merge two ;;
  21. ;; integer-based tests for exception filtering into 1. Second, instead ;;
  22. ;; of using sign(X)2^52 as a shifter, we use S = 2^52 * 1.10000..000 ;;
  23. ;; as the shifter. This will give bit pattern of the 2's complement of ;;
  24. ;; N in trailing bits of S + W, W = X * 64/log2. ;;
  25. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  26. .686P
  27. .387
  28. .XMM
  29. .MODEL FLAT,C
  30. EXTRN C __libm_error_support : NEAR
  31. CONST SEGMENT PARA PUBLIC USE32 'CONST' ; read-only constants; every item below up to the table is a 16-byte pair
  32. ALIGN 16
  33. smask DQ 8000000000000000H, 8000000000000000H ; mask to get sign bit (not referenced by the code visible in this file)
  34. emask DQ 0FFF0000000000000H, 0FFF0000000000000H ; sign + exponent field mask; used with pandn in ADJUST to keep only the mantissa bits of T
  35. mmask DQ 00000000FFFFFFC0H, 00000000FFFFFFC0H ; keep the low dword of N and mask off its bottom 6 bits (the table index j)
  36. bias DQ 000000000000FFC0H, 000000000000FFC0H ; 1023 shifted left 6 bits, so it lines up with the masked N before psllq 46
  37. Shifter DQ 4338000000000000H, 4338000000000000H ; 2^52+2^51|2^52+2^51
  38. twom60 DQ 3C30000000000000H, 3C30000000000000H ; 2^(-60) (not referenced by the code visible in this file)
  39. cv DQ 40571547652b82feH, 40571547652b82feH ; invL|invL (multiplier that forms W = X * 64/log2, see file header)
  40. DQ 3F862E42FEFA0000H, 3F862E42FEFA0000H ; log2_hi|log2_hi (loaded from cv+16 as the L_hi pair)
  41. DQ 3D1CF79ABC9E3B3AH, 3D1CF79ABC9E3B3AH ; log2_lo|log2_lo (loaded from cv+32 as the L_lo pair)
  42. DQ 3F811074B1D108E5H, 3FC555555566A45AH ; p2|p4 written as [high|low]: the first DQ value is the LOW qword (p4), the second the HIGH qword (p2); loaded from cv+48
  43. DQ 3FA5555726ECED80H, 3FDFFFFFFFFFE17BH ; p1|p3 written as [high|low]: the first DQ value is the LOW qword (p3), the second the HIGH qword (p1); loaded from cv+64
  44. ;-------Table d, T so that movapd gives [ T | d ]: each 16-byte entry holds d in the low qword and T in the high qword
  45. ;-------Note that the exponent field of T is set to 000. Entry j is fetched from [ecx+Tbl_addr] with ecx = j*16, j = N AND 3FH (64 entries)
  46. Tbl_addr DQ 0000000000000000H, 0000000000000000H ; j = 0
  47. DQ 3CAD7BBF0E03754DH, 00002C9A3E778060H
  48. DQ 3C8CD2523567F613H, 000059B0D3158574H
  49. DQ 3C60F74E61E6C861H, 0000874518759BC8H
  50. DQ 3C979AA65D837B6CH, 0000B5586CF9890FH
  51. DQ 3C3EBE3D702F9CD1H, 0000E3EC32D3D1A2H
  52. DQ 3CA3516E1E63BCD8H, 00011301D0125B50H
  53. DQ 3CA4C55426F0387BH, 0001429AAEA92DDFH
  54. DQ 3CA9515362523FB6H, 000172B83C7D517AH
  55. DQ 3C8B898C3F1353BFH, 0001A35BEB6FCB75H
  56. DQ 3C9AECF73E3A2F5FH, 0001D4873168B9AAH
  57. DQ 3C8A6F4144A6C38DH, 0002063B88628CD6H
  58. DQ 3C968EFDE3A8A894H, 0002387A6E756238H
  59. DQ 3C80472B981FE7F2H, 00026B4565E27CDDH
  60. DQ 3C82F7E16D09AB31H, 00029E9DF51FDEE1H
  61. DQ 3C8B3782720C0AB3H, 0002D285A6E4030BH
  62. DQ 3C834D754DB0ABB6H, 000306FE0A31B715H
  63. DQ 3C8FDD395DD3F84AH, 00033C08B26416FFH
  64. DQ 3CA12F8CCC187D29H, 000371A7373AA9CAH
  65. DQ 3CA7D229738B5E8BH, 0003A7DB34E59FF6H
  66. DQ 3C859F48A72A4C6DH, 0003DEA64C123422H
  67. DQ 3CA8B846259D9205H, 0004160A21F72E29H
  68. DQ 3C4363ED60C2AC12H, 00044E086061892DH
  69. DQ 3C6ECCE1DAA10379H, 000486A2B5C13CD0H
  70. DQ 3C7690CEBB7AAFB0H, 0004BFDAD5362A27H
  71. DQ 3CA083CC9B282A09H, 0004F9B2769D2CA6H
  72. DQ 3CA509B0C1AAE707H, 0005342B569D4F81H
  73. DQ 3C93350518FDD78EH, 00056F4736B527DAH
  74. DQ 3C9063E1E21C5409H, 0005AB07DD485429H
  75. DQ 3C9432E62B64C035H, 0005E76F15AD2148H
  76. DQ 3CA0128499F08C0AH, 0006247EB03A5584H
  77. DQ 3C99F0870073DC06H, 0006623882552224H
  78. DQ 3C998D4D0DA05571H, 0006A09E667F3BCCH
  79. DQ 3CA52BB986CE4786H, 0006DFB23C651A2EH
  80. DQ 3CA32092206F0DABH, 00071F75E8EC5F73H
  81. DQ 3CA061228E17A7A6H, 00075FEB564267C8H
  82. DQ 3CA244AC461E9F86H, 0007A11473EB0186H
  83. DQ 3C65EBE1ABD66C55H, 0007E2F336CF4E62H
  84. DQ 3C96FE9FBBFF67D0H, 00082589994CCE12H
  85. DQ 3C951F1414C801DFH, 000868D99B4492ECH
  86. DQ 3C8DB72FC1F0EAB4H, 0008ACE5422AA0DBH
  87. DQ 3C7BF68359F35F44H, 0008F1AE99157736H
  88. DQ 3CA360BA9C06283CH, 00093737B0CDC5E4H
  89. DQ 3C95E8D120F962AAH, 00097D829FDE4E4FH
  90. DQ 3C71AFFC2B91CE27H, 0009C49182A3F090H
  91. DQ 3C9B6D34589A2EBDH, 000A0C667B5DE564H
  92. DQ 3C95277C9AB89880H, 000A5503B23E255CH
  93. DQ 3C8469846E735AB3H, 000A9E6B5579FDBFH
  94. DQ 3C8C1A7792CB3387H, 000AE89F995AD3ADH
  95. DQ 3CA22466DC2D1D96H, 000B33A2B84F15FAH
  96. DQ 3CA1112EB19505AEH, 000B7F76F2FB5E46H
  97. DQ 3C74FFD70A5FDDCDH, 000BCC1E904BC1D2H
  98. DQ 3C736EAE30AF0CB3H, 000C199BDD85529CH
  99. DQ 3C84E08FD10959ACH, 000C67F12E57D14BH
  100. DQ 3C676B2C6C921968H, 000CB720DCEF9069H
  101. DQ 3C93700936DF99B3H, 000D072D4A07897BH
  102. DQ 3C74A385A63D07A7H, 000D5818DCFBA487H
  103. DQ 3C8E5A50D5C192ACH, 000DA9E603DB3285H
  104. DQ 3C98BB731C4A9792H, 000DFC97337B9B5EH
  105. DQ 3C74B604603A88D3H, 000E502EE78B3FF6H
  106. DQ 3C916F2792094926H, 000EA4AFA2A490D9H
  107. DQ 3C8EC3BC41AA2008H, 000EFA1BEE615A27H
  108. DQ 3C8A64A931D185EEH, 000F50765B6E4540H
  109. DQ 3C77893B4D91CD9DH, 000FA7C1819E90D8H ; j = 63
  110. ONE_val DQ 3ff0000000000000H ; 1.0; added to a tiny x in RETURN_ONE
  111. EMIN DQ 0010000000000000H ; same bit pattern as XMIN (not referenced by the code visible in this file)
  112. MAX_ARG DQ 40862e42fefa39efH ; presumably the largest x for which exp(x) stays finite -- TODO confirm (not referenced by the code visible in this file)
  113. MIN_ARG DQ 0c086232bdd70000H ; NOTE(review): the leading 0 is only the MASM prefix for a literal starting with a letter, so this literal has 15 significant hex digits and the sign bit is NOT set; a digit was presumably dropped from a negative threshold near -1022*log(2) -- TODO confirm. Not referenced by the code visible in this file.
  114. INF DQ 7ff0000000000000H ; +infinity; returned for x = +inf
  115. ZERO DQ 0 ; +0.0; returned for x = -inf
  116. XMAX DQ 7fefffffffffffffH ; largest finite double; squared in OF_UF to produce an overflowed result
  117. XMIN DQ 0010000000000000H ; smallest normal double; squared in UF to produce an underflowed result
  118. Sm_Thres DQ 3C3000003C300000H ; DP 2^(-60): 3C300000H (the high dword of twom60) repeated in both dwords (not referenced by the code visible in this file)
  119. Del_Thres DQ 045764CA045764CAH ; DP 1080*log(2) - 2^(-60), hi part; the same dword repeated twice, presumably a difference of high dwords for an unsigned range test -- TODO confirm (not referenced by the code visible in this file)
  120. ALIGN 16
  121. CONST ENDS
  122. _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
  123. ALIGN 16
  124. PUBLIC _exp_pentium4, _CIexp_pentium4 ; two exported entry points into the same body
  125. _CIexp_pentium4 PROC NEAR ; entry that takes x from the x87 stack top and returns the result on the x87 stack
  126. push ebp
  127. mov ebp, esp ; frame pointer so that leave can undo the alignment below
  128. sub esp, 8 ; for argument DBLSIZE
  129. and esp, 0fffffff0h ; round esp down to a 16-byte boundary
  130. fstp qword ptr [esp] ; pop x from the x87 stack into the spill slot
  131. movq xmm0, qword ptr [esp] ; low half of xmm0 = x, as the shared body at start expects
  132. call start ; pushes a return address, so x now sits at [esp+4]/[esp+8] exactly as for the stack-argument entry
  133. leave ; esp <- ebp, pop ebp
  134. ret
  135. _exp_pentium4 label proc ; stack-argument entry: x (double) at [esp+4]; the result is returned on the x87 stack
  136. ; load *|x in XMM0 (only the low half is written here)
  137. movlpd xmm0, 4[esp]
  138. start: ; shared body: x is in the low half of xmm0 and its lo/hi dwords are at [esp+4]/[esp+8]
  139. unpcklpd xmm0,xmm0 ; xmm0 = [x|x]
  140. ; load Inv_L pair
  141. movapd xmm1, QWORD PTR [cv]
  142. ; load Shifter
  143. movapd xmm6, QWORD PTR [Shifter]
  144. ; load L_hi pair
  145. movapd xmm2, QWORD PTR [cv+16]
  146. ; load L_lo pair
  147. movapd xmm3, QWORD PTR [cv+32]
  148. pextrw eax, xmm0,3 ; eax = bits 48..63 of x: sign, 11 exponent bits, top 4 mantissa bits
  149. and eax,7FFFH ; drop the sign bit
  150. ; |x|>=2^{10} ? (i.e. 2^{10}-eps-|x|<0, so 408fH-eax goes negative)
  151. mov edx, 408fH
  152. sub edx, eax
  153. ; avoid underflow on intermediate calculations (|x|<2^{-54} ? then eax-3c90H goes negative)
  154. sub eax, 3c90H
  155. or edx, eax ; bit 31 is set if either difference is negative
  156. cmp edx, 80000000H
  157. ; small input or UF/OF (infinities and NaNs also leave through this branch)
  158. jae RETURN_ONE
  159. ; xmm1=Inv_L*x|Inv_L*x
  160. mulpd xmm1,xmm0
  161. ; xmm1=Inv_L*x+Shifter| Inv_L*x+Shifter
  162. addpd xmm1,xmm6
  163. ; xmm7 contains bit pattern of N (in the trailing bits of Shifter+W, see file header)
  164. movapd xmm7,xmm1
  165. ; xmm1=N
  166. subpd xmm1,xmm6
  167. ; xmm2=L_hi*round_to_int(Inv_L*x)|L_hi*round_to_int(Inv_L*x) ; N_L_hi
  168. mulpd xmm2,xmm1
  169. ; [p2|p4]
  170. MOVAPD xmm4,[cv+48]
  171. ; xmm3=L_lo*round_to_int(Inv_L*x)|L_lo*round_to_int(Inv_L*x) ; N_L_lo
  172. mulpd xmm3,xmm1
  173. ; [p1|p3]
  174. MOVAPD xmm5,[cv+64]
  175. ; xmm0=x-xmm2 ; R := X |-| N_L_hi
  176. subpd xmm0,xmm2
  177. ; set eax <-- n (low 32 bits of the shifted sum), ecx <--j (low 6 bits of n)
  178. movd eax,xmm7
  179. mov ecx,eax
  180. and ecx,0000003FH
  181. ; get offset for [T,d]: 16 bytes per table entry
  182. shl ecx,4
  183. ; eax,edx <-- m (arithmetic shift keeps the sign of n)
  184. sar eax,6
  185. mov edx,eax
  186. ; xmm0-=xmm3 ; R := R |-| N_L_lo
  187. subpd xmm0,xmm3
  188. ; xmm2 <- [T,d] (T in the high half, d in the low half)
  189. movapd xmm2,[ecx+Tbl_addr]
  190. ; xmm4=p2*R|p4*R
  191. mulpd xmm4,xmm0
  192. MOVAPD xmm1,xmm0 ; xmm1 = [R|R]
  193. MULPD xmm0,xmm0 ; xmm0 = [R^2|R^2]
  194. ; xmm5=p1+p2*R|p3+p4*R
  195. addpd xmm5,xmm4
  196. MULSD xmm0,xmm0 ; xmm0 = [R^2|R^4] (only the low half is squared again)
  197. ; get xmm1 <-- [R|R+d]
  198. addsd xmm1,xmm2
  199. ; xmm2 <-- [T|T]
  200. unpckhpd xmm2,xmm2
  201. ; xmm7 <-- exponent of 2^m: (n with j cleared) + (1023 shl 6), then shifted so m+1023 lands at bit 52
  202. movdqa xmm6,[mmask]
  203. pand xmm7,xmm6
  204. movdqa xmm6,[bias]
  205. paddq xmm7,xmm6
  206. psllq xmm7,46
  207. ; xmm0=[P_hi | P_lo] = [R^2*(p1+p2*R) | R^4*(p3+p4*R)]
  208. mulpd xmm0,xmm5
  209. ; xmm1 <- [R |d+R+P_lo]
  210. addsd xmm1,xmm0
  211. ; xmm2 is 2^m T (only valid when m passes the range test below)
  212. ORPD xmm2,xmm7
  213. ; xmm0 <- [P_hi | P_hi]
  214. unpckhpd xmm0,xmm0
  215. ; xmm0 <-- [P_hi | d+R+P ]
  216. addsd xmm0,xmm1
  217. ; make sure -894 <= m <= 1022
  218. ; before we use the exponent in xmm7
  219. ; test by unsigned comp of m+894 with 1022+894
  220. add edx,894
  221. cmp edx,1916
  222. ja ADJUST
  223. mulsd xmm0,xmm2 ; 2^m*T*(d+R+P)
  224. sub esp, 16 ; scratch for moving the result from xmm0 to the x87 stack
  225. addsd xmm0,xmm2 ; result = 2^m*T + 2^m*T*(d+R+P)
  226. movlpd QWORD PTR [esp+4], xmm0 ; return result
  227. fld QWORD PTR [esp+4] ; push the result onto the x87 stack
  228. add esp, 16
  229. ret
  230. ADJUST:
  231. ;---reached when m is outside [-894, 1022]; xmm0 contains [*| d+R+P]
  232. ;---xmm2 contains [*| T ] whose exponent field is not correct
  233. ;---eax still contains the correct m
  234. ;---so we split m into m1 and m2, m1+m2 = m. Make T with exponent 2^m1 by
  235. ;---integer manipulation, and multiply final result by 2^m2
  236. ; overflow or underflow
  237. sub esp,20 ; 16 bytes of scratch + the saved FPCW at [esp+16]; a multiple of 4 keeps esp dword-aligned
  238. fstcw WORD PTR [esp+16] ; save the caller's x87 control word
  239. mov dx,WORD PTR [esp+16]
  240. ; set pc=64 bits
  241. or dx,300H
  242. mov WORD PTR [esp],dx
  243. fldcw WORD PTR [esp]
  244. ; eax <-- m1 = m/2, edx <-- m2 = m - m1
  245. mov edx,eax
  246. sar eax,1
  247. sub edx,eax
  248. ; T with exponent field zeroed
  249. movdqa xmm6,[emask]
  250. pandn xmm6,xmm2 ; xmm6 = xmm2 AND NOT emask: only the mantissa bits of T remain
  251. add eax,1023 ; biased exponent of 2^m1
  252. movd xmm3,eax
  253. psllq xmm3,52
  254. ; xmm6=T*2^m1
  255. ORPD xmm6,xmm3
  256. add edx,1023 ; biased exponent of 2^m2
  257. movd xmm4,edx
  258. psllq xmm4,52 ; xmm4 = 2^m2
  259. ; load P on FP stack
  260. movlpd QWORD PTR [esp], xmm0
  261. fld QWORD PTR [esp]
  262. ; load T'=T*2^m1 on FP stack
  263. movlpd QWORD PTR [esp+8], xmm6
  264. fld QWORD PTR [esp+8]
  265. ; T'*P
  266. fmul st(1), st(0)
  267. ; T'+T'*P
  268. faddp st(1), st(0)
  269. ; load 2^m2 on FP stack
  270. movlpd QWORD PTR [esp], xmm4
  271. fld QWORD PTR [esp]
  272. ; final calculation: 2^m2*(T'+T'*P)
  273. fmulp st(1), st(0)
  274. ; store result in memory (rounded to double), then xmm0
  275. fstp QWORD PTR [esp]
  276. movlpd xmm0, QWORD PTR [esp]
  277. ; restore FPCW
  278. fldcw WORD PTR [esp+16]
  279. add esp,20
  280. ; classify the outcome from the exponent field of the stored double result
  285. pextrw ecx, xmm0, 3
  286. and ecx, 7ff0H ; isolate the 11 exponent bits
  287. cmp ecx, 7ff0H
  288. jae OVERFLOW ; exponent all ones: the result overflowed
  289. test ecx, ecx
  290. jz UNDERFLOW ; exponent zero: the result is zero or denormal
  291. jmp RETURN
  310. OVERFLOW:
  311. ;OF: report with error tag 14; xmm0 holds the value to hand to the handler as retval
  312. mov edx,14
  313. jmp CALL_LIBM_ERROR
  314. UNDERFLOW: ; report with error tag 15
  315. mov edx, 15
  316. CALL_LIBM_ERROR: ; in: edx = error tag, low half of xmm0 = provisional result; falls through to RETURN
  317. ;call libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag)
  318. sub esp, 28 ; 4 argument dwords at [esp..esp+15] + 8-byte retval slot at [esp+16]
  319. movlpd QWORD PTR [esp+16], xmm0 ; retval slot <- provisional result
  320. mov DWORD PTR [esp+12],edx ; 4th argument: input_tag
  321. mov edx, esp
  322. add edx,16
  323. mov DWORD PTR [esp+8],edx ; 3rd argument: address of the retval slot
  324. add edx,16 ; edx = esp+32, i.e. 4 bytes above the pre-adjustment esp, where both entry points leave x
  325. mov DWORD PTR [esp+4],edx ; 2nd argument: address of x
  326. mov DWORD PTR [esp],edx ; 1st argument: address of x again (exp has a single argument)
  327. call NEAR PTR __libm_error_support
  328. movlpd xmm0, QWORD PTR [esp+16] ; reload retval from the slot whose address was passed to the handler
  329. add esp, 28
  330. RETURN: ; in: low half of xmm0 = result to return
  331. sub esp, 16
  332. movlpd QWORD PTR [esp+4], xmm0 ; return result
  333. fld QWORD PTR [esp+4] ; push the result onto the x87 stack
  334. add esp, 16
  335. ret
  336. SPECIAL_CASES: ; NOTE(review): no visible instruction jumps to this label; it only heads the design notes below
  337. ; code to be added, but OK for now
  338. ; Need to resolve several cases
  339. ;
  340. ; Case 1: Argument is close to zero ( |X| < 2^(-60) ) -- NOTE(review): the entry filter in this file actually tests against 2^(-54)
  341. ; Compute 1 + X and return the result
  342. ; This will allow the appropriate action to take place.
  343. ; For example, in directed rounding, the correct number below/above 1 is returned.
  344. ; If X is denormalized, and that DAE is set, then we will be consistent with DAE,
  345. ; that is X is treated as zero and directed rounding will not affect the result.
  346. ; This action also takes care of the case X = 0.
  347. ;
  348. ; Case 2: |X| is large but finite
  349. ; Generate overflow/underflow by a simple arithmetic operation. This is also a
  350. ; placeholder for various exception handling protocols.
  351. ;
  352. ; Case 3: X is +-inf. Return +inf or +0 exactly without exception
  353. ;
  354. ; Case 4: X is s/q NaN
  355. ;
  356. OF_UF: ; in: eax = high dword of x with the sign bit cleared (set up in RETURN_ONE)
  357. ; x=infinity/NaN ?
  358. cmp eax, 7ff00000H
  359. jae INF_NAN
  360. mov eax,[esp+8] ; reload the high dword of x, sign included
  361. cmp eax,80000000H
  362. jae UF ; sign bit set: large negative x, so the result underflows
  363. movlpd xmm0, QWORD PTR [XMAX]
  364. mulsd xmm0, xmm0 ; XMAX*XMAX: produce an overflowed result by arithmetic (Case 2)
  365. mov edx,14 ; overflow error tag
  366. jmp CALL_LIBM_ERROR
  367. UF: movlpd xmm0, QWORD PTR [XMIN]
  368. mulsd xmm0, xmm0 ; XMIN*XMIN: produce an underflowed result by arithmetic (Case 2)
  369. mov edx,15 ; underflow error tag
  370. jmp CALL_LIBM_ERROR
  371. INF_NAN: ; in: eax = high dword of |x|, already known to be >= 7ff00000H
  372. ; load lower 32 bits of x
  373. mov edx, DWORD PTR [esp+4]
  374. cmp eax, 7ff00000H
  375. ja NaN_arg ; nonzero mantissa bits in the high dword: NaN
  376. cmp edx,0
  377. jnz NaN_arg ; nonzero mantissa bits in the low dword: NaN
  378. mov eax,DWORD PTR [esp+8] ; high dword with sign, to tell +inf from -inf
  379. cmp eax,7ff00000H
  380. jne INF_NEG
  381. ; +INF: return +inf directly, without calling the error handler
  382. fld QWORD PTR [INF]
  383. ret
  384. INF_NEG:
  385. ; -INF: return +0 directly, without calling the error handler
  386. fld QWORD PTR [ZERO]
  387. ret
  388. NaN_arg: ; NaN input: reported with tag 1002; xmm0 still holds x at this point and is passed as the provisional result
  389. ; movlpd xmm0, 4[esp]
  390. ; addsd xmm0,xmm0
  391. ; sub esp, 16
  392. ; movlpd 4[esp],xmm0
  393. ; fld QWORD PTR [esp+4] ; return x
  394. ; add esp, 16
  395. ; ret
  396. mov edx,1002
  397. jmp CALL_LIBM_ERROR
  398. RETURN_ONE: ; reached from the entry filter when |x| < 2^(-54), |x| >= 2^10, or x is inf/NaN
  399. ; load hi-part of x
  400. mov eax,[esp+8]
  401. and eax,7FFFFFFFH ; clear the sign bit
  402. ; large absolute value (>=2^{10}) ? (infinities and NaNs also compare above this bound)
  403. cmp eax, 40900000H
  404. jae OF_UF
  405. ; small inputs, return 1+x
  406. movlpd xmm0, 4[esp]
  407. ; set D flag (the add itself raises whatever flags apply to 1+x)
  408. addsd xmm0, QWORD PTR [ONE_val]
  409. sub esp, 16
  410. movlpd 4[esp],xmm0
  411. fld QWORD PTR [esp+4] ; return 1+x on the x87 stack
  412. add esp, 16
  413. ret
  414. _CIexp_pentium4 ENDP
  415. ALIGN 16
  416. _TEXT ENDS
  417. END