Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

503 lines
13 KiB

;//
;// INTEL CORPORATION PROPRIETARY INFORMATION
;// This software is supplied under the terms of a license agreement or
;// nondisclosure agreement with Intel Corporation and may not be copied
;// or disclosed except in accordance with the terms of that agreement.
;// Copyright (c) 2000 Intel Corporation. All Rights Reserved.
;//
;//
; exp_wmt.asm
;
; double exp(double);
;
; Initial version: 11/30/2000
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This is a new version using just one table. Reduction by log2/64 ;;
;; A non-standard table is used. Normally, we store T,t where ;;
;; T+t = exp(jlog2/64) to high precision. This implementation ;;
;; stores T,d where d = t/T. This shortens the latency by 1 FP op ;;
;; This version uses two tricks from Andrey. First, we merge two ;;
;; integer-based tests for exception filtering into 1. Second, instead ;;
;; of using sign(X)2^52 as a shifter, we use S = 2^52 * 1.10000..000 ;;
;; as the shifter. This will give bit pattern of the 2's complement of ;;
;; N in trailing bits of S + W, W = X * 64/log2. ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.686P
.387
.XMM
.MODEL FLAT,C
EXTRN C __libm_error_support : NEAR
CONST SEGMENT PARA PUBLIC USE32 'CONST'
ALIGN 16
smask DQ 8000000000000000H, 8000000000000000H ; mask to get sign bit
emask DQ 0FFF0000000000000H, 0FFF0000000000000H
mmask DQ 00000000FFFFFFC0H, 00000000FFFFFFC0H ; mask off bottom 6 bits
bias DQ 000000000000FFC0H, 000000000000FFC0H ; 1023 shifter left 6 bits
Shifter DQ 4338000000000000H, 4338000000000000H ; 2^52+2^51|2^52+2^51
twom60 DQ 3C30000000000000H, 3C30000000000000H ; 2^(-60)
cv DQ 40571547652b82feH, 40571547652b82feH ; invL|invL
DQ 3F862E42FEFA0000H, 3F862E42FEFA0000H ; log2_hi|log2_hi
DQ 3D1CF79ABC9E3B3AH, 3D1CF79ABC9E3B3AH ; log2_lo|log2_lo
DQ 3F811074B1D108E5H, 3FC555555566A45AH ; p2|p4
DQ 3FA5555726ECED80H, 3FDFFFFFFFFFE17BH ; p1|p3
;-------Table d, T so that movapd gives [ T | d ]
;-------Note that the exponent field of T is set to 000
Tbl_addr DQ 0000000000000000H, 0000000000000000H
DQ 3CAD7BBF0E03754DH, 00002C9A3E778060H
DQ 3C8CD2523567F613H, 000059B0D3158574H
DQ 3C60F74E61E6C861H, 0000874518759BC8H
DQ 3C979AA65D837B6CH, 0000B5586CF9890FH
DQ 3C3EBE3D702F9CD1H, 0000E3EC32D3D1A2H
DQ 3CA3516E1E63BCD8H, 00011301D0125B50H
DQ 3CA4C55426F0387BH, 0001429AAEA92DDFH
DQ 3CA9515362523FB6H, 000172B83C7D517AH
DQ 3C8B898C3F1353BFH, 0001A35BEB6FCB75H
DQ 3C9AECF73E3A2F5FH, 0001D4873168B9AAH
DQ 3C8A6F4144A6C38DH, 0002063B88628CD6H
DQ 3C968EFDE3A8A894H, 0002387A6E756238H
DQ 3C80472B981FE7F2H, 00026B4565E27CDDH
DQ 3C82F7E16D09AB31H, 00029E9DF51FDEE1H
DQ 3C8B3782720C0AB3H, 0002D285A6E4030BH
DQ 3C834D754DB0ABB6H, 000306FE0A31B715H
DQ 3C8FDD395DD3F84AH, 00033C08B26416FFH
DQ 3CA12F8CCC187D29H, 000371A7373AA9CAH
DQ 3CA7D229738B5E8BH, 0003A7DB34E59FF6H
DQ 3C859F48A72A4C6DH, 0003DEA64C123422H
DQ 3CA8B846259D9205H, 0004160A21F72E29H
DQ 3C4363ED60C2AC12H, 00044E086061892DH
DQ 3C6ECCE1DAA10379H, 000486A2B5C13CD0H
DQ 3C7690CEBB7AAFB0H, 0004BFDAD5362A27H
DQ 3CA083CC9B282A09H, 0004F9B2769D2CA6H
DQ 3CA509B0C1AAE707H, 0005342B569D4F81H
DQ 3C93350518FDD78EH, 00056F4736B527DAH
DQ 3C9063E1E21C5409H, 0005AB07DD485429H
DQ 3C9432E62B64C035H, 0005E76F15AD2148H
DQ 3CA0128499F08C0AH, 0006247EB03A5584H
DQ 3C99F0870073DC06H, 0006623882552224H
DQ 3C998D4D0DA05571H, 0006A09E667F3BCCH
DQ 3CA52BB986CE4786H, 0006DFB23C651A2EH
DQ 3CA32092206F0DABH, 00071F75E8EC5F73H
DQ 3CA061228E17A7A6H, 00075FEB564267C8H
DQ 3CA244AC461E9F86H, 0007A11473EB0186H
DQ 3C65EBE1ABD66C55H, 0007E2F336CF4E62H
DQ 3C96FE9FBBFF67D0H, 00082589994CCE12H
DQ 3C951F1414C801DFH, 000868D99B4492ECH
DQ 3C8DB72FC1F0EAB4H, 0008ACE5422AA0DBH
DQ 3C7BF68359F35F44H, 0008F1AE99157736H
DQ 3CA360BA9C06283CH, 00093737B0CDC5E4H
DQ 3C95E8D120F962AAH, 00097D829FDE4E4FH
DQ 3C71AFFC2B91CE27H, 0009C49182A3F090H
DQ 3C9B6D34589A2EBDH, 000A0C667B5DE564H
DQ 3C95277C9AB89880H, 000A5503B23E255CH
DQ 3C8469846E735AB3H, 000A9E6B5579FDBFH
DQ 3C8C1A7792CB3387H, 000AE89F995AD3ADH
DQ 3CA22466DC2D1D96H, 000B33A2B84F15FAH
DQ 3CA1112EB19505AEH, 000B7F76F2FB5E46H
DQ 3C74FFD70A5FDDCDH, 000BCC1E904BC1D2H
DQ 3C736EAE30AF0CB3H, 000C199BDD85529CH
DQ 3C84E08FD10959ACH, 000C67F12E57D14BH
DQ 3C676B2C6C921968H, 000CB720DCEF9069H
DQ 3C93700936DF99B3H, 000D072D4A07897BH
DQ 3C74A385A63D07A7H, 000D5818DCFBA487H
DQ 3C8E5A50D5C192ACH, 000DA9E603DB3285H
DQ 3C98BB731C4A9792H, 000DFC97337B9B5EH
DQ 3C74B604603A88D3H, 000E502EE78B3FF6H
DQ 3C916F2792094926H, 000EA4AFA2A490D9H
DQ 3C8EC3BC41AA2008H, 000EFA1BEE615A27H
DQ 3C8A64A931D185EEH, 000F50765B6E4540H
DQ 3C77893B4D91CD9DH, 000FA7C1819E90D8H
ONE_val DQ 3ff0000000000000H ; 1.0
EMIN DQ 0010000000000000H
MAX_ARG DQ 40862e42fefa39efH
MIN_ARG DQ 0c086232bdd70000H
INF DQ 7ff0000000000000H
ZERO DQ 0
XMAX DQ 7fefffffffffffffH
XMIN DQ 0010000000000000H
Sm_Thres DQ 3C3000003C300000H ; DP 2^(-60)
Del_Thres DQ 045764CA045764CAH ; DP 1080*log(2) - 2^(-60), hi part
ALIGN 16
CONST ENDS
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
ALIGN 16
PUBLIC _exp_pentium4, _CIexp_pentium4
_CIexp_pentium4 PROC NEAR
push ebp
mov ebp, esp
sub esp, 8 ; for argument DBLSIZE
and esp, 0fffffff0h
fstp qword ptr [esp]
movq xmm0, qword ptr [esp]
call start
leave
ret
_exp_pentium4 label proc
; load *|x in XMM0
movlpd xmm0, 4[esp]
start:
unpcklpd xmm0,xmm0
; load Inv_L pair
movapd xmm1, QWORD PTR [cv]
; load Shifter
movapd xmm6, QWORD PTR [Shifter]
; load L_hi pair
movapd xmm2, QWORD PTR [cv+16]
; load L_lo pair
movapd xmm3, QWORD PTR [cv+32]
pextrw eax, xmm0,3
and eax,7FFFH
; x>=2^{10} ? (i.e. 2^{10}-eps-x<0)
mov edx, 408fH
sub edx, eax
; avoid underflow on intermediate calculations (|x|<2^{-54} ?)
sub eax, 3c90H
or edx, eax
cmp edx, 80000000H
; small input or UF/OF
jae RETURN_ONE
; xmm1=Inv_L*x|Inv_L*x
mulpd xmm1,xmm0
; xmm1=Inv_L*x+Shifter| Inv_L*x+Shifter
addpd xmm1,xmm6
; xmm7 contains bit pattern of N
movapd xmm7,xmm1
; xmm1=N
subpd xmm1,xmm6
; xmm2=L_hi*round_to_int(Inv_L*x)|L_hi*round_to_int(Inv_L*x) ; N_L_hi
mulpd xmm2,xmm1
; [p2|p4]
MOVAPD xmm4,[cv+48]
; xmm3=L_lo*round_to_int(Inv_L*x)|L_lo*round_to_int(Inv_L*x) ; N_L_lo
mulpd xmm3,xmm1
; [p1|p3]
MOVAPD xmm5,[cv+64]
; xmm0=x-xmm2 ; R := X |-| N_L_hi
subpd xmm0,xmm2
; set eax <-- n, ecx <--j
movd eax,xmm7
mov ecx,eax
and ecx,0000003FH
; get offset for [T,d]
shl ecx,4
; eax,edx <-- m
sar eax,6
mov edx,eax
; xmm0-=xmm3 ; R := R |-| N_L_lo
subpd xmm0,xmm3
; xmm2 <- [T,d]
movapd xmm2,[ecx+Tbl_addr]
; xmm4=p2*R|p4*R
mulpd xmm4,xmm0
MOVAPD xmm1,xmm0
MULPD xmm0,xmm0
; xmm5=p1+p2*R|p3+p4*R
addpd xmm5,xmm4
MULSD xmm0,xmm0
; get xmm1 <-- [R|R+d]
addsd xmm1,xmm2
; xmm2 <-- [T|T]
unpckhpd xmm2,xmm2
; xmm7 <-- exponent of 2^m
movdqa xmm6,[mmask]
pand xmm7,xmm6
movdqa xmm6,[bias]
paddq xmm7,xmm6
psllq xmm7,46
; xmm5=[P_hi | P_lo]
mulpd xmm0,xmm5
; xmm1 <- [R |d+R+P_lo]
addsd xmm1,xmm0
; xmm2 is 2^m T
ORPD xmm2,xmm7
; xmm5 <- [P_hi | P_hi]
unpckhpd xmm0,xmm0
; xmm5 <-- [P_hi | d+R+P ]
addsd xmm0,xmm1
; make sure -894 <= m <= 1022
; before we use the exponent in xmm7
; test by unsigned comp of m+894 with 1022+894
add edx,894
cmp edx,1916
ja ADJUST
mulsd xmm0,xmm2
sub esp, 16
addsd xmm0,xmm2
movlpd QWORD PTR [esp+4], xmm0 ; return result
fld QWORD PTR [esp+4] ;
add esp, 16
ret
ADJUST:
;---xmm5 contains [*| d+R+P]
;---xmm2 contains [*| T ] where is exponent field is not correct
;---eax still contain the correct m
;---so we split m into m1 and m2, m1+m2 = m. Make T with exponent 2^m1 by
;---integer manipulation, and multiply final result by 2^m2
; overflow or underflow
sub esp,18
fstcw WORD PTR [esp+16]
mov dx,WORD PTR [esp+16]
; set pc=64 bits
or dx,300H
mov WORD PTR [esp],dx
fldcw WORD PTR [esp]
; eax <-- m1 = m/2, edx <-- m2 = m - m1
mov edx,eax
sar eax,1
sub edx,eax
; T with exponent field zerorized
movdqa xmm6,[emask]
pandn xmm6,xmm2
add eax,1023
movd xmm3,eax
psllq xmm3,52
; xmm6=T*2^m1
ORPD xmm6,xmm3
add edx,1023
movd xmm4,edx
psllq xmm4,52
; load P on FP stack
movlpd QWORD PTR [esp], xmm0
fld QWORD PTR [esp]
; load T'=T*2^m1 on FP stack
movlpd QWORD PTR [esp+8], xmm6
fld QWORD PTR [esp+8]
; T'*P
fmul st(1), st(0)
; T'+T'*P
faddp st(1), st(0)
; load 2^m2 on FP stack
movlpd QWORD PTR [esp], xmm4
fld QWORD PTR [esp]
; final calculation: 2^m2*(T'+T'*P)
fmulp st(1), st(0)
; store result in memory, then xmm0
fstp QWORD PTR [esp]
movlpd xmm0, QWORD PTR [esp]
; restore FPCW
fldcw WORD PTR [esp+16]
add esp,18
; mov ecx, DWORD PTR [esp+8]
; ; if 0<x<2^{10}*ln2, return
; cmp ecx, 40862e42H
; jb RETURN
; ja CONT0
pextrw ecx, xmm0, 3
and ecx, 7ff0H
cmp ecx, 7ff0H
jae OVERFLOW
cmp ecx, 0
jz UNDERFLOW
jmp RETURN
; load lower 32 bits of x
; mov edx, DWORD PTR [esp+4]
; cmp edx, 0fefa39efH
; jb RETURN
; jmp OVERFLOW
CONT0:
; OF/UF
; OF ?
cmp ecx,80000000H
jb OVERFLOW
; x<(2-2^{10})*ln2 ?
cmp ecx, 0c086232bH
jb RETURN
ja UNDERFLOW
mov edx, DWORD PTR [esp+4]
cmp edx, 0fefa39efH
jb RETURN
jmp UNDERFLOW
OVERFLOW:
;OF
mov edx,14
jmp CALL_LIBM_ERROR
UNDERFLOW:
mov edx, 15
CALL_LIBM_ERROR:
;call libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag)
sub esp, 28
movlpd QWORD PTR [esp+16], xmm0
mov DWORD PTR [esp+12],edx
mov edx, esp
add edx,16
mov DWORD PTR [esp+8],edx
add edx,16
mov DWORD PTR [esp+4],edx
mov DWORD PTR [esp],edx
call NEAR PTR __libm_error_support
movlpd xmm0, QWORD PTR [esp+16]
add esp, 28
RETURN:
sub esp, 16
movlpd QWORD PTR [esp+4], xmm0 ; return result
fld QWORD PTR [esp+4] ;
add esp, 16
ret
SPECIAL_CASES:
; code to be added, but OK for now
; Need to resolve several cases
;
; Case 1: Argument is close to zero ( |X| < 2^(-60) )
; Compute 1 + X and return the result
; This will allow the appropriate action to take place.
; For example, in directed rounding, the correct number below/above 1 is returned.
; If X is denormalized, and that DAE is set, then we will be consistant with DAE,
; that is X is treated as zero and directed rounding will not affect the result.
; This action also takes care of the case X = 0.
;
; Case 2: |X| is large but finite
; Generate overflow/underflow by a simple arithmetic operation. This is also a place
; holder for various exception handling protocol.
;
; Case 3: X is +-inf. Return +inf or +0 exactly without exception
;
; Case 4: X is s/q NaN
;
OF_UF:
; x=infinity/NaN ?
cmp eax, 7ff00000H
jae INF_NAN
mov eax,[esp+8]
cmp eax,80000000H
jae UF
movlpd xmm0, QWORD PTR [XMAX]
mulsd xmm0, xmm0
mov edx,14
jmp CALL_LIBM_ERROR
UF: movlpd xmm0, QWORD PTR [XMIN]
mulsd xmm0, xmm0
mov edx,15
jmp CALL_LIBM_ERROR
INF_NAN:
; load lower 32 bits of x
mov edx, DWORD PTR [esp+4]
cmp eax, 7ff00000H
ja NaN_arg
cmp edx,0
jnz NaN_arg
mov eax,DWORD PTR [esp+8]
cmp eax,7ff00000H
jne INF_NEG
; +INF
fld QWORD PTR [INF]
ret
INF_NEG:
; -INF
fld QWORD PTR [ZERO]
ret
NaN_arg:
; movlpd xmm0, 4[esp]
; addsd xmm0,xmm0
; sub esp, 16
; movlpd 4[esp],xmm0
; fld QWORD PTR [esp+4] ; return x
; add esp, 16
; ret
mov edx,1002
jmp CALL_LIBM_ERROR
RETURN_ONE:
; load hi-part of x
mov eax,[esp+8]
and eax,7FFFFFFFH
; large absolute value (>=2^{10}) ?
cmp eax, 40900000H
jae OF_UF
; small inputs, return 1
movlpd xmm0, 4[esp]
; set D flag
addsd xmm0, QWORD PTR [ONE_val]
sub esp, 16
movlpd 4[esp],xmm0
fld QWORD PTR [esp+4] ; return x
add esp, 16
ret
_CIexp_pentium4 ENDP
ALIGN 16
_TEXT ENDS
END