Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

460 lines
17 KiB

;//
;// INTEL CORPORATION PROPRIETARY INFORMATION
;// This software is supplied under the terms of a license agreement or
;// nondisclosure agreement with Intel Corporation and may not be copied
;// or disclosed except in accordance with the terms of that agreement.
;// Copyright (c) 2000 Intel Corporation. All Rights Reserved.
;//
;//
; log_wmt.asm
;
; double log(double);
;
; Initial version: 12/15/2000
; Updated with bug fixes: 2/20/2001
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; Another important feature is that we use the table of log(1/B) ;;
;; throughout. To ensure numerical accuracy, we only need to ensure that ;;
;; T(0)_hi = B(last)_hi, T(0)_lo = B(last)_lo. This ensures W_hi = 0 and ;;
;; W_lo = 0 exactly in the case of |X-1| <= 2^(-7). ;;
;; Finally, we do away with the need for extra-precision addition by the ;;
;; following observation. The three pieces at the end are ;;
;; A = W_hi + r_hi; B = r_lo; C = P + W_lo. ;;
;; When W_hi = W_lo = 0, the addition sequence (A+B) + C is accurate as ;;
;; the sum A+B is exact. ;;
;; Otherwise, A + (B+C) is accurate as B is going to be largely shifted ;;
;; off compared to the final result. ;;
;; Hence if we use compare and mask operations to ;;
;; create alpha = (r_lo or 0), beta = (0 or r_lo), Res_hi <- W_hi+alpha, ;;
;; Res_lo <- C + beta, then result is accurately computed as ;;
;; Res_hi+Res_lo. ;;
;; ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.686P
.387
.XMM
.MODEL FLAT,C
EXTRN C __libm_error_support : NEAR
CONST SEGMENT PARA PUBLIC USE32 'CONST'
ALIGN 16
; Read-only constants for log(). Each named item is a 16-byte pair of
; doubles; most are fetched with a single movapd, so the segment is
; paragraph aligned. Bracketed notes read [high qword | low qword].
emask DQ 000FFFFFFFFFFFFFH, 000FFFFFFFFFFFFFH ; keeps the 52 fraction bits, clears sign/expo field
; Bit pattern of Magic decodes to 2^(42) - 1 + 15*2^(-11), i.e. one 2^(-11)
; step below 2^(42)-1+2^(-7). It is added to Y in [1,2) and the low word of
; the sum, masked with 7F0H, is then used as the byte offset into the tables.
Magic DQ 428FFFFFFFFFF80FH, 428FFFFFFFFFF80FH ; ~ 2^(42)-1+2^(-7)
hi_mask DQ 7FFFFFFFFFE00000H, 7FFFFFFFFFE00000H ; clears the sign bit and the bottom 21 bits
LOG_2 DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H ; L_hi,L_lo -> [L_lo|L_hi]
; place_L is a two-entry mask table indexed by edx (0 or 16) in the code:
; entry 0 keeps only the high lane, entry 1 keeps only the low lane.
place_L DQ 0000000000000000H,0FFFFFFFFFFFFFFFFH ; 0,1 -> [FF..FF|00..00]
DQ 0FFFFFFFFFFFFFFFFH, 0000000000000000H ; 1,0 -> [00..00|FF..FF]
One DQ 3ff0000000000000H, 3ff0000000000000H ; 1,1
Zero DQ 0000000000000000H, 0000000000000000H ; 0,0
Two52 DQ 4330000000000000H, 4330000000000000H ; 2^52, multiplies denormal inputs into the normal range
Infs DQ 0FFF0000000000000H, 7FF0000000000000H ; -inf,+inf --> [+inf|-inf]; [Infs] = -inf, [Infs+8] = +inf
NaN DQ 7FF0000000000001H, 7FF0000000000001H ; NaN for log(-ve), log(Nan); not referenced by the code in this file
; Polynomial coefficients, paired so one packed operation advances two
; partial sums at once (p4..p6 in the low lane, p1..p3 in the high lane).
coeff DQ 3FC24998090DC555H, 0BFCFFFFFFF201E13H ; p6,p3 ->[p3|p6]
DQ 0BFC555C54DD57D75H, 3FD55555555555A7H ; p5,p2 ->[p2|p5]
DQ 3FC9999998867A53H, 0BFE000000000001CH ; p4,p1 ->[p1|p4]
;-------Table B-----------
; 65 entries of 16 bytes, index 0..64. Each entry holds the same double in
; both lanes so that movapd yields [B|B]. Values run from 1.0 (index 0)
; down to 0.5 (index 64). The code addresses the table as [eax+B_Tbl] with
; eax = index*16 and multiplies the reduced argument Y by B.
; NOTE(review): eax is masked with 7F0H, which would allow indices up to
; 127; the Magic-constant addition appears to confine it to 0..64 for Y in
; [1,2) -- confirm before changing either constant.
B_Tbl DQ 3FF0000000000000H, 3FF0000000000000H
DQ 3FEF820000000000H, 3FEF820000000000H
DQ 3FEF080000000000H, 3FEF080000000000H
DQ 3FEE920000000000H, 3FEE920000000000H
DQ 3FEE1E0000000000H, 3FEE1E0000000000H
DQ 3FEDAE0000000000H, 3FEDAE0000000000H
DQ 3FED420000000000H, 3FED420000000000H
DQ 3FECD80000000000H, 3FECD80000000000H
DQ 3FEC720000000000H, 3FEC720000000000H
DQ 3FEC0E0000000000H, 3FEC0E0000000000H
DQ 3FEBAC0000000000H, 3FEBAC0000000000H
DQ 3FEB4E0000000000H, 3FEB4E0000000000H
DQ 3FEAF20000000000H, 3FEAF20000000000H
DQ 3FEA980000000000H, 3FEA980000000000H
DQ 3FEA420000000000H, 3FEA420000000000H
DQ 3FE9EC0000000000H, 3FE9EC0000000000H
DQ 3FE99A0000000000H, 3FE99A0000000000H
DQ 3FE9480000000000H, 3FE9480000000000H
DQ 3FE8FA0000000000H, 3FE8FA0000000000H
DQ 3FE8AC0000000000H, 3FE8AC0000000000H
DQ 3FE8620000000000H, 3FE8620000000000H
DQ 3FE8180000000000H, 3FE8180000000000H
DQ 3FE7D00000000000H, 3FE7D00000000000H
DQ 3FE78A0000000000H, 3FE78A0000000000H
DQ 3FE7460000000000H, 3FE7460000000000H
DQ 3FE7020000000000H, 3FE7020000000000H
DQ 3FE6C20000000000H, 3FE6C20000000000H
DQ 3FE6820000000000H, 3FE6820000000000H
DQ 3FE6420000000000H, 3FE6420000000000H
DQ 3FE6060000000000H, 3FE6060000000000H
DQ 3FE5CA0000000000H, 3FE5CA0000000000H
DQ 3FE58E0000000000H, 3FE58E0000000000H
DQ 3FE5560000000000H, 3FE5560000000000H
DQ 3FE51E0000000000H, 3FE51E0000000000H
DQ 3FE4E60000000000H, 3FE4E60000000000H
DQ 3FE4B00000000000H, 3FE4B00000000000H
DQ 3FE47A0000000000H, 3FE47A0000000000H
DQ 3FE4460000000000H, 3FE4460000000000H
DQ 3FE4140000000000H, 3FE4140000000000H
DQ 3FE3E20000000000H, 3FE3E20000000000H
DQ 3FE3B20000000000H, 3FE3B20000000000H
DQ 3FE3820000000000H, 3FE3820000000000H
DQ 3FE3520000000000H, 3FE3520000000000H
DQ 3FE3240000000000H, 3FE3240000000000H
DQ 3FE2F60000000000H, 3FE2F60000000000H
DQ 3FE2CA0000000000H, 3FE2CA0000000000H
DQ 3FE29E0000000000H, 3FE29E0000000000H
DQ 3FE2740000000000H, 3FE2740000000000H
DQ 3FE24A0000000000H, 3FE24A0000000000H
DQ 3FE2200000000000H, 3FE2200000000000H
DQ 3FE1F80000000000H, 3FE1F80000000000H
DQ 3FE1D00000000000H, 3FE1D00000000000H
DQ 3FE1A80000000000H, 3FE1A80000000000H
DQ 3FE1820000000000H, 3FE1820000000000H
DQ 3FE15C0000000000H, 3FE15C0000000000H
DQ 3FE1360000000000H, 3FE1360000000000H
DQ 3FE1120000000000H, 3FE1120000000000H
DQ 3FE0EC0000000000H, 3FE0EC0000000000H
DQ 3FE0CA0000000000H, 3FE0CA0000000000H
DQ 3FE0A60000000000H, 3FE0A60000000000H
DQ 3FE0840000000000H, 3FE0840000000000H
DQ 3FE0620000000000H, 3FE0620000000000H
DQ 3FE0420000000000H, 3FE0420000000000H
DQ 3FE0200000000000H, 3FE0200000000000H
DQ 3FE0000000000000H, 3FE0000000000000H
;-------Table T_hi,T_lo so that movapd gives [ T_lo | T_hi ]
; 65 entries of 16 bytes, addressed as [eax+T_Tbl] with the same offset
; used for B_Tbl. Each entry is a (T_hi, T_lo) pair: a leading part
; whose low-order bits are zero, followed by a small correction term.
; Entry 0 is exactly (0, 0) and the last entry is bit-for-bit the LOG_2
; pair (L_hi, L_lo), matching B = 1.0 and B = 0.5 at the same indices.
T_Tbl DQ 0000000000000000H, 0000000000000000H
DQ 3F8FBEA8B13C0000H, 3CDEC927B17E4E13H
DQ 3F9F7A9B16780000H, 3D242AD9271BE7D7H
DQ 3FA766D923C20000H, 3D1FF0A82F1C24C1H
DQ 3FAF0C30C1114000H, 3D31A88653BA4140H
DQ 3FB345179B63C000H, 3D3D4203D36150D0H
DQ 3FB6EF528C056000H, 3D24573A51306A44H
DQ 3FBA956D3ECAC000H, 3D3E63794C02C4AFH
DQ 3FBE2507702AE000H, 3D303B433FD6EEDCH
DQ 3FC0D79E7CD48000H, 3D3CB422847849E4H
DQ 3FC299D30C606000H, 3D3D4D0079DC08D9H
DQ 3FC44F8B726F8000H, 3D3DF6A4432B9BB4H
DQ 3FC601B076E7A000H, 3D3152D7D4DFC8E5H
DQ 3FC7B00916515000H, 3D146280D3E606A3H
DQ 3FC9509AA0044000H, 3D3F1E675B4D35C6H
DQ 3FCAF6895610D000H, 3D375BEBBA042B64H
DQ 3FCC8DF7CB9A8000H, 3D3EEE42F58E1E6EH
DQ 3FCE2A877A6B2000H, 3D3823817787081AH
DQ 3FCFB7D86EEE3000H, 3D371FCF1923FB43H
DQ 3FD0A504E97BB000H, 3D303094E6690C44H
DQ 3FD1661CAECB9800H, 3D2D1C000C076A8BH
DQ 3FD22981FBEF7800H, 3D17AF7A7DA9FC99H
DQ 3FD2E9E2BCE12000H, 3D24300C128D1DC2H
DQ 3FD3A71C56BB4800H, 3D08C46FB5A88483H
DQ 3FD4610BC29C5800H, 3D385F4D833BCDC7H
DQ 3FD51D1D93104000H, 3D35B0FAA20D9C8EH
DQ 3FD5D01DC49FF000H, 3D2740AB8CFA5ED3H
DQ 3FD68518244CF800H, 3D28722FF88BF119H
DQ 3FD73C1800DC0800H, 3D3320DBF75476C0H
DQ 3FD7E9883FA49800H, 3D3FAFF96743F289H
DQ 3FD898D38A893000H, 3D31F666071E2F57H
DQ 3FD94A0428036000H, 3D30E7BCB08C6B44H
DQ 3FD9F123F4BF6800H, 3D36892015F2401FH
DQ 3FDA99FCABDB8000H, 3D11E89C5F87A311H
DQ 3FDB44977C148800H, 3D3C6A343FB526DBH
DQ 3FDBEACD9E271800H, 3D268A6EDB879B51H
DQ 3FDC92B7D6BB0800H, 3D10FE9FFF876CC2H
DQ 3FDD360E90C38000H, 3D342CDB58440FD6H
DQ 3FDDD4AA04E1C000H, 3D32D8512DF01AFDH
DQ 3FDE74D262788800H, 3CFEB945ED9457BCH
DQ 3FDF100F6C2EB000H, 3D2CCE779D37F3D8H
DQ 3FDFACC89C9A9800H, 3D163E0D100EC76CH
DQ 3FE02582A5C9D000H, 3D222C6C4E98E18CH
DQ 3FE0720E5C40DC00H, 3D38E27400B03FBEH
DQ 3FE0BF52E7353800H, 3D19B5899CD387D3H
DQ 3FE109EB9E2E4C00H, 3D12DA67293E0BE7H
DQ 3FE15533D3B8D400H, 3D3D981CA8B0D3C3H
DQ 3FE19DB6BA0BA400H, 3D2B675885A4A268H
DQ 3FE1E6DF676FF800H, 3D1A58BA81B983AAH
DQ 3FE230B0D8BEBC00H, 3D12FC066E48667BH
DQ 3FE2779E1EC93C00H, 3D36523373359B79H
DQ 3FE2BF29F9841C00H, 3CFD8A3861D3B7ECH
DQ 3FE30757344F0C00H, 3D309BE85662F034H
DQ 3FE34C80A8958000H, 3D1D4093FCAC34BDH
DQ 3FE39240DDE5CC00H, 3D3493DBEAB758B3H
DQ 3FE3D89A6B1A5400H, 3D28C7CD5FA81E3EH
DQ 3FE41BCFF4860000H, 3D076FD6B90E2A84H
DQ 3FE4635BCF40DC00H, 3D2CE8D5D412CAADH
DQ 3FE4A3E862342400H, 3D224FA993F78464H
DQ 3FE4E8D015786C00H, 3D38B1C0D0303623H
DQ 3FE52A6D269BC400H, 3D30022268F689C9H
DQ 3FE56C91D71CF800H, 3CE07BAFD1366E9EH
DQ 3FE5AB505B390400H, 3CD5627AF66563FAH
DQ 3FE5EE82AA241800H, 3D2202380CDA46BEH
DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H
ALIGN 16
CONST ENDS
;---------------------------------------------------------------------------
; $cmpsd op1, op2, op3
;
; Emits a compare instruction by byte patching: the packed form
; `cmppd op1, op2, op3` is assembled first, then ORG rewinds to the start
; of that instruction and its first byte is overwritten with 0F2h before
; the location counter is restored to the end of the instruction.
; Per the Intel encoding tables this turns 66 0F C2 (cmppd) into
; F2 0F C2 (scalar-double compare), so only the low lane of op1 is
; compared; op3 is the predicate immediate (callers here pass 0, equality).
; NOTE(review): presumably a workaround for an assembler that does not
; accept the SSE2 `cmpsd` mnemonic directly -- confirm before replacing.
; The statement order below is essential to the patch; do not reorder.
;---------------------------------------------------------------------------
$cmpsd MACRO op1, op2, op3
LOCAL begin_cmpsd, end_cmpsd
begin_cmpsd:
cmppd op1, op2, op3
end_cmpsd:
org begin_cmpsd
db 0F2h
org end_cmpsd
ENDM
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
        ALIGN   16
        PUBLIC  _log_pentium4, _CIlog_pentium4

;---------------------------------------------------------------------------
; double log(double X)                                  (32-bit, SSE2)
;
; Entry points
;   _CIlog_pentium4 : X arrives in ST(0); it is popped and spilled to an
;                     aligned stack slot, then the common body is CALLed.
;   _log_pentium4   : X arrives on the stack at [esp+4].
;
; In both cases the common body (`start`) sees X in xmm0 and a copy of X
; at [esp+4]; the special-case and error paths rely on that copy.
;
; Out   : result in ST(0)
; Clobb : eax, ecx, edx, xmm0-xmm7, flags (ebp is preserved)
;
; Error reporting goes through __libm_error_support(arg1, arg2, retval, tag)
; with tag = 2 for X = +-0, 3 for X < 0 (including -inf), 1000 for NaN.
;
; Lane notation in comments: [high qword | low qword].
;---------------------------------------------------------------------------
_CIlog_pentium4 PROC NEAR
        push    ebp
        mov     ebp, esp
        sub     esp, 8                          ; one double of scratch
        and     esp, 0fffffff0h                 ; round the slot down to 16 bytes
        fstp    qword ptr [esp]                 ; pop X off the x87 stack
        movq    xmm0, qword ptr [esp]
        call    start                           ; pushes return addr => X sits at [esp+4] inside
        leave
        ret

;--------------------------------------------------------------
; Argument reduction: X = 2^m * Y, Y in [1,2); R = B*Y - 1
;--------------------------------------------------------------
_log_pentium4 label proc
        movlpd  xmm0, QWORD PTR [4+esp]         ; low lane <- X
start:
        xor     edx, edx                        ; exponent adjustment = 0 (becomes -52 on denormal retry)
DENORMAL_RETRY:
        movapd  xmm5, xmm0
        unpcklpd xmm0, xmm0                     ; [X|X]
        psrlq   xmm5, 52                        ; sign+exponent bits moved to the bottom
        pextrw  ecx, xmm5, 0                    ; ecx <- sign | biased exponent
        movapd  xmm1, QWORD PTR [emask]         ; fraction mask
        movapd  xmm3, QWORD PTR [One]           ; [1|1]
        movapd  xmm4, QWORD PTR [Magic]
        movapd  xmm6, QWORD PTR [hi_mask]
        andpd   xmm0, xmm1
        orpd    xmm0, xmm3                      ; [Y|Y], fraction of X with exponent of 1.0
        addpd   xmm4, xmm0                      ; low 11 bits of the sum now encode the table index:
                                                ;   bits 4..10 = index, bits 0..3 ignored
        pextrw  eax, xmm4, 0
        and     eax, 000007F0H                  ; eax = index*16 = byte offset into both tables
        movapd  xmm4, QWORD PTR [eax+B_Tbl]     ; [B|B]
        movapd  xmm7, QWORD PTR [eax+T_Tbl]     ; [T_lo|T_hi]
        andpd   xmm6, xmm0                      ; [Y_hi|Y_hi]
        subpd   xmm0, xmm6                      ; [Y_lo|Y_lo]
        mulpd   xmm6, xmm4                      ; [B*Y_hi|B*Y_hi]
        subpd   xmm6, xmm3                      ; [R_hi|R_hi]
        addsd   xmm7, xmm6                      ; [T_lo|T_hi+R_hi]
        mulpd   xmm0, xmm4                      ; [R_lo|R_lo]
        movapd  xmm4, xmm0                      ; keep R_lo for the final placement
        addpd   xmm0, xmm6                      ; [R|R]

;--------------------------------------------------------------
; Polynomial approximation and reconstruction, interleaved.
; Live here: ecx = sign|exponent, eax = table offset,
;            xmm0 = [R|R], xmm4 = [R_lo|R_lo], xmm7 = [T_lo|T_hi+R_hi]
;--------------------------------------------------------------
        and     ecx, 00000FFFH                  ; 12 bits: sign + biased exponent
        sub     ecx, 1
        cmp     ecx, 2045                       ; unsigned: in range only for positive normal X
        ja      SPECIAL_CASES                   ; zero, denormal, inf, NaN or negative
        sub     ecx, 1022                       ; ecx = m (unbiased exponent)
        add     ecx, edx                        ; apply denormal adjustment
        cvtsi2sd xmm6, ecx
        unpcklpd xmm6, xmm6                     ; [m|m] as doubles
        shl     ecx, 10
        add     eax, ecx                        ; 16*(64*m + j); zero iff (m=-1, j=64) or (m=0, j=0)
        mov     ecx, 16
        xor     edx, edx
        test    eax, eax
        cmove   edx, ecx                        ; edx = offset of the place_L mask entry to use
        movapd  xmm1, QWORD PTR [coeff]         ; [p3|p6]
        movapd  xmm3, xmm0
        movapd  xmm2, QWORD PTR [coeff+16]      ; [p2|p5]
        mulpd   xmm1, xmm0                      ; [p3 R|p6 R]
        mulpd   xmm3, xmm3                      ; [R^2|R^2]
        addpd   xmm1, xmm2                      ; [p2+p3 R|p5+p6 R]
        movapd  xmm2, QWORD PTR [coeff+32]      ; [p1|p4]
        mulsd   xmm3, xmm3                      ; [R^2|R^4]
        movapd  xmm5, QWORD PTR [LOG_2]         ; [L_lo|L_hi]
        mulpd   xmm6, xmm5                      ; [m L_lo|m L_hi]
        movapd  xmm5, QWORD PTR [edx+place_L]   ; lane-select mask
        andpd   xmm4, xmm5                      ; [R_lo|0] or [0|R_lo]
        addpd   xmm7, xmm6                      ; [W_lo|W_hi]
        addpd   xmm7, xmm4                      ; [A_lo|A_hi], R_lo folded into the chosen lane
        mulpd   xmm1, xmm0                      ; [p2 R+p3 R^2|p5 R+p6 R^2]
        mulsd   xmm3, xmm0                      ; [R^2|R^5]
        addpd   xmm1, xmm2                      ; [p1+..|p4+..]
        movapd  xmm6, xmm7
        unpckhpd xmm6, xmm6                     ; low lane <- A_lo
        mulpd   xmm1, xmm3                      ; [P_hi|P_lo]
        sub     esp, 16                         ; scratch for the xmm -> x87 transfer
        movapd  xmm0, xmm1
        unpckhpd xmm1, xmm1                     ; low lane <- P_hi
        addsd   xmm0, xmm1                      ; P = P_lo + P_hi
        addsd   xmm0, xmm6                      ; + A_lo
        addsd   xmm0, xmm7                      ; + A_hi
        movlpd  QWORD PTR [esp+4], xmm0
        fld     QWORD PTR [esp+4]               ; return value in ST(0)
        add     esp, 16
        ret

;--------------------------------------------------------------
; Everything that is not a positive normal number.
; ecx still holds (sign|biased exponent) - 1 from the range test.
;--------------------------------------------------------------
SPECIAL_CASES:
        movlpd  xmm0, QWORD PTR [4+esp]         ; reload the original X
        movapd  xmm1, QWORD PTR [Zero]
        $cmpsd  xmm1, xmm0, 0                   ; low lane all-ones iff X == +-0.0
        pextrw  eax, xmm1, 0
        test    eax, eax
        jnz     INPUT_ZERO
        cmp     ecx, -1                         ; sign 0, exponent 0, non-zero => positive denormal
        je      INPUT_DENORM
        cmp     ecx, 000007FEH                  ; anything above has the sign bit set
        ja      INPUT_NEGATIVE
        ; Remaining: positive with exponent 7FF, i.e. +inf or NaN.
        movlpd  xmm0, QWORD PTR [4+esp]
        movapd  xmm1, QWORD PTR [emask]
        movapd  xmm2, QWORD PTR [One]
        andpd   xmm0, xmm1
        orpd    xmm0, xmm2                      ; equals 1.0 iff the fraction field is zero (+inf)
        $cmpsd  xmm2, xmm0, 0
        pextrw  eax, xmm2, 0                    ; zero when X is NaN
        test    eax, eax
        jz      INPUT_NaN

INPUT_INF:
        fld     QWORD PTR [Infs+8]              ; log(+inf) = +inf
        ret

INPUT_NaN:
        ; An inline "return X+X" path used to live here (disabled in the
        ; original source); NaNs are now handed to the error handler.
        ; NOTE(review): xmm1 still holds the emask pattern at this point and
        ; is what CALL_LIBM_ERROR stores as the initial retval -- presumably
        ; the handler overwrites it for tag 1000; confirm.
        mov     edx, 1000
        jmp     CALL_LIBM_ERROR

INPUT_ZERO:
        movlpd  xmm2, QWORD PTR [One]
        divsd   xmm2, xmm0                      ; 1/0: raises Divide-by-Zero
        movlpd  xmm1, QWORD PTR [Infs]          ; retval = -inf
        mov     edx, 2
        jmp     CALL_LIBM_ERROR

INPUT_DENORM:
        ; Zero was filtered out above, so X is a genuine positive denormal.
        movlpd  xmm1, Two52
        mulsd   xmm0, xmm1                      ; scale by 2^52 into the normal range
        mov     edx, -52                        ; and compensate in the exponent
        jmp     DENORMAL_RETRY

INPUT_NEGATIVE:
        add     ecx, 1                          ; undo the earlier -1
        and     ecx, 7ffH                       ; biased exponent only
        cmp     ecx, 7ffH
        jae     NEG_INF_NAN                     ; exponent all ones: -inf or negative NaN
NEG_NORMAL_INFINITY:
        xorpd   xmm1, xmm1
        divsd   xmm1, xmm1                      ; 0/0: raises Invalid, retval = NaN
        mov     edx, 3

CALL_LIBM_ERROR:
        ; __libm_error_support(void *arg1, void *arg2, void *retval, error_types tag)
        ; In : edx = tag, xmm1 (low lane) = initial retval.
        ; Frame after the sub: [esp+0..15] four arguments, [esp+16] retval.
        ; esp+32 is the address that was [esp+4] on entry here, i.e. X.
        sub     esp, 28
        movlpd  QWORD PTR [esp+16], xmm1
        mov     DWORD PTR [esp+12], edx         ; tag
        lea     edx, [esp+16]
        mov     DWORD PTR [esp+8], edx          ; &retval
        lea     edx, [esp+32]
        mov     DWORD PTR [esp+4], edx          ; arg2 = &X
        mov     DWORD PTR [esp], edx            ; arg1 = &X
        call    NEAR PTR __libm_error_support
        fld     QWORD PTR [esp+16]              ; return retval in ST(0)
        add     esp, 28
        ret

NEG_INF_NAN:
        movlpd  xmm2, QWORD PTR [esp+4]
        movlpd  xmm0, QWORD PTR [esp+4]
        movd    eax, xmm2                       ; low 32 bits of X
        psrlq   xmm2, 32
        movd    ecx, xmm2                       ; high 32 bits of X
        and     ecx, 0fffffH                    ; drop sign/exponent, keep fraction bits
        or      eax, ecx                        ; ZF set iff the whole fraction is zero
        jz      NEG_NORMAL_INFINITY             ; -inf is treated like any negative input
        ; Negative NaN: same handling as INPUT_NaN (the inline X+X return
        ; is disabled in the original source as well).
        mov     edx, 1000
        jmp     CALL_LIBM_ERROR
_CIlog_pentium4 ENDP
        ALIGN   16
_TEXT ENDS
END