;// ;// INTEL CORPORATION PROPRIETARY INFORMATION ;// This software is supplied under the terms of a license agreement or ;// nondisclosure agreement with Intel Corporation and may not be copied ;// or disclosed except in accordance with the terms of that agreement. ;// Copyright (c) 2000 Intel Corporation. All Rights Reserved. ;// ;// ; log_wmt.asm ; ; double log(double); ; ; Initial version: 12/15/2000 ; Updated with bug fixes: 2/20/2001 ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; ;; Another important feature is that we use the table of log(1/B) ;; ;; throughout. To ensure numerical accuracy, we only need to ensure that ;; ;; T(0)_hi = B(last)_hi, T(0)_lo = B(last)_lo. This ensures W_hi = 0 and ;; ;; W_lo = 0 exactly in the case of |X-1| <= 2^(-7). ;; ;; Finally, we do away with the need for extra-precision addition by the ;; ;; following observation. The three pieces at the end are ;; ;; A = W_hi + r_hi; B = r_lo; C = P + W_lo. ;; ;; When W_hi = W_lo = 0, the addition sequence (A+B) + C is accurate as ;; ;; the sum A+B is exact. ;; ;; Otherwise, A + (B+C) is accurate as B is going to be largely shifted ;; ;; off compared to the final result. ;; ;; Hence if we use compare and mask operations to ;; ;; create alpha = (r_lo or 0), beta = (0 or r_lo), Res_hi <- W_hi+alpha, ;; ;; Res_lo <- C + beta, then result is accurately computed as ;; ;; Res_hi+Res_lo. ;; ;; ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; .686P .387 .XMM .MODEL FLAT,C EXTRN C __libm_error_support : NEAR CONST SEGMENT PARA PUBLIC USE32 'CONST' ALIGN 16 emask DQ 000FFFFFFFFFFFFFH, 000FFFFFFFFFFFFFH ; mask off sign/expo field Magic DQ 428FFFFFFFFFF80FH, 428FFFFFFFFFF80FH ; 2^(42)-1+2^(-7) hi_mask DQ 7FFFFFFFFFE00000H, 7FFFFFFFFFE00000H ; mask of bottom 21 bits LOG_2 DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H ; L_hi,L_lo -> [L_lo|L_hi] place_L DQ 0000000000000000H,0FFFFFFFFFFFFFFFFH ; 0,1 -> [FF..FF|00..00] DQ 0FFFFFFFFFFFFFFFFH, 0000000000000000H ; 1,0 -> [00..00|FF..FF] One DQ 3ff0000000000000H, 3ff0000000000000H ; 1,1 Zero DQ 0000000000000000H, 0000000000000000H ; 0,0 Two52 DQ 4330000000000000H, 4330000000000000H ; 2^52 for normalization Infs DQ 0FFF0000000000000H, 7FF0000000000000H ; -inf,+inf --> [+inf|-inf] NaN DQ 7FF0000000000001H, 7FF0000000000001H ; NaN for log(-ve), log(Nan) coeff DQ 3FC24998090DC555H, 0BFCFFFFFFF201E13H ; p6,p3 ->[p3|p6] DQ 0BFC555C54DD57D75H, 3FD55555555555A7H ; p5,p2 ->[p2|p5] DQ 3FC9999998867A53H, 0BFE000000000001CH ; p4,p1 ->[p1|p4] ;-------Table B----------- B_Tbl DQ 3FF0000000000000H, 3FF0000000000000H DQ 3FEF820000000000H, 3FEF820000000000H DQ 3FEF080000000000H, 3FEF080000000000H DQ 3FEE920000000000H, 3FEE920000000000H DQ 3FEE1E0000000000H, 3FEE1E0000000000H DQ 3FEDAE0000000000H, 3FEDAE0000000000H DQ 3FED420000000000H, 3FED420000000000H DQ 3FECD80000000000H, 3FECD80000000000H DQ 3FEC720000000000H, 3FEC720000000000H DQ 3FEC0E0000000000H, 3FEC0E0000000000H DQ 3FEBAC0000000000H, 3FEBAC0000000000H DQ 3FEB4E0000000000H, 3FEB4E0000000000H DQ 3FEAF20000000000H, 3FEAF20000000000H DQ 3FEA980000000000H, 3FEA980000000000H DQ 3FEA420000000000H, 3FEA420000000000H DQ 3FE9EC0000000000H, 3FE9EC0000000000H DQ 3FE99A0000000000H, 3FE99A0000000000H DQ 3FE9480000000000H, 3FE9480000000000H DQ 3FE8FA0000000000H, 3FE8FA0000000000H DQ 3FE8AC0000000000H, 3FE8AC0000000000H DQ 3FE8620000000000H, 3FE8620000000000H DQ 3FE8180000000000H, 3FE8180000000000H DQ 3FE7D00000000000H, 3FE7D00000000000H DQ 3FE78A0000000000H, 3FE78A0000000000H DQ 3FE7460000000000H, 3FE7460000000000H DQ 3FE7020000000000H, 3FE7020000000000H DQ 3FE6C20000000000H, 3FE6C20000000000H DQ 3FE6820000000000H, 3FE6820000000000H DQ 3FE6420000000000H, 3FE6420000000000H DQ 3FE6060000000000H, 3FE6060000000000H DQ 3FE5CA0000000000H, 3FE5CA0000000000H DQ 3FE58E0000000000H, 3FE58E0000000000H DQ 3FE5560000000000H, 3FE5560000000000H DQ 3FE51E0000000000H, 3FE51E0000000000H DQ 3FE4E60000000000H, 3FE4E60000000000H DQ 3FE4B00000000000H, 3FE4B00000000000H DQ 3FE47A0000000000H, 3FE47A0000000000H DQ 3FE4460000000000H, 3FE4460000000000H DQ 3FE4140000000000H, 3FE4140000000000H DQ 3FE3E20000000000H, 3FE3E20000000000H DQ 3FE3B20000000000H, 3FE3B20000000000H DQ 3FE3820000000000H, 3FE3820000000000H DQ 3FE3520000000000H, 3FE3520000000000H DQ 3FE3240000000000H, 3FE3240000000000H DQ 3FE2F60000000000H, 3FE2F60000000000H DQ 3FE2CA0000000000H, 3FE2CA0000000000H DQ 3FE29E0000000000H, 3FE29E0000000000H DQ 3FE2740000000000H, 3FE2740000000000H DQ 3FE24A0000000000H, 3FE24A0000000000H DQ 3FE2200000000000H, 3FE2200000000000H DQ 3FE1F80000000000H, 3FE1F80000000000H DQ 3FE1D00000000000H, 3FE1D00000000000H DQ 3FE1A80000000000H, 3FE1A80000000000H DQ 3FE1820000000000H, 3FE1820000000000H DQ 3FE15C0000000000H, 3FE15C0000000000H DQ 3FE1360000000000H, 3FE1360000000000H DQ 3FE1120000000000H, 3FE1120000000000H DQ 3FE0EC0000000000H, 3FE0EC0000000000H DQ 3FE0CA0000000000H, 3FE0CA0000000000H DQ 3FE0A60000000000H, 3FE0A60000000000H DQ 3FE0840000000000H, 3FE0840000000000H DQ 3FE0620000000000H, 3FE0620000000000H DQ 3FE0420000000000H, 3FE0420000000000H DQ 3FE0200000000000H, 3FE0200000000000H DQ 3FE0000000000000H, 3FE0000000000000H ;-------Table T_hi,T_lo so that movapd gives [ T_lo | T_hi ] T_Tbl DQ 0000000000000000H, 0000000000000000H DQ 3F8FBEA8B13C0000H, 3CDEC927B17E4E13H DQ 3F9F7A9B16780000H, 3D242AD9271BE7D7H DQ 3FA766D923C20000H, 3D1FF0A82F1C24C1H DQ 3FAF0C30C1114000H, 3D31A88653BA4140H DQ 3FB345179B63C000H, 3D3D4203D36150D0H DQ 3FB6EF528C056000H, 3D24573A51306A44H DQ 3FBA956D3ECAC000H, 3D3E63794C02C4AFH DQ 3FBE2507702AE000H, 3D303B433FD6EEDCH DQ 3FC0D79E7CD48000H, 3D3CB422847849E4H DQ 3FC299D30C606000H, 3D3D4D0079DC08D9H DQ 3FC44F8B726F8000H, 3D3DF6A4432B9BB4H DQ 3FC601B076E7A000H, 3D3152D7D4DFC8E5H DQ 3FC7B00916515000H, 3D146280D3E606A3H DQ 3FC9509AA0044000H, 3D3F1E675B4D35C6H DQ 3FCAF6895610D000H, 3D375BEBBA042B64H DQ 3FCC8DF7CB9A8000H, 3D3EEE42F58E1E6EH DQ 3FCE2A877A6B2000H, 3D3823817787081AH DQ 3FCFB7D86EEE3000H, 3D371FCF1923FB43H DQ 3FD0A504E97BB000H, 3D303094E6690C44H DQ 3FD1661CAECB9800H, 3D2D1C000C076A8BH DQ 3FD22981FBEF7800H, 3D17AF7A7DA9FC99H DQ 3FD2E9E2BCE12000H, 3D24300C128D1DC2H DQ 3FD3A71C56BB4800H, 3D08C46FB5A88483H DQ 3FD4610BC29C5800H, 3D385F4D833BCDC7H DQ 3FD51D1D93104000H, 3D35B0FAA20D9C8EH DQ 3FD5D01DC49FF000H, 3D2740AB8CFA5ED3H DQ 3FD68518244CF800H, 3D28722FF88BF119H DQ 3FD73C1800DC0800H, 3D3320DBF75476C0H DQ 3FD7E9883FA49800H, 3D3FAFF96743F289H DQ 3FD898D38A893000H, 3D31F666071E2F57H DQ 3FD94A0428036000H, 3D30E7BCB08C6B44H DQ 3FD9F123F4BF6800H, 3D36892015F2401FH DQ 3FDA99FCABDB8000H, 3D11E89C5F87A311H DQ 3FDB44977C148800H, 3D3C6A343FB526DBH DQ 3FDBEACD9E271800H, 3D268A6EDB879B51H DQ 3FDC92B7D6BB0800H, 3D10FE9FFF876CC2H DQ 3FDD360E90C38000H, 3D342CDB58440FD6H DQ 3FDDD4AA04E1C000H, 3D32D8512DF01AFDH DQ 3FDE74D262788800H, 3CFEB945ED9457BCH DQ 3FDF100F6C2EB000H, 3D2CCE779D37F3D8H DQ 3FDFACC89C9A9800H, 3D163E0D100EC76CH DQ 3FE02582A5C9D000H, 3D222C6C4E98E18CH DQ 3FE0720E5C40DC00H, 3D38E27400B03FBEH DQ 3FE0BF52E7353800H, 3D19B5899CD387D3H DQ 3FE109EB9E2E4C00H, 3D12DA67293E0BE7H DQ 3FE15533D3B8D400H, 3D3D981CA8B0D3C3H DQ 3FE19DB6BA0BA400H, 3D2B675885A4A268H DQ 3FE1E6DF676FF800H, 3D1A58BA81B983AAH DQ 3FE230B0D8BEBC00H, 3D12FC066E48667BH DQ 3FE2779E1EC93C00H, 3D36523373359B79H DQ 3FE2BF29F9841C00H, 3CFD8A3861D3B7ECH DQ 3FE30757344F0C00H, 3D309BE85662F034H DQ 3FE34C80A8958000H, 3D1D4093FCAC34BDH DQ 3FE39240DDE5CC00H, 3D3493DBEAB758B3H DQ 3FE3D89A6B1A5400H, 3D28C7CD5FA81E3EH DQ 3FE41BCFF4860000H, 3D076FD6B90E2A84H DQ 3FE4635BCF40DC00H, 3D2CE8D5D412CAADH DQ 3FE4A3E862342400H, 3D224FA993F78464H DQ 3FE4E8D015786C00H, 3D38B1C0D0303623H DQ 3FE52A6D269BC400H, 3D30022268F689C9H DQ 3FE56C91D71CF800H, 3CE07BAFD1366E9EH DQ 3FE5AB505B390400H, 3CD5627AF66563FAH DQ 3FE5EE82AA241800H, 3D2202380CDA46BEH DQ 3FE62E42FEFA3800H, 3D2EF35793C76730H ALIGN 16 CONST ENDS $cmpsd MACRO op1, op2, op3 LOCAL begin_cmpsd, end_cmpsd begin_cmpsd: cmppd op1, op2, op3 end_cmpsd: org begin_cmpsd db 0F2h org end_cmpsd ENDM _TEXT SEGMENT PARA PUBLIC USE32 'CODE' ALIGN 16 PUBLIC _log_pentium4, _CIlog_pentium4 _CIlog_pentium4 PROC NEAR push ebp mov ebp, esp sub esp, 8 ; for argument DBLSIZE and esp, 0fffffff0h fstp qword ptr [esp] movq xmm0, qword ptr [esp] call start leave ret ;----------------------; ;--Argument Reduction--; ;----------------------; _log_pentium4 label proc movlpd xmm0, QWORD PTR [4+esp] ;... load X to low part of xmm0 start: mov edx,0 ;... set edx to 0 DENORMAL_RETRY: movapd xmm5,xmm0 unpcklpd xmm0,xmm0 ;... [X|X] psrlq xmm5,52 pextrw ecx,xmm5,0 movapd xmm1, QWORD PTR [emask] ;... pair of 000FF...FF movapd xmm3, QWORD PTR [One] ;... pair of 3FF000...000 movapd xmm4, QWORD PTR [Magic] ;... pair of 2^(42)-1+2^(-7) movapd xmm6, QWORD PTR [hi_mask] ;... pair of 7FFFFFFF..FE00000 andpd xmm0,xmm1 orpd xmm0,xmm3 ;... [Y|Y] addpd xmm4,xmm0 ;... 11 lsb contains the index to B ;... the last 4 lsb are don't cares, the ;... 7 bits following that is the index ;... Hence by masking, we already have index*16 pextrw eax,xmm4,0 and eax,000007F0H ;... eax is offset movapd xmm4, QWORD PTR [eax+B_Tbl] ;... [B|B] movapd xmm7, QWORD PTR [eax+T_Tbl] andpd xmm6,xmm0 ;... [Y_hi|Y_hi] subpd xmm0,xmm6 ;... [Y_lo|Y_lo] mulpd xmm6,xmm4 ;... [B*Y_hi|B*Y_hi] subpd xmm6,xmm3 ;... [R_hi|R_hi] addsd xmm7,xmm6 ;... [T_lo|T_hi+R_hi] mulpd xmm0,xmm4 ;... [R_lo|R_lo] movapd xmm4,xmm0 ;... [R_lo|R_lo] addpd xmm0,xmm6 ;... [R|R] ;-----------------------------------------; ;--Approx and Reconstruction in parallel--; ;-----------------------------------------; ;...m is in ecx, [T_lo,T_hi+R_hi] in xmm7 ;...xmm4 through xmm6 will be used and ecx,00000FFFH ;... note we need sign and biased exponent sub ecx,1 cmp ecx,2045 ;... the largest biased exponent 2046-1 ;... if ecx is ABOVE (unsigned) this, either ;... the sign is +ve and biased exponent is 7FF ;... or the sign is +ve and exponent is 0, or ;... the sign is -ve (i.e. sign bit 1) ja SPECIAL_CASES sub ecx,1022 ;... m in integer format add ecx,edx ;... this is the denormal adjustment cvtsi2sd xmm6,ecx unpcklpd xmm6,xmm6 ;... [m | m] in FP format shl ecx,10 add eax,ecx ;16*(64*m + j) 0 <=> (m=-1 & j=64) or (m=0 & j=0) mov ecx,16 mov edx,0 cmp eax,0 cmove edx,ecx ;this is the index into the mask table (place_{L,R}) movapd xmm1, QWORD PTR [coeff] ;... loading [p3|p6] movapd xmm3,xmm0 movapd xmm2, QWORD PTR [coeff+16] ;... loading [p2|p5] mulpd xmm1,xmm0 ;... [p3 R | p6 R] mulpd xmm3,xmm3 ;... [R^2|R^2] addpd xmm1,xmm2 ;... [p2+p3 R |p5+p6 R] movapd xmm2, QWORD PTR [coeff+32] ;... [p1|p4] mulsd xmm3,xmm3 ;... [R^2|R^4] movapd xmm5, QWORD PTR [LOG_2] ;... loading [L_lo|L_hi] ;... [T_lo|T_hi+R_hi] already in xmm7 mulpd xmm6,xmm5 ;... [m L_lo | m L_hi] movapd xmm5, QWORD PTR [edx+place_L] ;... [FF..FF|00.00] or [00..00|FF..FF] andpd xmm4,xmm5 ;... [R_lo|0] or [0|R_lo] addpd xmm7,xmm6 ;... [W_lo|W_hi] addpd xmm7,xmm4 ;... [A_lo|A_hi] mulpd xmm1,xmm0 ;... [p2 R+p3 R^2|p5 R+p6 R^2] mulsd xmm3,xmm0 ;... [R^2|R^5] addpd xmm1,xmm2 ;... [p1+.. | p4+...] movapd xmm6,xmm7 unpckhpd xmm6,xmm6 ;... [*|A_lo] mulpd xmm1,xmm3 ;... [P_hi|P_lo] sub esp, 16 movapd xmm0,xmm1 ;... copy of [P_hi|P_lo] unpckhpd xmm1,xmm1 ;... [P_hi|P_hi] ;...[P_hi|P_lo] in xmm1 at this point addsd xmm0,xmm1 ;... [*|P] addsd xmm0,xmm6 addsd xmm0,xmm7 movlpd QWORD PTR [esp+4], xmm0 ; return result fld QWORD PTR [esp+4] ; add esp, 16 ret SPECIAL_CASES: movlpd xmm0, QWORD PTR [4+esp] ;... load X again movapd xmm1, QWORD PTR [Zero] $cmpsd xmm1,xmm0,0 pextrw eax,xmm1,0 ;... ones if X = +-0.0 cmp eax,0 ja INPUT_ZERO cmp ecx,-1 ;... ecx = -1 iff X is positive denormal je INPUT_DENORM cmp ecx,000007FEH ja INPUT_NEGATIVE movlpd xmm0, QWORD PTR [4+esp] movapd xmm1, QWORD PTR [emask] movapd xmm2, QWORD PTR [One] andpd xmm0,xmm1 orpd xmm0,xmm2 ;... xmm0 is 1 iff the input argument was +inf $cmpsd xmm2,xmm0,0 pextrw eax,xmm2,0 ;... 0 if X is NaN cmp eax, 0 je INPUT_NaN INPUT_INF: ;....Input is +Inf fld QWORD PTR [Infs+8] ; ret INPUT_NaN: ; movlpd xmm0, QWORD PTR [esp+4] ; addsd xmm0, xmm0 ; sub esp, 16 ; movlpd QWORD PTR [esp+4], xmm0 ; return result ; fld QWORD PTR [esp+4] ; ; add esp, 16 ; ret mov edx, 1000 jmp CALL_LIBM_ERROR INPUT_ZERO: ; raise Divide by Zero movlpd xmm2, QWORD PTR [One] divsd xmm2, xmm0 movlpd xmm1, QWORD PTR [Infs] mov edx, 2 jmp CALL_LIBM_ERROR INPUT_DENORM: ;....check for zero or denormal ;....for now I assume this is simply denormal ;....in reality, we need to check for zero and handle appropriately movlpd xmm1,Two52 mulsd xmm0,xmm1 mov edx,-52 ;...set adjustment to exponent jmp DENORMAL_RETRY ;...branch back INPUT_NEGATIVE: add ecx,1 and ecx, 7ffH cmp ecx, 7ffH jae NEG_INF_NAN NEG_NORMAL_INFINITY: ; xmm1=0 xorpd xmm1, xmm1 ; raise Invalid divsd xmm1, xmm1 mov edx, 3 CALL_LIBM_ERROR: ;call libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag) sub esp, 28 movlpd QWORD PTR [esp+16], xmm1 mov DWORD PTR [esp+12],edx mov edx, esp add edx,16 mov DWORD PTR [esp+8],edx add edx,16 mov DWORD PTR [esp+4],edx mov DWORD PTR [esp],edx call NEAR PTR __libm_error_support ; movlpd xmm0, QWORD PTR [esp+16] ; movlpd QWORD PTR [esp+16], xmm0 ; return result fld QWORD PTR [esp+16] ; add esp,28 ret NEG_INF_NAN: movlpd xmm2, QWORD PTR [esp+4] movlpd xmm0, QWORD PTR [esp+4] movd eax, xmm2 psrlq xmm2, 32 movd ecx, xmm2 and ecx, 0fffffH ; eliminate sign/exponent or eax, ecx cmp eax,0 jz NEG_NORMAL_INFINITY ; negative infinity ; addsd xmm0, xmm0 ; sub esp,16 ; movlpd QWORD PTR [esp+4], xmm0 ; fld QWORD PTR [esp+4] ; add esp, 16 ; ret mov edx, 1000 jmp CALL_LIBM_ERROR _CIlog_pentium4 ENDP ALIGN 16 _TEXT ENDS END