Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

518 lines
16 KiB

// TITLE("Alpha AXP fmod")
//++
//
// Copyright (c) 1993, 1994 Digital Equipment Corporation
//
// Module Name:
//
// fmod.s
//
// Abstract:
//
// This module implements a high-performance Alpha AXP specific routine
// for IEEE double format fmod.
//
// Author:
//
// (ajg) 1-Nov-1991.
//
// Environment:
//
// User mode.
//
// Revision History:
//
// Thomas Van Baak (tvb) 15-Apr-1994
//
// Adapted for NT.
//
//--
#include "ksalpha.h"
//
// Define DPML exception record for NT.
//
.struct 0
ErErr: .space 4 // error code
ErCxt: .space 4 // context
ErPlat: .space 4 // platform
ErEnv: .space 4 // environment
ErRet: .space 4 // return value pointer
ErName: .space 4 // function name
ErType: .space 8 // flags and fill
ErVal: .space 8 // return value
ErArg0: .space 8 // arg 0
ErArg1: .space 8 // arg 1
ErArg2: .space 8 // arg 2
ErArg3: .space 8 // arg 3
DpmlExceptionLength:
//
// Define stack frame.
//
.struct 0
SaveRa: .space 8
Temp0: .space 8 // to extract exponent from p, etc
Temp1: .space 8 // to extract exponent from q, etc
ExRec: .space DpmlExceptionLength // exception record
.space 0 // for 16-byte stack alignment
FrameLength:
//
// Define lower and upper 32-bit parts of 64-bit double.
//
#define LowPart 0x0
#define HighPart 0x4
SBTTL("fmod")
//++
//
// double
// fmod (
// IN double x
// IN double y
// )
//
// Routine Description:
//
// This function returns the modulus of the two given double arguments.
// The fmod function is an exactly computable function defined by:
//
// mod(p, q) = p - trunc(p/q)*q
//
// Arguments:
//
// x (f16) - Supplies the dividend value.
// y (f17) - Supplies the divisor value.
//
// Return Value:
//
// The double modulus result is returned as the function value in f0.
//
//--
NESTED_ENTRY(fmod, FrameLength, ra)
lda sp, -FrameLength(sp) // allocate stack frame
stq ra, SaveRa(sp)
PROLOGUE_END
ldah t1, 0x7ff0(zero) // Load exponent mask
stq zero, Temp0(sp)
stt f16, Temp0(sp)
stt f17, Temp1(sp)
ldl t0, Temp0 + HighPart(sp) // Get exp field of p
ldl v0, Temp1 + HighPart(sp) // Get exp field of q
and t0, t1, t0
xor t0, t1, t2
and v0, t1, v0
beq t2, p_NaN_inf // Goto p_NaN_inf if all 1s
xor v0, t1, t3
cpys f16, f16, f0 // Copy to f0 for return
subl t0, v0, t0
beq t3, q_NaN_inf // Goto q_NaN_inf if all 1s
bge t0, not_quick // A quick check to see if |p| < |q|
br zero, return // If so, just return
not_quick:
ldah t2, 0x350(zero)
ldah t3, 0x340(zero)
cmple v0, t2, t2
cmplt t0, t3, t3
bne t2, small_q // If q is small, goto small_q
// to avoid underflow/denorms
ldah t2, 0x1a0(zero)
beq t3, general_case // If exponent difference is not small enough,
// go do the looping general case
cmplt t0, t2, t2
divtc f16, f17, f0 // r = p/q
ldah t3, -0x400(zero)
beq t2, r_is_big // Can we do a quick extended mul & sub?
stt f17, Temp0(sp) // Yes, the ratio fits in an integer
ldl t2, Temp0(sp)
and t2, t3, t2
stl t2, Temp0(sp)
cvttqc f0, f1 // Integerize the ratio
cvtqt f1, f0
ldt f1, Temp0(sp)
subt f17, f1, f10 // Split into high and low
mult f0, f1, f1 // Multiply by the integerized ratio
mult f0, f10, f0
subt f16, f1, f1 // Subtract the pieces, in this order
subt f1, f0, f0
br zero, return
//
// We need to convert r to an extended precision integer.
//
r_is_big:
stt f17, Temp0(sp)
ldt f10, int_con
ldl t3, Temp0(sp)
ldah t4, -0x400(zero)
cpys f0, f10, f11
ldt f10, denorm_con
and t3, t4, t3
stl t3, Temp0(sp)
addtc f0, f11, f1 // Add big
subt f1, f11, f1 // Subtract big
cpys f1, f10, f0
ldt f10, Temp0(sp)
subt f17, f10, f17 // yhi = (y + 2^(p-1+k)) - 2^(p-1+k)
addt f1, f0, f11
subt f11, f0, f0
mult f0, f10, f11 // Extended precision multiply
subt f1, f0, f1
mult f0, f17, f0
subt f16, f11, f11 // ... and subtract
mult f1, f10, f10
mult f1, f17, f1
subt f11, f0, f0
subt f0, f10, f0
subt f0, f1, f0 // and done
br zero, return
small_q: ble v0, quick_small_q // If exp_q is > 0, use the general case
general_case:
cmptlt f16, f31, f11 // Set flag if p is negative
mov zero, t2
fbeq f11, small_q_rejoin_general_case
mov 1, t2
br zero, small_q_rejoin_general_case
quick_small_q:
cmptlt f16, f31, f1 // Capture sign of p
mov zero, t2
ldt f10, int_con
cpyse f10, f17, f0 // q, normalized
subt f0, f10, f17
stt f17, Temp0(sp)
fbeq f1, 30f
mov 1, t2 // p is negative
30: ldl t4, Temp0 + HighPart(sp)
ldah t3, 0x4320(zero)
and t4, t1, t4
subl t4, t3, t4
mov t4, v0
fbeq f17, q_is_zero // Can't divide by 0
cpyse f10, f16, f11
bne t0, small_q_done // If c == 0, q is denormal too
subt f11, f10, f16 // Ayup, q is denormal, too
stt f16, Temp0(sp)
ldl t5, Temp0 + HighPart(sp) // Check for p == 0. (in case)
fbeq f16, done
and t5, t1, t5
subl t5, t3, t3
mov t3, t0
cmplt t0, v0, t5
beq t5, small_q_done
stt f16, Temp0(sp)
ldl v0, Temp0 + HighPart(sp)
stt f31, Temp1(sp)
ldl t6, Temp1 + HighPart(sp)
ldah t3, 0x10(zero)
ldah t5, 0x4000(zero)
lda t3, -1(t3)
zapnot v0, 7, v0
subl t5, t0, t0
and t6, t3, t3
ldah t5, 0x3ff0(zero)
bis t3, t0, t3
bis v0, t5, v0
stl t3, Temp1 + HighPart(sp)
stl v0, Temp0 + HighPart(sp)
ldt f1, Temp0(sp)
ldt f0, Temp1(sp)
addt f1, f0, f0
beq t2, 40f // Need to negate p?
cpysn f0, f0, f0
40: // Now rescale p
stt f0, Temp0(sp)
ldl t5, Temp0 + HighPart(sp)
subl t5, t0, t0 // Reduce c by q's exponent
stl t0, Temp0 + HighPart(sp)
ldt f16, Temp0(sp)
cpys f16, f16, f0
br zero, return
small_q_done:
subl t0, v0, t0 // Adjust c by q's exponent,
// and fall into ...
small_q_rejoin_general_case:
stt f16, Temp0(sp) // Normalize p and q
stt f17, Temp1(sp)
ldl t3, Temp0 + HighPart(sp)
ldl t6, Temp1 + HighPart(sp)
ldah t4, 0x10(zero)
lda t4, -1(t4)
and t3, t4, t3
ldah t5, 0x4330(zero)
and t6, t4, t6
addl t3, t5, t3
stl t3, Temp0 + HighPart(sp)
addl t6, t5, t6
stl t6, Temp1 + HighPart(sp)
ldt f16, Temp0(sp)
ldt f10, Temp1(sp)
cmptle f10, f16, f11 // If p >= q, then p -= q
fbeq f11, 50f
subt f16, f10, f16
fbeq f16, done // If that makes p == 0, goto done
50: stt f10, Temp0(sp) // Convert q to extended
ldl a0, Temp0(sp)
ldah t3, -0x400(zero)
and a0, t3, t3
stl t3, Temp0(sp)
ldah a0, 0x340(zero)
ldt f1, Temp0(sp)
subt f10, f1, f0 // High and low
//
// Here's the dreaded loop, bane of good fmod implemetors around the world,
// and all-too-often omitted by mediocre fmod implemetors.
//
dread_loop:
subl t0, a0, t0 // Reduce c
blt t0, end_of_loop // End of loop?
stt f16, Temp0(sp) // Scale p
ldl t7, Temp0 + HighPart(sp)
ldah t6, 0x340(zero)
addl t7, t6, t6
stl t6, Temp0 + HighPart(sp)
ldt f11, Temp0(sp)
ldt f12, int_con
divtc f11, f10, f16 // r = p/q
addtc f16, f12, f13 // Add big
ldt f16, denorm_con
subt f13, f12, f12 // Subtract big
cpys f12, f16, f14
addt f12, f14, f13 // Split q into hi & lo, too
subt f13, f14, f13
mult f13, f1, f16 // Extended multiply
subt f12, f13, f12
mult f13, f0, f13
subt f11, f16, f11 // And subtract
mult f12, f1, f14
mult f12, f0, f12
subt f11, f13, f11
subt f11, f14, f11
subt f11, f12, f16
fbne f16, dread_loop // Continue looping ...
cpys f16, f16, f0 // ... unless p == 0, in which case, ...
br zero, return // ... return p
//
// We may need one additional iteration. Fortunately, this one can use
// the faster scaling for p.
//
end_of_loop:
addl t0, a0, t0 // Bump c back up
beq t0, almost_done // And unless it was zero, ...
stt f16, Temp0(sp)
ldl t6, Temp0 + HighPart(sp)
addl t6, t0, t0
stl t0, Temp0 + HighPart(sp)
ldt f13, Temp0(sp)
ldt f11, int_con
divtc f13, f10, f14
ldt f16, denorm_con
addtc f14, f11, f12
subt f12, f11, f11
cpys f11, f16, f14
addt f11, f14, f12
subt f12, f14, f12
mult f12, f1, f16
subt f11, f12, f11
mult f12, f0, f12
subt f13, f16, f13
mult f11, f1, f1
mult f11, f0, f0
subt f13, f12, f12
subt f12, f1, f1
subt f1, f0, f16
//
// Here, manage the final niggling details:
// (a) Make a final check for p == 0,
// (b) Compute the exponent field of the result,
// (c) Check for underflow of the result,
// (d) And give it the sign of p.
almost_done:
fbeq f16, done // p == 0?
stt f16, Temp0(sp)
ldl t6, Temp0 + HighPart(sp)
ldah t0, -0x10(zero)
and t6, t0, t0 // Isolate exponent of p
addl t0, v0, v0 // Add in c
subl v0, t5, v0 // Subtract the 'biased denorm'
ble v0, gen_underflow_or_denormal // Branch if exponent <= 0
stt f16, Temp0(sp)
ldl a0, Temp0 + HighPart(sp)
and a0, t4, t4
bis t4, v0, v0
stl v0, Temp0 + HighPart(sp)
ldt f16, Temp0(sp)
beq t2, done // Was input p >= 0?
cpysn f16, f16, f0 // No; so return -p.
br zero, return
//
// Exceptions
//
gen_underflow_or_denormal:
lda t0, fmodName
stl t0, ExRec + ErName(sp)
ldah t4, 0x800(zero)
stt f16, ExRec + ErArg0(sp)
stt f10, ExRec + ErArg1(sp)
lda t4, 0x38(t4) // MOD_REM_UNDERFLOW
stl t4, ExRec + ErErr(sp)
lda v0, ExRec(sp)
bsr ra, __dpml_exception
ldt f16, 0(v0)
//
// Return p, which is in f16
//
done: cpys f16, f16, f0
br zero, return
q_is_zero:
lda t3, fmodName
stl t3, ExRec + ErName(sp)
ldah t5, 0x800(zero)
stt f16, ExRec + ErArg0(sp)
stt f17, ExRec + ErArg1(sp)
lda t5, 0x39(t5) // MOD_REM_BY_ZERO
stl t5, ExRec + ErErr(sp)
br zero, call_exception
//
// If q is a NaN, return q.
// If q is Inf, return p (p mod Inf = p).
//
q_NaN_inf:
stt f17, Temp0(sp)
ldl t6, Temp0(sp)
ldah t2, 0x10(zero)
ldl t4, Temp0 + HighPart(sp)
lda t2, -1(t2)
and t4, t2, t2
bis t2, t6, t2
and t4, t1, t3
cmpult zero, t2, t2
cmpeq t3, t1, t3
beq t3, 60f
and t3, t2, t3
60: cpys f17, f17, f0 // Return q ...
bne t3, return // ... if it's a NaN
cpys f16, f16, f0 // Otherwise, return p
br zero, return
//
// If p (or q) is a NaN, return p (or q)
// Otherwise, report an exception (MOD_REM_INF)
//
p_NaN_inf:
stt f16, Temp0(sp)
ldl ra, Temp0(sp)
ldah v0, 0x10(zero)
ldl t5, Temp0 + HighPart(sp)
lda v0, -1(v0)
and t5, v0, v0
bis v0, ra, v0
and t5, t1, t7
cmpult zero, v0, v0
cmpeq t7, t1, t7
beq t7, 70f
and t7, v0, t7
beq t7, 70f
cpys f16, f16, f0 // return p
br zero, return
70: stt f17, Temp0(sp)
ldl t4, Temp0 + HighPart(sp)
and t4, t1, t6
cmpeq t6, t1, t1
beq t1, exc_mod_rem_of_inf
ldl t2, Temp0(sp)
ldah t3, 0x10(zero)
lda t3, -1(t3)
and t4, t3, t3
bis t3, t2, t2
cmpult zero, t2, t2
and t1, t2, t1
beq t1, exc_mod_rem_of_inf
cpys f17, f17, f0 // return q
br zero, return
exc_mod_rem_of_inf:
lda t0, fmodName
stl t0, ExRec + ErName(sp)
ldah t7, 0x800(zero)
stt f16, ExRec + ErArg0(sp)
stt f17, ExRec + ErArg1(sp)
lda t7, 0x3a(t7) // MOD_REM_OF_INF
stl t7, ExRec + ErErr(sp)
call_exception:
lda v0, ExRec(sp)
bsr ra, __dpml_exception
ldt f0, 0(v0)
//
// And return
//
return: ldq ra, SaveRa(sp)
lda sp, FrameLength(sp)
ret zero, (ra)
.end fmod
.align 2
.data
denorm_con:
.long 0x0 // Scale factor for normalization
.long 0x44d00000
int_con:
.long 0x0 // 2^TT, to integerize a number
.long 0x43300000
fmodName:
.ascii "fmod\0"