// Source: mirror of https://github.com/lianthony/NT4.0
// TITLE("Alpha AXP Hypotenuse")
//++
//
// Copyright (c) 1993, 1994 Digital Equipment Corporation
//
// Module Name:
//
// hypot.s
//
// Abstract:
//
// This module implements a high-performance Alpha AXP specific routine
// for IEEE double format hypotenuse.
//
// Author:
//
// Bill Gray (rtl::gray) 30-Jun-1993
//
// Environment:
//
// User mode.
//
// Revision History:
//
// Thomas Van Baak (tvb) 15-Feb-1994
//
// Adapted for NT.
//
//--

#include "ksalpha.h"

//
// Define DPML exception record for NT.
//

// Byte offsets of the fields of the DPML exception record.  The ".struct 0"
// origin makes every label below an offset from the start of the record;
// _hypot addresses the fields as ExRec + ErXxx(sp).

        .struct 0
ErErr:  .space  4                       // error code
ErCxt:  .space  4                       // context
ErPlat: .space  4                       // platform
ErEnv:  .space  4                       // environment
ErRet:  .space  4                       // return value pointer
ErName: .space  4                       // function name
ErType: .space  8                       // flags and fill
ErVal:  .space  8                       // return value
ErArg0: .space  8                       // arg 0
ErArg1: .space  8                       // arg 1
ErArg2: .space  8                       // arg 2
ErArg3: .space  8                       // arg 3
DpmlExceptionLength:                    // record size: 6*4 + 6*8 = 72 bytes

//
// Define stack frame.
//
// Temp0 and Temp1 are 8-byte scratch slots; _hypot uses them to move values
// between the floating-point and integer register files.  ExRec is the DPML
// exception record filled in on the overflow path.  With the 72-byte record
// the frame is 8 + 8 + 72 + 8 = 96 bytes, a multiple of 16.
//

        .struct 0
Temp0:  .space  8                       // save argument
Temp1:  .space  8                       // save argument
ExRec:  .space  DpmlExceptionLength     // exception record
        .space  8                       // for 16-byte stack alignment
FrameLength:

//
// Define lower and upper 32-bit parts of 64-bit double.
//
// These are byte offsets within an 8-byte stack slot.  The longword at
// HighPart is the one _hypot masks with 0x7ff00000 to obtain the exponent
// field, i.e. it holds the sign, the exponent and the top fraction bits.
//

#define LowPart  0x0
#define HighPart 0x4

SBTTL("Hypotenuse")

//++
//
// double
// _hypot (
//    IN double x
//    IN double y
//    )
//
// Routine Description:
//
//    This function returns the hypotenuse for the given x, y values:
//    double hypot(double x, double y) = sqrt(x*x + y*y).
//
// Arguments:
//
//    x (f16) - Supplies the x argument value.
//
//    y (f17) - Supplies the y argument value.
//
// Return Value:
//
//    The double result is returned as the function value in f0.
//
// Register roles inside this routine:
//
//    a1     - return address, copied from ra in the prologue and used by
//             the final ret.
//    t1     - 0x7ff00000, the exponent-field mask for the high longword of
//             a double; it must stay intact until no_overflow reads it.
//    t0, v0 - masked exponent fields of x and y (still in bit position
//             20..30 of the high longword).
//    t4     - scale, in exponent-field units.  Zero means the inputs were
//             not scaled and the square root is returned directly.
//    f1     - unscale factor built at local label 20 and consumed by
//             do_unscale.
//
//--

        NESTED_ENTRY(_hypot, FrameLength, ra)

        lda     sp, -FrameLength(sp)    // allocate stack frame
        mov     ra, a1                  // save return address

        PROLOGUE_END

// This implementation first checks for special cases: infinities, NaNs, zeros
// and denormalized numbers. Then it scales both numbers to avoid intermediate
// underflow and overflow. Once the scaled result of Hypot(x, y) is
// calculated, it checks for possible overflow before scaling up the result.

//
// Fast-path test: an argument is "safe" when its exponent field e satisfies
// 0x200 <= e < 0x200 + 0x3fd.  The unsigned compare of (e - 0x200) against
// 0x3fd checks both bounds at once.  If both arguments are safe the squares
// are summed without any scaling.
//

        ldah    t1, 0x7ff0(zero)        // exponent field mask
        stt     f17, Temp0(sp)          // y to memory ...
        stt     f16, Temp1(sp)          // x to memory ...
        ldl     t0, Temp1 + HighPart(sp) // Get exp of x
        ldah    t2, 0x2000(zero)        // lowest safe exponent field (0x200)
        ldl     v0, Temp0 + HighPart(sp) // Get exp of y
        ldah    t5, 0x3fd0(zero)        // width of the safe window (0x3fd)
        mov     zero, t4                // scale = 0: no unscaling required
        and     t0, t1, t0              // mask
        subl    t0, t2, t3              // exp_x - window base
        and     v0, t1, v0              // mask
        subl    v0, t2, t2              // exp_y - window base
        cmpult  t3, t5, t3              // t3 = x is safe
        cmpult  t2, t5, t2              // t2 = y is safe
        beq     t3, scale_input
        bne     t2, calculate_hypot

//
// We get here if simply squaring and adding will cause an intermediate
// overflow or underflow. Consequently we need to scale the arguments
// before proceeding. In the IEEE case: NaN's, Inf's and denorms come
// through here. Split them out as special cases here.
//

scale_input:
        and     t0, t1, t3
        ldah    t5, 0x10(zero)          // one unit of the exponent field
        and     v0, t1, t4
        subl    t3, t5, t3              // exp_x - 1
        ldah    t2, 0x7fe0(zero)
        subl    t4, t5, t4              // exp_y - 1
        cmpult  t3, t2, t3              // t3 = (1 <= exp_x <= 0x7fe)
        cmpult  t4, t2, t2              // t2 = (1 <= exp_y <= 0x7fe)
        beq     t3, classify            // exp_x abnormal? goto classify
        beq     t2, classify            // exp_y abnormal? goto classify

        subl    t0, v0, t3              // diff = exp_x - exp_y
        ldah    t5, 0x360(zero)         // 0x36 = 54 exponent units
        blt     t3, 10f                 // if diff < 0, goto 10

        ldah    t2, 0x4000(zero)
        cmple   t3, t5, t5              // if (diff > scale) goto return_abs_x
        subl    t0, t2, t0              // precompute exp_x - SCALE_ADJUST
        beq     t5, return_abs_x

        mov     t0, t4                  // scale = exp_x - SCALE_ADJUST
        br      zero, 20f
10:
        ldah    t5, -0x360(zero)
        ldah    t2, 0x4000(zero)
        cmplt   t3, t5, t3              // if (diff < -scale) goto return_abs_y
        subl    v0, t2, v0
        bne     t3, return_abs_y
        mov     v0, t4                  // scale = exp_y - SCALE_ADJUST
20:
        //
        // Make floats for the scale factor and unscale factor
        //
        // Both are pure powers of two: only the high longword is computed,
        // the low longword is stored as zero.
        //
        ldah    t0, 0x3ff0(zero)
        subl    t0, t4, t0              // scale factor exponent = 0x3ff - scale
        stl     t0, Temp0 + HighPart(sp)
        ldah    v0, 0x4000(zero)
        stl     zero, Temp0(sp)
        addl    t4, v0, v0              // unscale factor exponent = scale + 0x400
        ldt     f0, Temp0(sp)           // f0 = scale factor
        stl     v0, Temp0 + HighPart(sp)
        stl     zero, Temp0(sp)
        ldt     f1, Temp0(sp)           // f1 = unscale factor
        mult    f0, f16, f16            // x *= scale_factor
        mult    f0, f17, f17            // y *= scale_factor
        br      zero, calculate_hypot

classify:
//
// Classify x
//
// The class code left in t2 is: 0/1 NaN (value of fraction bit 51),
// 2/3 infinity, 4/5 normal, 6/7 denormal, 8/9 zero; for the last four the
// low bit is the sign bit.
//
classify_x:
        stt     f16, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)
        zapnot  t5, 0xf, t3             // zero-extend the high longword
        and     t5, t1, t4              // t4 = exponent field
        srl     t3, 31, t3
        and     t3, 1, t3               // t3 = sign bit
        beq     t4, 30f                 // exponent 0: zero or denormal
        cmpult  t4, t1, t4
        beq     t4, 10f                 // exponent all ones: Inf or NaN
        addl    t3, 4, t2               // normal
        br      zero, classify_y
10:
        ldah    t6, 0x10(zero)
        ldl     t4, Temp0(sp)           // low fraction bits
        lda     t6, -1(t6)              // t6 = 0x000fffff
        and     t5, t6, t6              // high fraction bits
        stl     t6, Temp0 + HighPart(sp)
        bis     t6, t4, t4              // whole fraction == 0 ?
        srl     t6, 19, t6
        beq     t4, 20f                 // yes: infinity
        and     t6, 1, t6               // NaN: keep top fraction bit
        mov     t6, t2
        br      zero, classify_y
20:
        addl    t3, 2, t2               // infinity
        br      zero, classify_y
30:
        ldl     t7, Temp0(sp)           // low fraction bits
        ldah    t4, 0x10(zero)
        lda     t4, -1(t4)              // t4 = 0x000fffff
        and     t5, t4, t4              // high fraction bits
        bis     t4, t7, t7              // whole fraction == 0 ?
        stl     t4, Temp0 + HighPart(sp)
        mov     6, t6                   // assume denormal
        cmoveq  t7, 8, t6               // fraction is zero: it is a zero
        addl    t3, t6, t2

//
// Classify y
//
// Same encoding as for x; the class code is left in t6.
//
classify_y:
        stt     f17, Temp0(sp)
        ldl     t4, Temp0 + HighPart(sp)
        zapnot  t4, 0xf, t5             // zero-extend the high longword
        and     t4, t1, t3              // t3 = exponent field
        srl     t5, 31, t5
        and     t5, 1, t5               // t5 = sign bit
        beq     t3, 30f                 // exponent 0: zero or denormal
        cmpult  t3, t1, t3
        beq     t3, 10f                 // exponent all ones: Inf or NaN

        addl    t5, 4, t6               // normal
        br      zero, special_args
10:
        ldah    t3, 0x10(zero)
        ldl     t7, Temp0(sp)           // low fraction bits
        lda     t3, -1(t3)              // t3 = 0x000fffff
        and     t4, t3, t3              // high fraction bits
        bis     t3, t7, t7              // whole fraction == 0 ?
        stl     t3, Temp0 + HighPart(sp)
        beq     t7, 20f                 // yes: infinity
        srl     t3, 19, t3
        and     t3, 1, t3               // NaN: keep top fraction bit
        mov     t3, t6
        br      zero, special_args
20:
        addl    t5, 2, t6               // infinity
        br      zero, special_args
30:
        ldl     a0, Temp0(sp)           // low fraction bits
        ldah    t7, 0x10(zero)
        lda     t7, -1(t7)              // t7 = 0x000fffff
        and     t4, t7, t4              // high fraction bits
        bis     t4, a0, a0              // whole fraction == 0 ?
        stl     t4, Temp0 + HighPart(sp)
        mov     6, t3                   // assume denormal
        cmoveq  a0, 8, t3               // fraction is zero: it is a zero
        addl    t5, t3, t6

//
// If we get to here we know that at least one of x and y is a NaN, Inf,
// denorm or zero; the other argument may be of any class.
//
// Halving a class code drops its low bit, leaving 0 NaN, 1 Inf, 2 normal,
// 3 denorm, 4 zero.  The pair is combined as class(x)*5 + class(y) and used
// to index the 25-entry Switch table of 32-bit code addresses.
//
special_args:
        sra     t2, 1, t2               // Classify x
        sra     t6, 1, t6               // Classify y
        s4addl  t2, t2, t2              // class(x) * 5
        addl    t2, t6, t2              // Combine
        cmpule  t2, 24, t12             // Sanity check
        beq     t12, scale_up_denorm_input

        lda     t12, Switch
        s4addl  t2, t12, t12            // &Switch[index], 4 bytes per entry
        ldl     t12, 0(t12)
        jmp     zero, (t12)

//
// x and y are zero -- return 0
//
ret_zero:
        cpys    f31, f31, f0
        br      zero, done

//
// x is a NaN - return x
//
ret_x:
        cpys    f16, f16, f0
        br      zero, done

//
// y is a NaN, but x isn't - return y
//
ret_y:
        cpys    f17, f17, f0
        br      zero, done

//
// y is a denorm; if |x| is large enough, just return |x|
//
y_denorm:
        ldah    t4, 0x1c0(zero)         // if (exp_x >= LARGE)
        cmpult  t0, t4, t4
        beq     t4, return_abs_x        //   goto return_abs_x
        br      zero, scale_up_denorm_input

//
// x is a denorm; if |y| is large enough, just return |y|
//
x_denorm:
        ldah    t7, 0x1c0(zero)         // if (exp_y >= LARGE)
        cmpult  v0, t7, t7
        beq     t7, return_abs_y        //   goto return_abs_y

//
// Scale x and y up by 2^F_PRECISION and adjust exp_x and exp_y
// accordingly. With x and y scaled into the normal range, we can
// rejoin the main logic flow for computing hypot(x, y)
//
// t4 is set to -0x400 exponent units, which later steers start_unscale
// onto the path that subtracts 0x400 from the result's exponent.
//
scale_up_denorm_input:
        //
        // if (exp_x is non-zero) put exp_x - scale in x's exponent field
        //
        ldah    t4, -0x4000(zero)       // scale
        beq     t0, 10f
        stt     f16, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)
        ldah    t2, -0x7ff0(zero)
        ldah    t6, 0x4000(zero)
        lda     t2, -1(t2)              // t2 = 0x800fffff: sign + fraction
        addl    t0, t6, t0              // exp_x + 0x400
        and     t5, t2, t2              // strip old exponent
        bis     t2, t0, t0              // insert new exponent
        stl     t0, Temp0 + HighPart(sp)
        ldt     f16, Temp0(sp)
        br      zero, 20f
10:     //
        // else `denorm-to-norm'
        //
        // Attach the fraction of x to the sign and exponent of 4.0, then
        // subtract 4.0.  What remains is the fraction as a normalized,
        // non-negative value (the sign of x is not carried over).
        //
        ldt     f0, Four
        cpyse   f0, f16, f10
        subt    f10, f0, f16
20:
        //
        // if (exp_y is non-zero) put exp_y - scale in y's exponent field
        //
        beq     v0, 30f
        stt     f17, Temp0(sp)
        ldl     t6, Temp0 + HighPart(sp)
        ldah    t2, -0x7ff0(zero)
        ldah    t5, 0x4000(zero)
        lda     t2, -1(t2)              // t2 = 0x800fffff: sign + fraction
        addl    v0, t5, v0              // exp_y + 0x400
        and     t6, t2, t2              // strip old exponent
        bis     t2, v0, v0              // insert new exponent
        stl     v0, Temp0 + HighPart(sp)
        ldt     f17, Temp0(sp)
        br      zero, 40f
30:
        ldt     f0, Four                // same denorm-to-norm step as for x
        cpyse   f0, f17, f10
        subt    f10, f0, f17
40:

calculate_hypot:
        //
        // Compute z = sqrt(x*x + y*y) directly
        //
        // The instruction order below interleaves integer and floating-point
        // work; keep it as is.  Each __sqrt_t_table entry is read as 16
        // bytes: two singles at offsets 0 and 4 and a double at offset 8.
        //
        mult    f16, f16, f0            // x^2
        mult    f17, f17, f10           // y^2
        ldt     f11, One
        lda     t6, __sqrt_t_table      // We compute sqrt(x) inline
        ldah    t2, -0x7fe0(zero)
        lda     t2, -1(t2)              // t2 = 0x801fffff
        ldah    v0, 0x3fe0(zero)        // Half bias
        addt    f0, f10, f0             // x^2 + y^2
        stt     f0, Temp0(sp)           // To mem and back ...
        ldl     t3, Temp0 + HighPart(sp) // ... for exp & mantissa bits
        cpyse   f11, f0, f12            // fraction of the sum, exponent of 1.0
        sra     t3, 13, t5              // low exp + high mantissa bits
        and     t3, t2, t2              // sign, low exponent bit, fraction
        and     t5, 0xff, t5            // masked
        addl    t5, t5, t5
        s8addl  t5, zero, t5            // table index
        mult    f12, f12, f14
        addl    t6, t5, t5              // address of coefficients
        bis     t2, v0, t0
        lds     f10, 4(t5)
        xor     t3, t2, t2              // remaining (high) exponent bits
        lds     f13, 0(t5)
        addl    t2, v0, v0
        ldt     f15, 8(t5)
        zapnot  v0, 0xf, v0
        mult    f10, f12, f10           // evaluate poly
        mult    f14, f13, f13
        stl     t0, Temp0 + HighPart(sp) // check for overflow below
        sll     v0, 31, v0              // halve the exponent, into high longword
        ldt     f12, Temp0(sp)
        stq     v0, Temp0(sp)
        ldt     f14, Temp0(sp)
        addt    f15, f10, f10
        addt    f13, f10, f10
        ldt     f13, Lsb                // To check for correct rounding
        //
        // Perform a Newton's iteration
        //
        mult    f10, f12, f12
        mult    f12, f10, f10
        mult    f12, f14, f12
        subt    f11, f10, f10
        addt    f12, f12, f15
        mult    f12, f10, f10
        mult    f12, f13, f12
        addt    f15, f10, f10
        //
        // Check for correctly rounded results
        //
        ldt     f15, Half               // also used by start_unscale
        subt    f10, f12, f14
        addt    f10, f12, f12
        multc   f10, f14, f11
        multc   f10, f12, f13
        cmptle  f0, f11, f11
        cmptlt  f13, f0, f0
        fcmoveq f11, f10, f14
        fcmoveq f0, f14, f12
        bne     t4, start_unscale       // inputs were scaled: undo it

        cpys    f12, f12, f0            // Return result in f0
        br      zero, done

//
// The inputs were scaled; f12 holds z, the square root of the scaled sum,
// and t4 holds the scale.
//
start_unscale:
        ldah    t0, 0x3fd0(zero)        // MAX_SCALE
        mult    f12, f15, f15           // w = TWO_POW_M_T * z
        //
        // if ((scale > MAX_SCALE) && (z >= MAX_Z)) then overflow
        //
        cmple   t4, t0, t0
        bne     t0, no_overflow
        ldt     f13, Four               // MAX_Z
        lda     a0, hypotName
        ldah    v0, 0x800(zero)
        lda     v0, 14(v0)              // v0 = 0x0800000e
        cmptle  f13, f12, f13
        fbeq    f13, no_overflow
        //
        // Report overflow (800/14)
        //
        // NOTE(review): this point is only reached after f16/f17 were
        // multiplied by the scale factor, so ErArg0/ErArg1 receive the scaled
        // values rather than the caller's arguments -- confirm whether users
        // of the record expect the originals.
        //
        // NOTE(review): the record address is passed in v0, the result is
        // read back through v0, and the return address kept in a1 must
        // survive the call.  That presumes a non-standard convention for
        // __dpml_exception -- confirm against its definition.
        //
        stl     a0, ExRec + ErName(sp)
        stt     f16, ExRec + ErArg0(sp)
        stt     f17, ExRec + ErArg1(sp)
        stl     v0, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)
        br      zero, done

//
// No overflow.  A scale of MIN_SCALE or more is undone with a multiply by
// the unscale factor.  A smaller scale only arises from
// scale_up_denorm_input; there the exponent of z is lowered by 0x400
// directly, with a separate path when the result is itself denormal.
//
no_overflow:
        ldah    t5, -0x3ff0(zero)       // MIN_SCALE
        cmplt   t4, t5, t4              // if (scale >= MIN_SCALE)
        beq     t4, do_unscale          //   goto do_unscale;

        stt     f12, Temp0(sp)
        ldl     t7, Temp0 + HighPart(sp)
        ldah    a0, 0x4000(zero)
        and     t7, t1, t1              // t1 = exponent field of z
        subl    t1, a0, a0              // a0 = exponent - 0x400
        xor     t7, t1, t7              // t7 = high longword without exponent
        ble     a0, 10f                 // result is not normal

        bis     t7, a0, t7              // insert the lowered exponent
        stl     t7, Temp0 + HighPart(sp)
        ldt     f0, Temp0(sp)
        br      zero, done

        //
        // Denormal result.  t1 becomes 0x40100000 (the high longword of
        // 4.0); z is added to 4.0 so that the hardware rounds z at the
        // position of the denormal fraction, and that exponent is then
        // subtracted back out of the high longword.
        //
10:     subl    t1, a0, t1              // t1 = 0x40000000
        stl     zero, Temp0(sp)
        ldah    t6, 0x10(zero)
        addl    t1, t6, t1              // t1 = 0x40100000
        bis     t7, t1, t7
        ldah    v0, -0x10(zero)         // v0 = 0xfff00000: sign + exponent
        and     t7, v0, v0
        stl     v0, Temp0 + HighPart(sp)
        ldt     f10, Temp0(sp)
        addt    f10, f12, f10
        stt     f10, Temp0(sp)
        ldl     t3, Temp0 + HighPart(sp)
        subl    t3, t1, t1
        stl     t1, Temp0 + HighPart(sp)
        ldt     f0, Temp0(sp)
        br      zero, done

do_unscale:
        mult    f1, f15, f0             // return unscale_factor * w
        br      zero, done

return_abs_y:
        cpys    f31, f17, f0            // f0 = |y|
        br      zero, done

return_abs_x:
        cpys    f31, f16, f0            // f0 = |x|
//      br      zero, done              // falls through to done

//
// Return with result in f0.
//
done:
        lda     sp, FrameLength(sp)     // deallocate stack frame
        ret     zero, (a1)              // return through saved ra in a1

        .end    _hypot

// Read-only data used by _hypot.

        .rdata
        .align  3                       // 2^3 = 8-byte alignment

//
// Define floating point constants.
//

Lsb:    .quad   0x3cb4000000000000      // lsb factor: 5*2^-54

Half:   .double 0.5                     // multiplier for w = 0.5 * z

One:    .double 1.0                     // supplies the exponent for cpyse

Four:   .double 4.0                     // denorm-to-norm base and overflow bound

//
// Switch table indexed by class(x)*5 + class(y)
//
// Classes, as produced by special_args: 0 NaN, 1 Inf, 2 normal, 3 denorm,
// 4 zero.  Each entry is a 32-bit code address that special_args loads with
// ldl and jumps through.
//
// NOTE(review): entries 1 and 5 return the NaN when the other argument is
// infinite.  C99 Annex F specifies +Inf for hypot(+-Inf, NaN) -- confirm
// which behavior callers rely on before changing either entry.
//

Switch:
        .long   ret_x                   //  0: x NaN,    y NaN
        .long   ret_x                   //  1: x NaN,    y Inf
        .long   ret_x                   //  2: x NaN,    y normal
        .long   ret_x                   //  3: x NaN,    y denorm
        .long   ret_x                   //  4: x NaN,    y zero
        .long   ret_y                   //  5: x Inf,    y NaN
        .long   return_abs_y            //  6: x Inf,    y Inf
        .long   return_abs_x            //  7: x Inf,    y normal
        .long   return_abs_x            //  8: x Inf,    y denorm
        .long   return_abs_x            //  9: x Inf,    y zero
        .long   ret_y                   // 10: x normal, y NaN
        .long   return_abs_y            // 11: x normal, y Inf
        .long   scale_up_denorm_input   // 12: x normal, y normal
        .long   y_denorm                // 13: x normal, y denorm
        .long   return_abs_x            // 14: x normal, y zero
        .long   ret_y                   // 15: x denorm, y NaN
        .long   return_abs_y            // 16: x denorm, y Inf
        .long   x_denorm                // 17: x denorm, y normal
        .long   scale_up_denorm_input   // 18: x denorm, y denorm
        .long   return_abs_x            // 19: x denorm, y zero
        .long   ret_y                   // 20: x zero,   y NaN
        .long   return_abs_y            // 21: x zero,   y Inf
        .long   return_abs_y            // 22: x zero,   y normal
        .long   return_abs_y            // 23: x zero,   y denorm
        .long   ret_zero                // 24: x zero,   y zero

//
// Function name for dpml_exception.
//
// The address of this NUL-terminated string is stored into the ErName field
// of the exception record on the overflow path.
//

hypotName:
        .ascii  "_hypot\0"