//      TITLE("Alpha AXP Hypotenuse")
//++
//
// Copyright (c) 1993, 1994  Digital Equipment Corporation
//
// Module Name:
//
//    hypot.s
//
// Abstract:
//
//    This module implements a high-performance Alpha AXP specific routine
//    for IEEE double format hypotenuse.
//
// Author:
//
//    Bill Gray (rtl::gray) 30-Jun-1993
//
// Environment:
//
//    User mode.
//
// Revision History:
//
//    Thomas Van Baak (tvb) 15-Feb-1994
//
//        Adapted for NT.
//
//--

#include "ksalpha.h"

//
// Define DPML exception record for NT.
//

        .struct 0                       // offsets are relative to the start of the record
ErErr:  .space  4                       // error code (written by _hypot on overflow)
ErCxt:  .space  4                       // context
ErPlat: .space  4                       // platform
ErEnv:  .space  4                       // environment
ErRet:  .space  4                       // return value pointer
ErName: .space  4                       // function name (written by _hypot on overflow)
ErType: .space  8                       // flags and fill
ErVal:  .space  8                       // return value
ErArg0: .space  8                       // arg 0 (written by _hypot on overflow)
ErArg1: .space  8                       // arg 1 (written by _hypot on overflow)
ErArg2: .space  8                       // arg 2
ErArg3: .space  8                       // arg 3
DpmlExceptionLength:                    // record size: 6*4 + 2*8 + 4*8 = 72 bytes

//
// Define stack frame.
//

        .struct 0                       // offsets are relative to sp once the frame is allocated
Temp0:  .space  8                       // save argument (y at entry); reused as the scratch
                                        // slot for every later FP <-> integer bit transfer
Temp1:  .space  8                       // save argument (x at entry)
ExRec:  .space  DpmlExceptionLength     // exception record
        .space  8                       // for 16-byte stack alignment
FrameLength:                            // 8 + 8 + 72 + 8 = 96 bytes

//
// Define lower and upper 32-bit parts of 64-bit double.
//

#define LowPart 0x0
// The word at HighPart is the one this file masks with 0x7ff00000 for the
// exponent field, tests bit 31 of for the sign, and masks with 0x000fffff
// for the upper fraction bits.
#define HighPart 0x4

        SBTTL("Hypotenuse")

//++
//
// double
// _hypot (
//    IN double x
//    IN double y
//    )
//
// Routine Description:
//
//    This function returns the hypotenuse for the given x, y values:
//        double hypot(double x, double y) = sqrt(x*x + y*y).
//
// Arguments:
//
//    x (f16) - Supplies the x argument value.
//
//    y (f17) - Supplies the y argument value.
//
// Return Value:
//
//    The double result is returned as the function value in f0.
//
//--

        NESTED_ENTRY(_hypot, FrameLength, ra)

        lda     sp, -FrameLength(sp)    // allocate stack frame
        mov     ra, a1                  // save return address
        // NOTE(review): the return address stays in a1 for the whole routine,
        // including across the bsr to __dpml_exception on the overflow path.
        // That presumably relies on __dpml_exception preserving a1 -- confirm.

        PROLOGUE_END

//  This implementation first checks for special cases: infinities, NaNs, zeros
//  and denormalized numbers.  Then it scales both numbers to avoid intermediate
//  underflow and overflow.  Once the scaled result of Hypot(x, y) is
//  calculated, it checks for possible overflow before scaling up the result.
//
//  Register roles set up here and relied on by the code that follows:
//      t0 = exponent field of x (high word & 0x7ff00000)
//      v0 = exponent field of y (high word & 0x7ff00000)
//      t1 = 0x7ff00000, the exponent field mask
//      t4 = scale; zero means the result needs no unscaling
//
//  Fast path: if both exponent fields lie in [0x20000000, 0x5fd00000) the
//  arguments are squared and added directly.

        ldah    t1, 0x7ff0(zero)                // t1 = exponent field mask
        stt     f17, Temp0(sp)                  // spill y so its bits can be read as integers
        stt     f16, Temp1(sp)                  // spill x likewise
        ldl     t0, Temp1 + HighPart(sp)        // Get exp of x
        ldah    t2, 0x2000(zero)                // low edge of the fast-path exponent window
        ldl     v0, Temp0 + HighPart(sp)        // Get exp of y
        ldah    t5, 0x3fd0(zero)                // width of the fast-path exponent window
        mov     zero, t4                        // scale = 0
        and     t0, t1, t0                      // mask
        subl    t0, t2, t3                      // exp_x - low edge
        and     v0, t1, v0                      // mask
        subl    v0, t2, t2                      // exp_y - low edge
        cmpult  t3, t5, t3                      // t3 = (exp_x inside the window), unsigned
        cmpult  t2, t5, t2                      // t2 = (exp_y inside the window), unsigned
        beq     t3, scale_input                 // x outside the window
        bne     t2, calculate_hypot             // both inside: compute directly
                                                // y outside: fall through to scale_input


//
// We get here if simply squaring and adding will cause an intermediate
// overflow or underflow.  Consequently we need to scale the arguments
// before proceeding.  In the IEEE case: NaN's, Inf's and denorms come
// through here.  Split them out as special cases here.
//
scale_input:
        //
        // On entry t0/v0 hold the exponent fields of x/y and t1 the exponent
        // mask.  Anything whose exponent field is not in 1..0x7fe (zero,
        // denormal, Inf, NaN) is handed to classify.  Otherwise the argument
        // with the larger exponent determines the scale, and the smaller one
        // is ignored altogether when the exponents differ by more than 0x36.
        //
        and     t0, t1, t3
        ldah    t5, 0x10(zero)                  // one unit of the exponent field
        and     v0, t1, t4
        subl    t3, t5, t3                      // exp_x - 1
        ldah    t2, 0x7fe0(zero)
        subl    t4, t5, t4                      // exp_y - 1
        cmpult  t3, t2, t3                      // t3 = (exp_x in 1..0x7fe), unsigned
        cmpult  t4, t2, t2                      // t2 = (exp_y in 1..0x7fe), unsigned
        beq     t3, classify            // exp_x abnormal? goto classify
        beq     t2, classify            // exp_y abnormal? goto classify

        subl    t0, v0, t3              // diff = exp_x - exp_y
        ldah    t5, 0x360(zero)         // threshold: exponent fields 0x36 apart
        blt     t3, 10f                 // if diff < 0, goto 10

        ldah    t2, 0x4000(zero)        // SCALE_ADJUST
        cmple   t3, t5, t5              // if (diff > scale) goto return_abs_x
        subl    t0, t2, t0              // precompute exp_x - SCALE_ADJUST
        beq     t5, return_abs_x

        mov     t0, t4                  // scale = exp_x - SCALE_ADJUST
        br      zero, 20f
10:
        ldah    t5, -0x360(zero)        // same threshold, negated
        ldah    t2, 0x4000(zero)        // SCALE_ADJUST
        cmplt   t3, t5, t3              // if (diff < -scale) goto return_abs_y
        subl    v0, t2, v0              // precompute exp_y - SCALE_ADJUST
        bne     t3, return_abs_y
        mov     v0, t4                  // scale = exp_y - SCALE_ADJUST
20:
        //
        // Make floats for the scale factor and unscale factor
        //
        // Both are built as bit patterns in Temp0 with a zero fraction, so
        // each is an exact power of two.
        //
        ldah    t0, 0x3ff0(zero)        // exponent field of 1.0
        subl    t0, t4, t0              // exponent field of the scale factor
        stl     t0, Temp0 + HighPart(sp)
        ldah    v0, 0x4000(zero)
        stl     zero, Temp0(sp)
        addl    t4, v0, v0              // scale + SCALE_ADJUST: exponent field of the unscale factor
        ldt     f0, Temp0(sp)           // f0 = scale factor
        stl     v0, Temp0 + HighPart(sp)
        stl     zero, Temp0(sp)
        ldt     f1, Temp0(sp)           // f1 = unscale factor, consumed by do_unscale
        mult    f0, f16, f16            // x *= scale_factor
        mult    f0, f17, f17            // y *= scale_factor
        br      zero, calculate_hypot

classify:
//
// Each argument is reduced to a small code; the code shifted right by one
// (done in special_args) is the class used to index the Switch table:
//      0/1  NaN       (bit 0 = bit 19 of the high word, the top fraction bit)
//      2/3  infinity  (bit 0 = sign)
//      4/5  normal    (bit 0 = sign)
//      6/7  denormal  (bit 0 = sign)
//      8/9  zero      (bit 0 = sign)
// The code for x is left in t2.  t1 still holds the exponent mask 0x7ff00000.
//
//
// Classify x
//
classify_x:
        stt     f16, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)        // t5 = high word of x
        zapnot  t5, 0xf, t3                     // keep the low 32 bits only
        and     t5, t1, t4                      // t4 = exponent field
        srl     t3, 31, t3
        and     t3, 1, t3                       // t3 = sign bit
        beq     t4, 30f                         // exponent 0: zero or denormal
        cmpult  t4, t1, t4                      // exponent below all-ones?
        beq     t4, 10f                         // exponent all ones: Inf or NaN
        addl    t3, 4, t2                       // normal: code = 4 + sign
        br      zero, classify_y
10:
        ldah    t6, 0x10(zero)
        ldl     t4, Temp0(sp)                   // low fraction word
        lda     t6, -1(t6)                      // t6 = 0x000fffff
        and     t5, t6, t6                      // high fraction bits
        stl     t6, Temp0 + HighPart(sp)        // (Temp0 is overwritten by classify_y before any load)
        bis     t6, t4, t4                      // any fraction bit set?
        srl     t6, 19, t6
        beq     t4, 20f                         // fraction zero: infinity
        and     t6, 1, t6                       // NaN: code = top fraction bit
        mov     t6, t2
        br      zero, classify_y
20:
        addl    t3, 2, t2                       // infinity: code = 2 + sign
        br      zero, classify_y
30:
        ldl     t7, Temp0(sp)                   // low fraction word
        ldah    t4, 0x10(zero)
        lda     t4, -1(t4)                      // t4 = 0x000fffff
        and     t5, t4, t4                      // high fraction bits
        bis     t4, t7, t7                      // any fraction bit set?
        stl     t4, Temp0 + HighPart(sp)        // (Temp0 is overwritten by classify_y before any load)
        mov     6, t6                           // assume denormal
        cmoveq  t7, 8, t6                       // fraction zero: it is a zero
        addl    t3, t6, t2                      // code = 6 or 8, plus sign
                                                // fall through to classify_y

//
// Classify y
//
classify_y: 
        //
        // Same classification as for x, applied to f17; the code is left in
        // t6.  t2 (the code for x) is not touched here.
        //
        stt     f17, Temp0(sp)
        ldl     t4, Temp0 + HighPart(sp)        // t4 = high word of y
        zapnot  t4, 0xf, t5                     // keep the low 32 bits only
        and     t4, t1, t3                      // t3 = exponent field
        srl     t5, 31, t5
        and     t5, 1, t5                       // t5 = sign bit
        beq     t3, 30f                         // exponent 0: zero or denormal
        cmpult  t3, t1, t3                      // exponent below all-ones?
        beq     t3, 10f                         // exponent all ones: Inf or NaN

        addl    t5, 4, t6                       // normal: code = 4 + sign
        br      zero, special_args
10:
        ldah    t3, 0x10(zero)
        ldl     t7, Temp0(sp)                   // low fraction word
        lda     t3, -1(t3)                      // t3 = 0x000fffff
        and     t4, t3, t3                      // high fraction bits
        bis     t3, t7, t7                      // any fraction bit set?
        stl     t3, Temp0 + HighPart(sp)
        beq     t7, 20f                         // fraction zero: infinity
        srl     t3, 19, t3
        and     t3, 1, t3                       // NaN: code = top fraction bit
        mov     t3, t6
        br      zero, special_args
20:
        addl    t5, 2, t6                       // infinity: code = 2 + sign
        br      zero, special_args
30:
        ldl     a0, Temp0(sp)                   // low fraction word
        ldah    t7, 0x10(zero)
        lda     t7, -1(t7)                      // t7 = 0x000fffff
        and     t4, t7, t4                      // high fraction bits
        bis     t4, a0, a0                      // any fraction bit set?
        stl     t4, Temp0 + HighPart(sp)
        mov     6, t3                           // assume denormal
        cmoveq  a0, 8, t3                       // fraction zero: it is a zero
        addl    t5, t3, t6                      // code = 6 or 8, plus sign
                                                // fall through to special_args

//
// If we get to here we know that at least one of x and y is a NaN, Inf,
// denorm or zero.  We don't necessarily know anything about the other one.
//
special_args:
        //
        // t2 and t6 hold the classification codes of x and y.  Dropping the
        // low bit of each gives a class in 0..4; class(x)*5 + class(y)
        // selects one of the 25 handlers in the Switch table.
        //
        sra     t2, 1, t2               // Classify x
        sra     t6, 1, t6               // Classify y
        s4addl  t2, t2, t2              // class(x)*4 + class(x) = class(x)*5
        addl    t2, t6, t2              // Combine
        cmpule  t2, 24, t12             // Sanity check
        beq     t12, scale_up_denorm_input      // index out of range

        lda     t12, Switch
        s4addl  t2, t12, t12            // address of the 4-byte table entry
        ldl     t12, 0(t12)             // handler address
        jmp     zero, (t12)

//
//      x and y are zero -- return 0
//
ret_zero:
        cpys    f31, f31, f0            // f0 = +0.0 (f31 always reads as zero)
        br      zero, done

//
//      x is a NaN - return x
//
ret_x:  cpys    f16, f16, f0            // f0 = x, bits copied unchanged
        br      zero, done

//
//      y is a NaN, but x isn't - return y
//
ret_y:  cpys    f17, f17, f0            // f0 = y, bits copied unchanged
        br      zero, done

//
//      y is a denorm; if |x| is large enough, just return |x|
//
y_denorm:
        // t0 still holds the exponent field of x from the entry code.
        ldah    t4, 0x1c0(zero)         // if (exp_x >= LARGE)
        cmpult  t0, t4, t4
        beq     t4, return_abs_x        //      goto return_abs_x
        br      zero, scale_up_denorm_input

//
//      x is a denorm; if |y| is large enough, just return |y|
//
x_denorm:
        // v0 still holds the exponent field of y from the entry code.
        ldah    t7, 0x1c0(zero)         // if (exp_y >= LARGE)
        cmpult  v0, t7, t7
        beq     t7, return_abs_y        //      goto return_abs_y
                                        // else fall through to scale_up_denorm_input


//
// Scale x and y up by 2^F_PRECISION and adjust exp_x and exp_y
// accordingly.  With x and y scaled into the normal range, we can
// rejoin the main logic flow for computing hypot(x, y)
//
scale_up_denorm_input:
        //
        // t0/v0 hold the exponent fields of x/y.  An argument with a nonzero
        // exponent field gets 0x40000000 added to that field; one with a
        // zero exponent field is renormalized with the help of the constant
        // Four.  t4 is set to -0x40000000 so that the code after
        // calculate_hypot knows the result has to be scaled back down.
        //
        //
        // if (exp_x is non-zero) put exp_x - scale in x's exponent field
        //
        ldah    t4, -0x4000(zero)       // scale = -0x40000000
        beq     t0, 10f
        stt     f16, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)
        ldah    t2, -0x7ff0(zero)
        ldah    t6, 0x4000(zero)
        lda     t2, -1(t2)              // t2 = 0x800fffff: sign and fraction bits
        addl    t0, t6, t0              // exp_x - scale
        and     t5, t2, t2              // drop the old exponent field
        bis     t2, t0, t0              // insert the new one
        stl     t0, Temp0 + HighPart(sp)
        ldt     f16, Temp0(sp)
        br      zero, 20f
10:     //
        // else `denorm-to-norm'
        //
        ldt     f0, Four
        cpyse   f0, f16, f10            // sign and exponent of 4.0, fraction of x
        subt    f10, f0, f16            // subtract 4.0 again; x comes out normalized
20:
        //
        // if (exp_y is non-zero) put exp_y - scale in y's exponent field
        //
        beq     v0, 30f
        stt     f17, Temp0(sp)
        ldl     t6, Temp0 + HighPart(sp)
        ldah    t2, -0x7ff0(zero)
        ldah    t5, 0x4000(zero)
        lda     t2, -1(t2)              // t2 = 0x800fffff: sign and fraction bits
        addl    v0, t5, v0              // exp_y - scale
        and     t6, t2, t2              // drop the old exponent field
        bis     t2, v0, v0              // insert the new one
        stl     v0, Temp0 + HighPart(sp)
        ldt     f17, Temp0(sp)
        br      zero, 40f
30:
        ldt     f0, Four
        cpyse   f0, f17, f10            // sign and exponent of 4.0, fraction of y
        subt    f10, f0, f17            // subtract 4.0 again; y comes out normalized
40:
calculate_hypot:
//
// Compute z = sqrt(x*x + y*y) directly
//
// Inputs:  f16 = x, f17 = y (already scaled if necessary), t4 = scale.
// Output:  f12 = z; if t4 is zero the result is returned straight away,
//          otherwise control passes to start_unscale with f15 = 0.5.
//
// The square root is computed inline: a polynomial whose three coefficients
// come from a 16-byte entry of __sqrt_t_table, followed by one Newton
// iteration and a final rounding check using chopped multiplies.
//
        mult    f16, f16, f0                    // x^2
        mult    f17, f17, f10                   // y^2
        ldt     f11, One
        lda     t6, __sqrt_t_table              // We compute sqrt(x) inline
        ldah    t2, -0x7fe0(zero)
        lda     t2, -1(t2)                      // t2 = 0x801fffff: sign, low exp bit, fraction
        ldah    v0, 0x3fe0(zero)                // Half bias
        addt    f0, f10, f0                     // x^2 + y^2
        stt     f0, Temp0(sp)                   // To mem and back ...
        ldl     t3, Temp0 + HighPart(sp)        // ... for exp & mantissa bits
        cpyse   f11, f0, f12                    // f12 = fraction of the sum with the exponent of 1.0
        sra     t3, 13, t5                      // low exp + high mantissa bits
        and     t3, t2, t2                      // sign, low exp bit and fraction of the high word
        and     t5, 0xff, t5                    // masked
        addl    t5, t5, t5
        s8addl  t5, zero, t5                    // table index
        mult    f12, f12, f14                   // f12^2 for the quadratic term
        addl    t6, t5, t5                      // address of coefficients
        bis     t2, v0, t0                      // high word with exponent field 0x3fe or 0x3ff
        lds     f10, 4(t5)                      // coefficient multiplied by f12
        xor     t3, t2, t2                      // remaining exponent bits of the sum
        lds     f13, 0(t5)                      // coefficient multiplied by f12^2
        addl    t2, v0, v0                      // add the half bias
        ldt     f15, 8(t5)                      // constant coefficient
        zapnot  v0, 0xf, v0                     // keep 32 bits before the 64-bit shift
        mult    f10, f12, f10                   // evaluate poly
        mult    f14, f13, f13
        stl     t0, Temp0 + HighPart(sp)        // check for overflow below
        sll     v0, 31, v0                      // build the power-of-two factor as a 64-bit pattern
        ldt     f12, Temp0(sp)                  // f12 = the sum with its exponent reduced
        stq     v0, Temp0(sp)
        ldt     f14, Temp0(sp)                  // f14 = power-of-two factor
        addt    f15, f10, f10
        addt    f13, f10, f10                   // f10 = polynomial value
        ldt     f13, Lsb                        // To check for correct rounding
        //
        // Perform a Newton's iteration
        //
        mult    f10, f12, f12
        mult    f12, f10, f10
        mult    f12, f14, f12                   // apply the power-of-two factor
        subt    f11, f10, f10                   // 1.0 - (product above)
        addt    f12, f12, f15
        mult    f12, f10, f10                   // correction term
        mult    f12, f13, f12                   // f12 = step used by the rounding check
        addt    f15, f10, f10                   // f10 = refined estimate
        //
        // Check for correctly rounded results
        //
        ldt     f15, Half                       // not used here; start_unscale multiplies by it
        subt    f10, f12, f14                   // candidate one step below
        addt    f10, f12, f12                   // candidate one step above
        multc   f10, f14, f11                   // chopped products to compare against the sum
        multc   f10, f12, f13
        cmptle  f0, f11, f11
        cmptlt  f13, f0, f0
        fcmoveq f11, f10, f14                   // lower candidate rejected: keep the estimate
        fcmoveq f0, f14, f12                    // upper candidate rejected: take f14
        bne     t4, start_unscale               // scaled inputs: result must be unscaled

        cpys    f12, f12, f0                    // Return result in f0
        br      zero, done

//
//
//
start_unscale:
        //
        // Inputs: f12 = z (scaled result), f15 = 0.5, t4 = scale (nonzero).
        // Computes w = 0.5 * z in f15 and decides whether unscaling would
        // overflow.
        //
        ldah    t0, 0x3fd0(zero)                // MAX_SCALE
        mult    f12, f15, f15                   // w = TWO_POW_M_T * z
        //
        // if ((scale > MAX_SCALE) && (z >= MAX_Z)) then overflow
        //
        cmple   t4, t0, t0
        bne     t0, no_overflow
        ldt     f13, Four                       // MAX_Z
        lda     a0, hypotName
        ldah    v0, 0x800(zero)
        lda     v0, 14(v0)                      // error code 0x08000000 + 14
        cmptle  f13, f12, f13                   // 4.0 <= z ?
        fbeq    f13, no_overflow
//
//      Report overflow (800/14)
//
// NOTE(review): f16/f17 were multiplied by the scale factor in scale_input
// on every path that reaches this point, so ErArg0/ErArg1 receive the scaled
// arguments rather than the caller's originals -- confirm this is intended.
// NOTE(review): the record address is handed over in v0 and a result pointer
// is read back from v0; presumably __dpml_exception has its own calling
// convention -- confirm.
//
        stl     a0, ExRec + ErName(sp)
        stt     f16, ExRec + ErArg0(sp)
        stt     f17, ExRec + ErArg1(sp)
        stl     v0, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)                   // address of the exception record
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)                       // load the result through the returned pointer
        br      zero, done

//
//
//
no_overflow:
        //
        // Inputs: f12 = z (scaled result), f15 = w = 0.5 * z, f1 = unscale
        // factor (only valid when the arguments came through scale_input),
        // t4 = scale, t1 = exponent mask 0x7ff00000.
        //
        // scale >= MIN_SCALE means the arguments were scaled by scale_input
        // and a single multiply in do_unscale finishes the job.  Otherwise
        // scale is the -0x40000000 set by scale_up_denorm_input, and
        // 0x40000000 has to be taken back out of z's exponent field by hand
        // because the result may be denormal.
        //
        ldah    t5, -0x3ff0(zero)       // MIN_SCALE
        cmplt   t4, t5, t4              // if (scale >= MIN_SCALE)
        beq     t4, do_unscale          //        goto do_unscale;

        stt     f12, Temp0(sp)
        ldl     t7, Temp0 + HighPart(sp)
        ldah    a0, 0x4000(zero)
        and     t7, t1, t1              // t1 = exponent field of z
        subl    t1, a0, a0              // a0 = exponent field after unscaling
        xor     t7, t1, t7              // t7 = sign and fraction bits of z
        ble     a0, 10f                 // no exponent left: build a denormal

        //
        // Normal result: just replace the exponent field.
        //
        bis     t7, a0, t7
        stl     t7, Temp0 + HighPart(sp)
        ldt     f0, Temp0(sp)
        br      zero, done

        //
        // Denormal result.  t1 - a0 is 0x40000000; adding one exponent unit
        // gives 0x40100000, the high word of 4.0.  Adding 4.0 to z lines z's
        // significant bits up in the fraction field of the sum, and
        // subtracting 0x40100000 from the high word afterwards leaves those
        // bits under a zero exponent field.
        //
        // This label used to be written `10f:'.  The `f' suffix is only
        // valid on a reference to a local label, so the `ble a0, 10f' above
        // had no definition to bind to in this section.
        //
10:     subl    t1, a0, t1              // t1 = 0x40000000
        stl     zero, Temp0(sp)
        ldah    t6, 0x10(zero)
        addl    t1, t6, t1              // t1 = 0x40100000
        bis     t7, t1, t7
        ldah    v0, -0x10(zero)         // 0xfff00000: sign and exponent only
        and     t7, v0, v0
        stl     v0, Temp0 + HighPart(sp)
        ldt     f10, Temp0(sp)          // f10 = 4.0 with the sign of z
        addt    f10, f12, f10           // align z against 4.0
        stt     f10, Temp0(sp)
        ldl     t3, Temp0 + HighPart(sp)
        subl    t3, t1, t1              // strip the exponent of 4.0 again
        stl     t1, Temp0 + HighPart(sp)
        ldt     f0, Temp0(sp)
        br      zero, done

do_unscale:
        // f1 was built in scale_input; f15 = w was computed in start_unscale.
        mult    f1, f15, f0             // return unscale_factor * w
        br      zero, done

return_abs_y:
        cpys    f31, f17, f0            // f0 = |y|: sign taken from f31 (zero)
        br      zero, done

return_abs_x:
        cpys    f31, f16, f0            // f0 = |x|: sign taken from f31 (zero)
//      br      zero, done              // not needed: falls through to done

//
// Return with result in f0.
//
done:
        lda     sp, FrameLength(sp)     // deallocate stack frame
        ret     zero, (a1)              // return through saved ra in a1

        .end    _hypot

        .rdata
        .align  3                       // presumably 2^3 = 8-byte alignment for the doubles below

//
// Define floating point constants.
//

Lsb:    .quad   0x3cb4000000000000      // lsb factor: 5*2^-54
                                        // (exponent field 0x3cb, fraction 0.25: 1.25 * 2^-52)

Half:   .double 0.5                     // TWO_POW_M_T in start_unscale

One:    .double 1.0                     // exponent donor and the 1.0 of the Newton step

Four:   .double 4.0                     // MAX_Z in the overflow test; also used by
                                        // scale_up_denorm_input to renormalize denormals

//
// Switch table indexed by class(x)*5 + class(y)
//

Switch:
        // Classes: 0 = NaN, 1 = Inf, 2 = normal, 3 = denormal, 4 = zero.
        // Entries are 32-bit code addresses, five per class of x.
        .long   ret_x                   //  0: x NaN,    y NaN
        .long   ret_x                   //  1: x NaN,    y Inf
        .long   ret_x                   //  2: x NaN,    y normal
        .long   ret_x                   //  3: x NaN,    y denorm
        .long   ret_x                   //  4: x NaN,    y zero
        .long   ret_y                   //  5: x Inf,    y NaN
        .long   return_abs_y            //  6: x Inf,    y Inf
        .long   return_abs_x            //  7: x Inf,    y normal
        .long   return_abs_x            //  8: x Inf,    y denorm
        .long   return_abs_x            //  9: x Inf,    y zero
        .long   ret_y                   // 10: x normal, y NaN
        .long   return_abs_y            // 11: x normal, y Inf
        .long   scale_up_denorm_input   // 12: x normal, y normal
        .long   y_denorm                // 13: x normal, y denorm
        .long   return_abs_x            // 14: x normal, y zero
        .long   ret_y                   // 15: x denorm, y NaN
        .long   return_abs_y            // 16: x denorm, y Inf
        .long   x_denorm                // 17: x denorm, y normal
        .long   scale_up_denorm_input   // 18: x denorm, y denorm
        .long   return_abs_x            // 19: x denorm, y zero
        .long   ret_y                   // 20: x zero,   y NaN
        .long   return_abs_y            // 21: x zero,   y Inf
        .long   return_abs_y            // 22: x zero,   y normal
        .long   return_abs_y            // 23: x zero,   y denorm
        .long   ret_zero                // 24: x zero,   y zero

//
// Function name for dpml_exception.
//

hypotName:
       // NUL-terminated name; its address is stored in ErName on overflow.
       .ascii  "_hypot\0"