// TITLE("Alpha AXP Hypotenuse") //++ // // Copyright (c) 1993, 1994 Digital Equipment Corporation // // Module Name: // // hypot.s // // Abstract: // // This module implements a high-performance Alpha AXP specific routine // for IEEE double format hypotenuse. // // Author: // // Bill Gray (rtl::gray) 30-Jun-1993 // // Environment: // // User mode. // // Revision History: // // Thomas Van Baak (tvb) 15-Feb-1994 // // Adapted for NT. // //-- #include "ksalpha.h" // // Define DPML exception record for NT. // .struct 0 ErErr: .space 4 // error code ErCxt: .space 4 // context ErPlat: .space 4 // platform ErEnv: .space 4 // environment ErRet: .space 4 // return value pointer ErName: .space 4 // function name ErType: .space 8 // flags and fill ErVal: .space 8 // return value ErArg0: .space 8 // arg 0 ErArg1: .space 8 // arg 1 ErArg2: .space 8 // arg 2 ErArg3: .space 8 // arg 3 DpmlExceptionLength: // // Define stack frame. // .struct 0 Temp0: .space 8 // save argument Temp1: .space 8 // save argument ExRec: .space DpmlExceptionLength // exception record .space 8 // for 16-byte stack alignment FrameLength: // // Define lower and upper 32-bit parts of 64-bit double. // #define LowPart 0x0 #define HighPart 0x4 SBTTL("Hypotenuse") //++ // // double // _hypot ( // IN double x // IN double y // ) // // Routine Description: // // This function returns the hypotenuse for the given x, y values: // double hypot(double x, double y) = sqrt(x*x + y*y). // // Arguments: // // x (f16) - Supplies the x argument value. // // y (f17) - Supplies the y argument value. // // Return Value: // // The double result is returned as the function value in f0. // //-- NESTED_ENTRY(_hypot, FrameLength, ra) lda sp, -FrameLength(sp) // allocate stack frame mov ra, a1 // save return address PROLOGUE_END // This implementation first check for special cases: infinities, nans, zeros // and denormalized numbers. Then it scales both number to avoid intermediate // underflow and overflow. Once the scaled result of Hypot(x, y) is // calculated, it checks for possible overflow before scaling up the result. ldah t1, 0x7ff0(zero) stt f17, Temp0(sp) stt f16, Temp1(sp) ldl t0, Temp1 + HighPart(sp) // Get exp of x ldah t2, 0x2000(zero) // bias - 1 ldl v0, Temp0 + HighPart(sp) // Get exp of y ldah t5, 0x3fd0(zero) // exponent mask mov zero, t4 and t0, t1, t0 // mask subl t0, t2, t3 // subtract bias and v0, t1, v0 // mask subl v0, t2, t2 // subtract bias cmpult t3, t5, t3 cmpult t2, t5, t2 beq t3, scale_input bne t2, calculate_hypot // // We get here if simply squaring and adding will cause an intermediate // overflow or underflow. Consequently we need to scale the arguments // before preceeding. In the IEEE case: NaN's, Inf's and denorms come // through here. Split them out at special cases here // scale_input: and t0, t1, t3 ldah t5, 0x10(zero) and v0, t1, t4 subl t3, t5, t3 ldah t2, 0x7fe0(zero) subl t4, t5, t4 cmpult t3, t2, t3 cmpult t4, t2, t2 beq t3, classify // exp_x abnormal? goto classify beq t2, classify // exp_y abnormal? goto classify subl t0, v0, t3 // diff = exp_x - exp_y ldah t5, 0x360(zero) blt t3, 10f // if diff < 0, goto 10 ldah t2, 0x4000(zero) cmple t3, t5, t5 // if (diff > scale) goto return_abs_x subl t0, t2, t0 // precompute exp_x - SCALE_ADJUST beq t5, return_abs_x mov t0, t4 // scale = exp_x - SCALE_ADJUST br zero, 20f 10: ldah t5, -0x360(zero) ldah t2, 0x4000(zero) cmplt t3, t5, t3 // if (diff < -scale) goto return_abs_y subl v0, t2, v0 bne t3, return_abs_y mov v0, t4 // scale = exp_y - SCALE_ADJUST 20: // // Make floats for the scale factor and unscale factor // ldah t0, 0x3ff0(zero) subl t0, t4, t0 stl t0, Temp0 + HighPart(sp) ldah v0, 0x4000(zero) stl zero, Temp0(sp) addl t4, v0, v0 ldt f0, Temp0(sp) stl v0, Temp0 + HighPart(sp) stl zero, Temp0(sp) ldt f1, Temp0(sp) mult f0, f16, f16 // x *= scale_factor mult f0, f17, f17 // y *= scale_factor br zero, calculate_hypot classify: // // Classify x // classify_x: stt f16, Temp0(sp) ldl t5, Temp0 + HighPart(sp) zapnot t5, 0xf, t3 and t5, t1, t4 srl t3, 31, t3 and t3, 1, t3 beq t4, 30f cmpult t4, t1, t4 beq t4, 10f addl t3, 4, t2 br zero, classify_y 10: ldah t6, 0x10(zero) ldl t4, Temp0(sp) lda t6, -1(t6) and t5, t6, t6 stl t6, Temp0 + HighPart(sp) bis t6, t4, t4 srl t6, 19, t6 beq t4, 20f and t6, 1, t6 mov t6, t2 br zero, classify_y 20: addl t3, 2, t2 br zero, classify_y 30: ldl t7, Temp0(sp) ldah t4, 0x10(zero) lda t4, -1(t4) and t5, t4, t4 bis t4, t7, t7 stl t4, Temp0 + HighPart(sp) mov 6, t6 cmoveq t7, 8, t6 addl t3, t6, t2 // // Classify y // classify_y: stt f17, Temp0(sp) ldl t4, Temp0 + HighPart(sp) zapnot t4, 0xf, t5 and t4, t1, t3 srl t5, 31, t5 and t5, 1, t5 beq t3, 30f cmpult t3, t1, t3 beq t3, 10f addl t5, 4, t6 br zero, special_args 10: ldah t3, 0x10(zero) ldl t7, Temp0(sp) lda t3, -1(t3) and t4, t3, t3 bis t3, t7, t7 stl t3, Temp0 + HighPart(sp) beq t7, 20f srl t3, 19, t3 and t3, 1, t3 mov t3, t6 br zero, special_args 20: addl t5, 2, t6 br zero, special_args 30: ldl a0, Temp0(sp) ldah t7, 0x10(zero) lda t7, -1(t7) and t4, t7, t4 bis t4, a0, a0 stl t4, Temp0 + HighPart(sp) mov 6, t3 cmoveq a0, 8, t3 addl t5, t3, t6 // // If we get to here we know that x is a NaN, Inf, denorm or zero. // We don't necessarily know anything about y. // special_args: sra t2, 1, t2 // Classify x sra t6, 1, t6 // Classify y s4addl t2, t2, t2 addl t2, t6, t2 // Combine cmpule t2, 24, t12 // Sanity check beq t12, scale_up_denorm_input lda t12, Switch s4addl t2, t12, t12 ldl t12, 0(t12) jmp zero, (t12) // // x and y are zero -- return 0 // ret_zero: cpys f31, f31, f0 br zero, done // // x is a NaN - return x // ret_x: cpys f16, f16, f0 br zero, done // // y is a NaN, but x isn't - return y // ret_y: cpys f17, f17, f0 br zero, done // // y is a denorm; if |x| is large enough, just return |x| // y_denorm: ldah t4, 0x1c0(zero) // if (exp_x >= LARGE) cmpult t0, t4, t4 beq t4, return_abs_x // goto return_abs_x br zero, scale_up_denorm_input // // x is a denorm; if |y| is large enough, just return |y| // x_denorm: ldah t7, 0x1c0(zero) // if (exp_y >= LARGE) cmpult v0, t7, t7 beq t7, return_abs_y // goto return_abs_y // // Scale x and y up by 2^F_PRECISION and adjust exp_x and exp_y // accordingly. With x and y scaled into the normal range, we can // rejoin the main logic flow for computing hypot(x, y) // scale_up_denorm_input: // // if (exp_x is non-zero) put exp_x - scale in x's exponent field // ldah t4, -0x4000(zero) beq t0, 10f stt f16, Temp0(sp) ldl t5, Temp0 + HighPart(sp) ldah t2, -0x7ff0(zero) ldah t6, 0x4000(zero) lda t2, -1(t2) addl t0, t6, t0 and t5, t2, t2 bis t2, t0, t0 stl t0, Temp0 + HighPart(sp) ldt f16, Temp0(sp) br zero, 20f 10: // // else `denorm-to-norm' // ldt f0, Four cpyse f0, f16, f10 subt f10, f0, f16 20: // // if (exp_y is non-zero) put exp_y - scale in y's exponent field // beq v0, 30f stt f17, Temp0(sp) ldl t6, Temp0 + HighPart(sp) ldah t2, -0x7ff0(zero) ldah t5, 0x4000(zero) lda t2, -1(t2) addl v0, t5, v0 and t6, t2, t2 bis t2, v0, v0 stl v0, Temp0 + HighPart(sp) ldt f17, Temp0(sp) br zero, 40f 30: ldt f0, Four cpyse f0, f17, f10 subt f10, f0, f17 40: calculate_hypot: // // Compute z = sqrt(x*x + y*y) directly // mult f16, f16, f0 // x^2 mult f17, f17, f10 // y^2 ldt f11, One lda t6, __sqrt_t_table // We compute sqrt(x) inline ldah t2, -0x7fe0(zero) lda t2, -1(t2) ldah v0, 0x3fe0(zero) // Half bias addt f0, f10, f0 // x^2 + y^2 stt f0, Temp0(sp) // To mem and back ... ldl t3, Temp0 + HighPart(sp) // ... for exp & mantissa bits cpyse f11, f0, f12 sra t3, 13, t5 // low exp + high mantissa bits and t3, t2, t2 and t5, 0xff, t5 // masked addl t5, t5, t5 s8addl t5, zero, t5 // table index mult f12, f12, f14 addl t6, t5, t5 // address of coefficients bis t2, v0, t0 lds f10, 4(t5) xor t3, t2, t2 lds f13, 0(t5) addl t2, v0, v0 ldt f15, 8(t5) zapnot v0, 0xf, v0 mult f10, f12, f10 // evaluate poly mult f14, f13, f13 stl t0, Temp0 + HighPart(sp) // check for overflow below sll v0, 31, v0 ldt f12, Temp0(sp) stq v0, Temp0(sp) ldt f14, Temp0(sp) addt f15, f10, f10 addt f13, f10, f10 ldt f13, Lsb // To check for correct rounding // // Perform a Newton's iteration // mult f10, f12, f12 mult f12, f10, f10 mult f12, f14, f12 subt f11, f10, f10 addt f12, f12, f15 mult f12, f10, f10 mult f12, f13, f12 addt f15, f10, f10 // // Check for correctly rounded results // ldt f15, Half subt f10, f12, f14 addt f10, f12, f12 multc f10, f14, f11 multc f10, f12, f13 cmptle f0, f11, f11 cmptlt f13, f0, f0 fcmoveq f11, f10, f14 fcmoveq f0, f14, f12 bne t4, start_unscale cpys f12, f12, f0 // Return result in f0 br zero, done // // // start_unscale: ldah t0, 0x3fd0(zero) // exponent mask mult f12, f15, f15 // w = TWO_POW_M_T * z // // if ((scale > MAX_SCALE) && (z >= MAX_Z)) then overflow // cmple t4, t0, t0 bne t0, no_overflow ldt f13, Four lda a0, hypotName ldah v0, 0x800(zero) lda v0, 14(v0) cmptle f13, f12, f13 fbeq f13, no_overflow // // Report overflow (800/14) // stl a0, ExRec + ErName(sp) stt f16, ExRec + ErArg0(sp) stt f17, ExRec + ErArg1(sp) stl v0, ExRec + ErErr(sp) lda v0, ExRec(sp) bsr ra, __dpml_exception ldt f0, 0(v0) br zero, done // // // no_overflow: ldah t5, -0x3ff0(zero) cmplt t4, t5, t4 // if (scale >= MIN_SCALE) beq t4, do_unscale // goto do_unscale; stt f12, Temp0(sp) ldl t7, Temp0 + HighPart(sp) ldah a0, 0x4000(zero) and t7, t1, t1 subl t1, a0, a0 xor t7, t1, t7 ble a0, 10f bis t7, a0, t7 stl t7, Temp0 + HighPart(sp) ldt f0, Temp0(sp) br zero, done 10f: subl t1, a0, t1 stl zero, Temp0(sp) ldah t6, 0x10(zero) addl t1, t6, t1 bis t7, t1, t7 ldah v0, -0x10(zero) and t7, v0, v0 stl v0, Temp0 + HighPart(sp) ldt f10, Temp0(sp) addt f10, f12, f10 stt f10, Temp0(sp) ldl t3, Temp0 + HighPart(sp) subl t3, t1, t1 stl t1, Temp0 + HighPart(sp) ldt f0, Temp0(sp) br zero, done do_unscale: mult f1, f15, f0 // return unscale_factor * w br zero, done return_abs_y: cpys f31, f17, f0 br zero, done return_abs_x: cpys f31, f16, f0 // br zero, done // // Return with result in f0. // done: lda sp, FrameLength(sp) // deallocate stack frame ret zero, (a1) // return through saved ra in a1 .end _hypot .rdata .align 3 // // Define floating point constants. // Lsb: .quad 0x3cb4000000000000 // lsb factor: 5*2^-54 Half: .double 0.5 One: .double 1.0 Four: .double 4.0 // // Switch table indexed by class(x)*5 + class(y) // Switch: .long ret_x .long ret_x .long ret_x .long ret_x .long ret_x .long ret_y .long return_abs_y .long return_abs_x .long return_abs_x .long return_abs_x .long ret_y .long return_abs_y .long scale_up_denorm_input .long y_denorm .long return_abs_x .long ret_y .long return_abs_y .long x_denorm .long scale_up_denorm_input .long return_abs_x .long ret_y .long return_abs_y .long return_abs_y .long return_abs_y .long ret_zero // // Function name for dpml_exception. // hypotName: .ascii "_hypot\0"