// TITLE("Alpha AXP fmod") //++ // // Copyright (c) 1993, 1994 Digital Equipment Corporation // // Module Name: // // fmod.s // // Abstract: // // This module implements a high-performance Alpha AXP specific routine // for IEEE double format fmod. // // Author: // // (ajg) 1-Nov-1991. // // Environment: // // User mode. // // Revision History: // // Thomas Van Baak (tvb) 15-Apr-1994 // // Adapted for NT. // //-- #include "ksalpha.h" // // Define DPML exception record for NT. // .struct 0 ErErr: .space 4 // error code ErCxt: .space 4 // context ErPlat: .space 4 // platform ErEnv: .space 4 // environment ErRet: .space 4 // return value pointer ErName: .space 4 // function name ErType: .space 8 // flags and fill ErVal: .space 8 // return value ErArg0: .space 8 // arg 0 ErArg1: .space 8 // arg 1 ErArg2: .space 8 // arg 2 ErArg3: .space 8 // arg 3 DpmlExceptionLength: // // Define stack frame. // .struct 0 SaveRa: .space 8 Temp0: .space 8 // to extract exponent from p, etc Temp1: .space 8 // to extract exponent from q, etc ExRec: .space DpmlExceptionLength // exception record .space 0 // for 16-byte stack alignment FrameLength: // // Define lower and upper 32-bit parts of 64-bit double. // #define LowPart 0x0 #define HighPart 0x4 SBTTL("fmod") //++ // // double // fmod ( // IN double x // IN double y // ) // // Routine Description: // // This function returns the modulus of the two given double arguments. // The fmod function is an exactly computable function defined by: // // mod(p, q) = p - trunc(p/q)*q // // Arguments: // // x (f16) - Supplies the dividend value. // y (f17) - Supplies the divisor value. // // Return Value: // // The double modulus result is returned as the function value in f0. // //-- NESTED_ENTRY(fmod, FrameLength, ra) lda sp, -FrameLength(sp) // allocate stack frame stq ra, SaveRa(sp) PROLOGUE_END ldah t1, 0x7ff0(zero) // Load exponent mask stq zero, Temp0(sp) stt f16, Temp0(sp) stt f17, Temp1(sp) ldl t0, Temp0 + HighPart(sp) // Get exp field of p ldl v0, Temp1 + HighPart(sp) // Get exp field of q and t0, t1, t0 xor t0, t1, t2 and v0, t1, v0 beq t2, p_NaN_inf // Goto p_NaN_inf if all 1s xor v0, t1, t3 cpys f16, f16, f0 // Copy to f0 for return subl t0, v0, t0 beq t3, q_NaN_inf // Goto q_NaN_inf if all 1s bge t0, not_quick // A quick check to see if |p| < |q| br zero, return // If so, just return not_quick: ldah t2, 0x350(zero) ldah t3, 0x340(zero) cmple v0, t2, t2 cmplt t0, t3, t3 bne t2, small_q // If q is small, goto small_q // to avoid underflow/denorms ldah t2, 0x1a0(zero) beq t3, general_case // If exponent difference is not small enough, // go do the looping general case cmplt t0, t2, t2 divtc f16, f17, f0 // r = p/q ldah t3, -0x400(zero) beq t2, r_is_big // Can we do a quick extended mul & sub? stt f17, Temp0(sp) // Yes, the ratio fits in an integer ldl t2, Temp0(sp) and t2, t3, t2 stl t2, Temp0(sp) cvttqc f0, f1 // Integerize the ratio cvtqt f1, f0 ldt f1, Temp0(sp) subt f17, f1, f10 // Split into high and low mult f0, f1, f1 // Multiply by the integerized ratio mult f0, f10, f0 subt f16, f1, f1 // Subtract the pieces, in this order subt f1, f0, f0 br zero, return // // We need to convert r to an extended precision integer. // r_is_big: stt f17, Temp0(sp) ldt f10, int_con ldl t3, Temp0(sp) ldah t4, -0x400(zero) cpys f0, f10, f11 ldt f10, denorm_con and t3, t4, t3 stl t3, Temp0(sp) addtc f0, f11, f1 // Add big subt f1, f11, f1 // Subtract big cpys f1, f10, f0 ldt f10, Temp0(sp) subt f17, f10, f17 // yhi = (y + 2^(p-1+k)) - 2^(p-1+k) addt f1, f0, f11 subt f11, f0, f0 mult f0, f10, f11 // Extended precision multiply subt f1, f0, f1 mult f0, f17, f0 subt f16, f11, f11 // ... and subtract mult f1, f10, f10 mult f1, f17, f1 subt f11, f0, f0 subt f0, f10, f0 subt f0, f1, f0 // and done br zero, return small_q: ble v0, quick_small_q // If exp_q is > 0, use the general case general_case: cmptlt f16, f31, f11 // Set flag if p is negative mov zero, t2 fbeq f11, small_q_rejoin_general_case mov 1, t2 br zero, small_q_rejoin_general_case quick_small_q: cmptlt f16, f31, f1 // Capture sign of p mov zero, t2 ldt f10, int_con cpyse f10, f17, f0 // q, normalized subt f0, f10, f17 stt f17, Temp0(sp) fbeq f1, 30f mov 1, t2 // p is negative 30: ldl t4, Temp0 + HighPart(sp) ldah t3, 0x4320(zero) and t4, t1, t4 subl t4, t3, t4 mov t4, v0 fbeq f17, q_is_zero // Can't divide by 0 cpyse f10, f16, f11 bne t0, small_q_done // If c == 0, q is denormal too subt f11, f10, f16 // Ayup, q is denormal, too stt f16, Temp0(sp) ldl t5, Temp0 + HighPart(sp) // Check for p == 0. (in case) fbeq f16, done and t5, t1, t5 subl t5, t3, t3 mov t3, t0 cmplt t0, v0, t5 beq t5, small_q_done stt f16, Temp0(sp) ldl v0, Temp0 + HighPart(sp) stt f31, Temp1(sp) ldl t6, Temp1 + HighPart(sp) ldah t3, 0x10(zero) ldah t5, 0x4000(zero) lda t3, -1(t3) zapnot v0, 7, v0 subl t5, t0, t0 and t6, t3, t3 ldah t5, 0x3ff0(zero) bis t3, t0, t3 bis v0, t5, v0 stl t3, Temp1 + HighPart(sp) stl v0, Temp0 + HighPart(sp) ldt f1, Temp0(sp) ldt f0, Temp1(sp) addt f1, f0, f0 beq t2, 40f // Need to negate p? cpysn f0, f0, f0 40: // Now rescale p stt f0, Temp0(sp) ldl t5, Temp0 + HighPart(sp) subl t5, t0, t0 // Reduce c by q's exponent stl t0, Temp0 + HighPart(sp) ldt f16, Temp0(sp) cpys f16, f16, f0 br zero, return small_q_done: subl t0, v0, t0 // Adjust c by q's exponent, // and fall into ... small_q_rejoin_general_case: stt f16, Temp0(sp) // Normalize p and q stt f17, Temp1(sp) ldl t3, Temp0 + HighPart(sp) ldl t6, Temp1 + HighPart(sp) ldah t4, 0x10(zero) lda t4, -1(t4) and t3, t4, t3 ldah t5, 0x4330(zero) and t6, t4, t6 addl t3, t5, t3 stl t3, Temp0 + HighPart(sp) addl t6, t5, t6 stl t6, Temp1 + HighPart(sp) ldt f16, Temp0(sp) ldt f10, Temp1(sp) cmptle f10, f16, f11 // If p >= q, then p -= q fbeq f11, 50f subt f16, f10, f16 fbeq f16, done // If that makes p == 0, goto done 50: stt f10, Temp0(sp) // Convert q to extended ldl a0, Temp0(sp) ldah t3, -0x400(zero) and a0, t3, t3 stl t3, Temp0(sp) ldah a0, 0x340(zero) ldt f1, Temp0(sp) subt f10, f1, f0 // High and low // // Here's the dreaded loop, bane of good fmod implemetors around the world, // and all-too-often omitted by mediocre fmod implemetors. // dread_loop: subl t0, a0, t0 // Reduce c blt t0, end_of_loop // End of loop? stt f16, Temp0(sp) // Scale p ldl t7, Temp0 + HighPart(sp) ldah t6, 0x340(zero) addl t7, t6, t6 stl t6, Temp0 + HighPart(sp) ldt f11, Temp0(sp) ldt f12, int_con divtc f11, f10, f16 // r = p/q addtc f16, f12, f13 // Add big ldt f16, denorm_con subt f13, f12, f12 // Subtract big cpys f12, f16, f14 addt f12, f14, f13 // Split q into hi & lo, too subt f13, f14, f13 mult f13, f1, f16 // Extended multiply subt f12, f13, f12 mult f13, f0, f13 subt f11, f16, f11 // And subtract mult f12, f1, f14 mult f12, f0, f12 subt f11, f13, f11 subt f11, f14, f11 subt f11, f12, f16 fbne f16, dread_loop // Continue looping ... cpys f16, f16, f0 // ... unless p == 0, in which case, ... br zero, return // ... return p // // We may need one additional iteration. Fortunately, this one can use // the faster scaling for p. // end_of_loop: addl t0, a0, t0 // Bump c back up beq t0, almost_done // And unless it was zero, ... stt f16, Temp0(sp) ldl t6, Temp0 + HighPart(sp) addl t6, t0, t0 stl t0, Temp0 + HighPart(sp) ldt f13, Temp0(sp) ldt f11, int_con divtc f13, f10, f14 ldt f16, denorm_con addtc f14, f11, f12 subt f12, f11, f11 cpys f11, f16, f14 addt f11, f14, f12 subt f12, f14, f12 mult f12, f1, f16 subt f11, f12, f11 mult f12, f0, f12 subt f13, f16, f13 mult f11, f1, f1 mult f11, f0, f0 subt f13, f12, f12 subt f12, f1, f1 subt f1, f0, f16 // // Here, manage the final niggling details: // (a) Make a final check for p == 0, // (b) Compute the exponent field of the result, // (c) Check for underflow of the result, // (d) And give it the sign of p. almost_done: fbeq f16, done // p == 0? stt f16, Temp0(sp) ldl t6, Temp0 + HighPart(sp) ldah t0, -0x10(zero) and t6, t0, t0 // Isolate exponent of p addl t0, v0, v0 // Add in c subl v0, t5, v0 // Subtract the 'biased denorm' ble v0, gen_underflow_or_denormal // Branch if exponent <= 0 stt f16, Temp0(sp) ldl a0, Temp0 + HighPart(sp) and a0, t4, t4 bis t4, v0, v0 stl v0, Temp0 + HighPart(sp) ldt f16, Temp0(sp) beq t2, done // Was input p >= 0? cpysn f16, f16, f0 // No; so return -p. br zero, return // // Exceptions // gen_underflow_or_denormal: lda t0, fmodName stl t0, ExRec + ErName(sp) ldah t4, 0x800(zero) stt f16, ExRec + ErArg0(sp) stt f10, ExRec + ErArg1(sp) lda t4, 0x38(t4) // MOD_REM_UNDERFLOW stl t4, ExRec + ErErr(sp) lda v0, ExRec(sp) bsr ra, __dpml_exception ldt f16, 0(v0) // // Return p, which is in f16 // done: cpys f16, f16, f0 br zero, return q_is_zero: lda t3, fmodName stl t3, ExRec + ErName(sp) ldah t5, 0x800(zero) stt f16, ExRec + ErArg0(sp) stt f17, ExRec + ErArg1(sp) lda t5, 0x39(t5) // MOD_REM_BY_ZERO stl t5, ExRec + ErErr(sp) br zero, call_exception // // If q is a NaN, return q. // If q is Inf, return p (p mod Inf = p). // q_NaN_inf: stt f17, Temp0(sp) ldl t6, Temp0(sp) ldah t2, 0x10(zero) ldl t4, Temp0 + HighPart(sp) lda t2, -1(t2) and t4, t2, t2 bis t2, t6, t2 and t4, t1, t3 cmpult zero, t2, t2 cmpeq t3, t1, t3 beq t3, 60f and t3, t2, t3 60: cpys f17, f17, f0 // Return q ... bne t3, return // ... if it's a NaN cpys f16, f16, f0 // Otherwise, return p br zero, return // // If p (or q) is a NaN, return p (or q) // Otherwise, report an exception (MOD_REM_INF) // p_NaN_inf: stt f16, Temp0(sp) ldl ra, Temp0(sp) ldah v0, 0x10(zero) ldl t5, Temp0 + HighPart(sp) lda v0, -1(v0) and t5, v0, v0 bis v0, ra, v0 and t5, t1, t7 cmpult zero, v0, v0 cmpeq t7, t1, t7 beq t7, 70f and t7, v0, t7 beq t7, 70f cpys f16, f16, f0 // return p br zero, return 70: stt f17, Temp0(sp) ldl t4, Temp0 + HighPart(sp) and t4, t1, t6 cmpeq t6, t1, t1 beq t1, exc_mod_rem_of_inf ldl t2, Temp0(sp) ldah t3, 0x10(zero) lda t3, -1(t3) and t4, t3, t3 bis t3, t2, t2 cmpult zero, t2, t2 and t1, t2, t1 beq t1, exc_mod_rem_of_inf cpys f17, f17, f0 // return q br zero, return exc_mod_rem_of_inf: lda t0, fmodName stl t0, ExRec + ErName(sp) ldah t7, 0x800(zero) stt f16, ExRec + ErArg0(sp) stt f17, ExRec + ErArg1(sp) lda t7, 0x3a(t7) // MOD_REM_OF_INF stl t7, ExRec + ErErr(sp) call_exception: lda v0, ExRec(sp) bsr ra, __dpml_exception ldt f0, 0(v0) // // And return // return: ldq ra, SaveRa(sp) lda sp, FrameLength(sp) ret zero, (ra) .end fmod .align 2 .data denorm_con: .long 0x0 // Scale factor for normalization .long 0x44d00000 int_con: .long 0x0 // 2^TT, to integerize a number .long 0x43300000 fmodName: .ascii "fmod\0"