//      TITLE("Alpha AXP fmod")
//++
//
// Copyright (c) 1993, 1994  Digital Equipment Corporation
//
// Module Name:
//
//    fmod.s
//
// Abstract:
//
//    This module implements a high-performance Alpha AXP specific routine
//    for IEEE double format fmod.
//
// Author:
//
//    (ajg) 1-Nov-1991.
//
// Environment:
//
//    User mode.
//
// Revision History:
//
//    Thomas Van Baak (tvb) 15-Apr-1994
//
//        Adapted for NT.
//
//--

#include "ksalpha.h"

//
// Define DPML exception record for NT.
//

        .struct 0
ErErr:  .space  4                       // error code
ErCxt:  .space  4                       // context
ErPlat: .space  4                       // platform
ErEnv:  .space  4                       // environment
ErRet:  .space  4                       // return value pointer
ErName: .space  4                       // function name
ErType: .space  8                       // flags and fill
ErVal:  .space  8                       // return value
ErArg0: .space  8                       // arg 0
ErArg1: .space  8                       // arg 1
ErArg2: .space  8                       // arg 2
ErArg3: .space  8                       // arg 3
DpmlExceptionLength:

//
// Define stack frame.
//

        .struct 0
SaveRa: .space  8
Temp0:  .space  8                       // to extract exponent from p, etc
Temp1:  .space  8                       // to extract exponent from q, etc
ExRec:  .space  DpmlExceptionLength     // exception record
        .space  0                       // for 16-byte stack alignment
FrameLength:


//
// Define lower and upper 32-bit parts of 64-bit double.
//

#define LowPart 0x0
#define HighPart 0x4

        SBTTL("fmod")

//++
//
// double
// fmod (
//    IN double x
//    IN double y
//    )
//
// Routine Description:
//
//    This function returns the modulus of the two given double arguments.
//    The fmod function is an exactly computable function defined by:
//
//        mod(p, q) = p - trunc(p/q)*q
//
// Arguments:
//
//    x (f16) - Supplies the dividend value.
//    y (f17) - Supplies the divisor value.
//
// Return Value:
//
//    The double modulus result is returned as the function value in f0.
//
//--

        NESTED_ENTRY(fmod, FrameLength, ra)

        lda     sp, -FrameLength(sp)    // allocate stack frame
        stq     ra, SaveRa(sp)

        PROLOGUE_END

        ldah    t1, 0x7ff0(zero)                // Load exponent mask
        stq     zero, Temp0(sp)

        stt     f16, Temp0(sp)
        stt     f17, Temp1(sp)
        ldl     t0, Temp0 + HighPart(sp)        // Get exp field of p
        ldl     v0, Temp1 + HighPart(sp)        // Get exp field of q
        and     t0, t1, t0
        xor     t0, t1, t2
        and     v0, t1, v0
        beq     t2, p_NaN_inf                   // Goto p_NaN_inf if all 1s

        xor     v0, t1, t3
        cpys    f16, f16, f0                    // Copy to f0 for return
        subl    t0, v0, t0
        beq     t3, q_NaN_inf                   // Goto q_NaN_inf if all 1s

        bge     t0, not_quick                   // A quick check to see if |p| < |q|

        br      zero, return                    // If so, just return

not_quick:
        ldah    t2, 0x350(zero)
        ldah    t3, 0x340(zero)
        cmple   v0, t2, t2
        cmplt   t0, t3, t3
        bne     t2, small_q                     // If q is small, goto small_q
                                                // to avoid underflow/denorms

        ldah    t2, 0x1a0(zero)
        beq     t3, general_case                // If exponent difference is not small enough,
                                                // go do the looping general case

        cmplt   t0, t2, t2
        divtc   f16, f17, f0                    // r = p/q
        ldah    t3, -0x400(zero)
        beq     t2, r_is_big                    // Can we do a quick extended mul & sub?

        stt     f17, Temp0(sp)                  // Yes, the ratio fits in an integer
        ldl     t2, Temp0(sp)
        and     t2, t3, t2
        stl     t2, Temp0(sp)
        cvttqc  f0, f1                          // Integerize the ratio
        cvtqt   f1, f0
        ldt     f1, Temp0(sp)
        subt    f17, f1, f10                    // Split into high and low
        mult    f0, f1, f1                      // Multiply by the integerized ratio
        mult    f0, f10, f0
        subt    f16, f1, f1                     // Subtract the pieces, in this order
        subt    f1, f0, f0
        br      zero, return

//
//      We need to convert r to an extended precision integer.
//

r_is_big: 
        stt     f17, Temp0(sp)
        ldt     f10, int_con
        ldl     t3, Temp0(sp)
        ldah    t4, -0x400(zero)
        cpys    f0, f10, f11
        ldt     f10, denorm_con
        and     t3, t4, t3
        stl     t3, Temp0(sp)
        addtc   f0, f11, f1                     // Add big
        subt    f1, f11, f1                     // Subtract big
        cpys    f1, f10, f0
        ldt     f10, Temp0(sp)
        subt    f17, f10, f17                   // yhi = (y + 2^(p-1+k)) - 2^(p-1+k)
        addt    f1, f0, f11
        subt    f11, f0, f0
        mult    f0, f10, f11                    // Extended precision multiply
        subt    f1, f0, f1
        mult    f0, f17, f0
        subt    f16, f11, f11                   // ... and subtract
        mult    f1, f10, f10
        mult    f1, f17, f1
        subt    f11, f0, f0
        subt    f0, f10, f0
        subt    f0, f1, f0                      // and done
        br      zero, return

small_q: ble     v0, quick_small_q              // If exp_q is > 0, use the general case

general_case:
        cmptlt  f16, f31, f11                   // Set flag if p is negative
        mov     zero, t2
        fbeq    f11, small_q_rejoin_general_case
        mov     1, t2
        br      zero, small_q_rejoin_general_case

quick_small_q: 
        cmptlt  f16, f31, f1                    // Capture sign of p
        mov     zero, t2
        ldt     f10, int_con
        cpyse   f10, f17, f0                    // q, normalized
        subt    f0, f10, f17
        stt     f17, Temp0(sp)
        fbeq    f1, 30f
        mov     1, t2                           // p is negative
30:     ldl     t4, Temp0 + HighPart(sp)
        ldah    t3, 0x4320(zero)
        and     t4, t1, t4
        subl    t4, t3, t4
        mov     t4, v0
        fbeq    f17, q_is_zero                  // Can't divide by 0
        cpyse   f10, f16, f11
        bne     t0, small_q_done                // If c == 0, q is denormal too

        subt    f11, f10, f16                   // Ayup, q is denormal, too
        stt     f16, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)        // Check for p == 0.  (in case)
        fbeq    f16, done
        and     t5, t1, t5
        subl    t5, t3, t3
        mov     t3, t0
        cmplt   t0, v0, t5
        beq     t5, small_q_done

        stt     f16, Temp0(sp)
        ldl     v0, Temp0 + HighPart(sp)
        stt     f31, Temp1(sp)
        ldl     t6, Temp1 + HighPart(sp)
        ldah    t3, 0x10(zero)
        ldah    t5, 0x4000(zero)
        lda     t3, -1(t3)
        zapnot  v0, 7, v0
        subl    t5, t0, t0
        and     t6, t3, t3
        ldah    t5, 0x3ff0(zero)
        bis     t3, t0, t3
        bis     v0, t5, v0
        stl     t3, Temp1 + HighPart(sp)
        stl     v0, Temp0 + HighPart(sp)
        ldt     f1, Temp0(sp)
        ldt     f0, Temp1(sp)
        addt    f1, f0, f0
        beq     t2, 40f                         // Need to negate p?
        cpysn   f0, f0, f0
40:                                             // Now rescale p
        stt     f0, Temp0(sp)
        ldl     t5, Temp0 + HighPart(sp)
        subl    t5, t0, t0                      // Reduce c by q's exponent
        stl     t0, Temp0 + HighPart(sp)
        ldt     f16, Temp0(sp)
        cpys    f16, f16, f0
        br      zero, return

small_q_done:
        subl    t0, v0, t0                      // Adjust c by q's exponent,
                                                // and fall into ...

small_q_rejoin_general_case:
        stt     f16, Temp0(sp)                  // Normalize p and q
        stt     f17, Temp1(sp)
        ldl     t3, Temp0 + HighPart(sp)
        ldl     t6, Temp1 + HighPart(sp)
        ldah    t4, 0x10(zero)
        lda     t4, -1(t4)
        and     t3, t4, t3
        ldah    t5, 0x4330(zero)
        and     t6, t4, t6
        addl    t3, t5, t3
        stl     t3, Temp0 + HighPart(sp)
        addl    t6, t5, t6
        stl     t6, Temp1 + HighPart(sp)
        ldt     f16, Temp0(sp)
        ldt     f10, Temp1(sp)
        cmptle  f10, f16, f11                   // If p >= q, then p -= q
        fbeq    f11, 50f
        subt    f16, f10, f16
        fbeq    f16, done                       // If that makes p == 0, goto done
50:     stt     f10, Temp0(sp)                  // Convert q to extended
        ldl     a0, Temp0(sp)
        ldah    t3, -0x400(zero)
        and     a0, t3, t3
        stl     t3, Temp0(sp)
        ldah    a0, 0x340(zero)
        ldt     f1, Temp0(sp)
        subt    f10, f1, f0                     // High and low

//
//      Here's the dreaded loop, bane of good fmod implemetors around the world,
//      and all-too-often omitted by mediocre fmod implemetors.
//

dread_loop:
        subl    t0, a0, t0                      // Reduce c
        blt     t0, end_of_loop                 // End of loop?

        stt     f16, Temp0(sp)                  // Scale p
        ldl     t7, Temp0 + HighPart(sp)
        ldah    t6, 0x340(zero)
        addl    t7, t6, t6
        stl     t6, Temp0 + HighPart(sp)
        ldt     f11, Temp0(sp)
        ldt     f12, int_con
        divtc   f11, f10, f16                   // r = p/q
        addtc   f16, f12, f13                   // Add big
        ldt     f16, denorm_con
        subt    f13, f12, f12                   // Subtract big
        cpys    f12, f16, f14
        addt    f12, f14, f13                   // Split q into hi & lo, too
        subt    f13, f14, f13
        mult    f13, f1, f16                    // Extended multiply
        subt    f12, f13, f12
        mult    f13, f0, f13
        subt    f11, f16, f11                   // And subtract
        mult    f12, f1, f14
        mult    f12, f0, f12
        subt    f11, f13, f11
        subt    f11, f14, f11
        subt    f11, f12, f16
        fbne    f16, dread_loop                 // Continue looping ...
        cpys    f16, f16, f0                    // ... unless p == 0, in which case, ...
        br      zero, return                    // ... return p

//
//      We may need one additional iteration.  Fortunately, this one can use
//      the faster scaling for p.
//

end_of_loop:
        addl    t0, a0, t0                      // Bump c back up
        beq     t0, almost_done                 // And unless it was zero, ...

        stt     f16, Temp0(sp)
        ldl     t6, Temp0 + HighPart(sp)
        addl    t6, t0, t0
        stl     t0, Temp0 + HighPart(sp)
        ldt     f13, Temp0(sp)
        ldt     f11, int_con
        divtc   f13, f10, f14
        ldt     f16, denorm_con
        addtc   f14, f11, f12
        subt    f12, f11, f11
        cpys    f11, f16, f14
        addt    f11, f14, f12
        subt    f12, f14, f12
        mult    f12, f1, f16
        subt    f11, f12, f11
        mult    f12, f0, f12
        subt    f13, f16, f13
        mult    f11, f1, f1
        mult    f11, f0, f0
        subt    f13, f12, f12
        subt    f12, f1, f1
        subt    f1, f0, f16

//
//      Here, manage the final niggling details:
//              (a) Make a final check for p == 0,
//              (b) Compute the exponent field of the result,
//              (c) Check for underflow of the result,
//              (d) And give it the sign of p.
almost_done:
        fbeq    f16, done                       // p == 0?
        stt     f16, Temp0(sp)
        ldl     t6, Temp0 + HighPart(sp)
        ldah    t0, -0x10(zero)
        and     t6, t0, t0                      // Isolate exponent of p
        addl    t0, v0, v0                      // Add in c
        subl    v0, t5, v0                      // Subtract the 'biased denorm'
        ble     v0, gen_underflow_or_denormal   // Branch if exponent <= 0

        stt     f16, Temp0(sp)
        ldl     a0, Temp0 + HighPart(sp)
        and     a0, t4, t4
        bis     t4, v0, v0
        stl     v0, Temp0 + HighPart(sp)
        ldt     f16, Temp0(sp)
        beq     t2, done                        // Was input p >= 0?

        cpysn   f16, f16, f0                    // No; so return -p.
        br      zero, return

//
//      Exceptions
//

gen_underflow_or_denormal: 
        lda     t0, fmodName
        stl     t0, ExRec + ErName(sp)
        ldah    t4, 0x800(zero)
        stt     f16, ExRec + ErArg0(sp)
        stt     f10, ExRec + ErArg1(sp)
        lda     t4, 0x38(t4)            // MOD_REM_UNDERFLOW
        stl     t4, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f16, 0(v0)

//
//      Return p, which is in f16
//

done:   cpys    f16, f16, f0
        br      zero, return


q_is_zero: 
        lda     t3, fmodName
        stl     t3, ExRec + ErName(sp)
        ldah    t5, 0x800(zero)
        stt     f16, ExRec + ErArg0(sp)
        stt     f17, ExRec + ErArg1(sp)
        lda     t5, 0x39(t5)            // MOD_REM_BY_ZERO
        stl     t5, ExRec + ErErr(sp)
        br      zero, call_exception

//
//      If q is a NaN, return q.
//      If q is Inf, return p (p mod Inf = p).
//

q_NaN_inf:
        stt     f17, Temp0(sp)
        ldl     t6, Temp0(sp)
        ldah    t2, 0x10(zero)
        ldl     t4, Temp0 + HighPart(sp)
        lda     t2, -1(t2)
        and     t4, t2, t2
        bis     t2, t6, t2
        and     t4, t1, t3
        cmpult  zero, t2, t2
        cmpeq   t3, t1, t3
        beq     t3, 60f
        and     t3, t2, t3
60:     cpys    f17, f17, f0            // Return q ...
        bne     t3, return              // ... if it's a NaN
        cpys    f16, f16, f0            // Otherwise, return p
        br      zero, return

//
//      If p (or q) is a NaN, return p (or q)
//      Otherwise, report an exception (MOD_REM_INF)
//

p_NaN_inf:
        stt     f16, Temp0(sp)
        ldl     ra, Temp0(sp)
        ldah    v0, 0x10(zero)
        ldl     t5, Temp0 + HighPart(sp)
        lda     v0, -1(v0)
        and     t5, v0, v0
        bis     v0, ra, v0
        and     t5, t1, t7
        cmpult  zero, v0, v0
        cmpeq   t7, t1, t7
        beq     t7, 70f
        and     t7, v0, t7
        beq     t7, 70f

        cpys    f16, f16, f0            // return p
        br      zero, return

70:     stt     f17, Temp0(sp)
        ldl     t4, Temp0 + HighPart(sp)
        and     t4, t1, t6
        cmpeq   t6, t1, t1
        beq     t1, exc_mod_rem_of_inf

        ldl     t2, Temp0(sp)
        ldah    t3, 0x10(zero)
        lda     t3, -1(t3)
        and     t4, t3, t3
        bis     t3, t2, t2
        cmpult  zero, t2, t2
        and     t1, t2, t1
        beq     t1, exc_mod_rem_of_inf

        cpys    f17, f17, f0            // return q
        br      zero, return

exc_mod_rem_of_inf: 
        lda     t0, fmodName
        stl     t0, ExRec + ErName(sp)
        ldah    t7, 0x800(zero)
        stt     f16, ExRec + ErArg0(sp)
        stt     f17, ExRec + ErArg1(sp)
        lda     t7, 0x3a(t7)            // MOD_REM_OF_INF
        stl     t7, ExRec + ErErr(sp)
call_exception:
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)

//
//      And return
//

return: ldq     ra, SaveRa(sp)
        lda     sp, FrameLength(sp)

        ret     zero, (ra)

        .end   fmod

        .align 2
        .data

denorm_con:
       .long 0x0               // Scale factor for normalization
       .long 0x44d00000

int_con:
       .long 0x0               // 2^TT, to integerize a number
       .long 0x43300000

fmodName:
        .ascii  "fmod\0"