//      TITLE("Alpha AXP ArcTangent2")
//++
//
// Copyright (c) 1993, 1994  Digital Equipment Corporation
//
// Module Name:
//
//    atan2.s
//
// Abstract:
//
//    This module implements a high-performance Alpha AXP specific routine
//    for IEEE double format arctangent2.
//
// Author:
//
//    Andy Garside
//
// Environment:
//
//    User mode.
//
// Revision History:
//
//    Thomas Van Baak (tvb) 15-Feb-1994
//
//        Adapted for NT.
//
//--

#include "ksalpha.h"

//
// Define DPML exception record for NT.
//

//
// Layout of the record whose address is handed to __dpml_exception.
// This file fills in only ErErr, ErName, ErArg0 and ErArg1; the other
// fields are not written anywhere in the visible code.
// Total size: 6 * 4 + 6 * 8 = 72 bytes.
//

        .struct 0
ErErr:  .space  4                       // error code
ErCxt:  .space  4                       // context
ErPlat: .space  4                       // platform
ErEnv:  .space  4                       // environment
ErRet:  .space  4                       // return value pointer
ErName: .space  4                       // function name (address of atan2Name)
ErType: .space  8                       // flags and fill
ErVal:  .space  8                       // return value
ErArg0: .space  8                       // arg 0 (y is stored here)
ErArg1: .space  8                       // arg 1 (x is stored here)
ErArg2: .space  8                       // arg 2
ErArg3: .space  8                       // arg 3
DpmlExceptionLength:                    // size of the record in bytes

//
// Define stack frame.
//

//
// Offsets are relative to sp after the prologue has lowered it by
// FrameLength. Six 8-byte slots (48) + exception record (72) + pad (8)
// gives 128 bytes.
//

        .struct 0
SaveS0: .space  8                       // saved s0 (reused as the exponent mask)
SaveS1: .space  8                       // saved s1 (reused for the exponent difference)
SaveRa: .space  8                       // saved return address
SaveF2: .space  8                       // saved f2 (reused to hold y)
SaveF3: .space  8                       // saved f3 (reused to hold x)
Temp:   .space  8                       // scratch for moving a double to/from integer registers
ExRec:  .space  DpmlExceptionLength     // exception record
        .space  8                       // for 16-byte stack alignment
FrameLength:                            // total frame size in bytes

//
// Define lower and upper 32-bit parts of 64-bit double.
//

// Byte offsets of the two 32-bit halves of a double held in memory.
// HighPart is the word the routine masks with 0x7ff00000 to get the
// exponent, so it carries the sign, the exponent and the top 20 fraction
// bits. LowPart is not referenced by name in the visible code; the low
// word is read as Temp(sp) directly.
#define LowPart 0x0
#define HighPart 0x4

//
// Define offsets into atan_t_table.
//
// __atan_t_table is defined outside this file. ATAN_INF is the value
// returned (with the sign of y) when y dominates x, and TWICE_ATAN_INF is
// the value used when x is negative and x dominates y -- presumably pi/2
// and pi respectively; confirm against the table definition.
//

#define ATAN_INF        0xf18
#define TWICE_ATAN_INF  0xf28

        SBTTL("ArcTangent2")

//++
//
// double
// atan2 (
//    IN double y
//    IN double x
//    )
//
// Routine Description:
//
//    This function returns the arctangent of y/x, using the signs of both
//    arguments to select the quadrant, so the result lies in [-pi,pi].
//
// Arguments:
//
//    y (f16) - Supplies the numerator (ordinate) value.
//
//    x (f17) - Supplies the denominator (abscissa) value.
//
// Return Value:
//
//    The double arctangent2 result is returned as the function value in f0.
//
//--

        NESTED_ENTRY(atan2, FrameLength, ra)

        lda     sp, -FrameLength(sp)    // allocate stack frame
        stq     s0, SaveS0(sp)          // save the integer registers reused below
        stq     s1, SaveS1(sp)
        stq     ra, SaveRa(sp)          // ra is overwritten by bsr and also used as scratch
        stt     f2, SaveF2(sp)          // f2 and f3 hold y and x for the whole routine
        stt     f3, SaveF3(sp)

        PROLOGUE_END

//
// Register roles used throughout:
//
//    f2 = y, f3 = x
//    s0 = 0x7ff00000, mask for the exponent field in the high 32 bits
//    t0 = exponent field of y, t2 = exponent field of x (left in bit position)
//    t5 = class of y, t6 = class of x (valid only after class_y / class_x)
//    s1 = t0 - t2 (set at calc_atan2)
//
// First sort out whether both arguments have ordinary exponents. If so,
// branch straight to calc_atan2; otherwise classify both arguments and
// dispatch through Switch_table.
//

        cpys    f16, f16, f2            // y
        ldah    s0, 0x7ff0(zero)        // s0 = 0x7ff00000, exponent mask for the high word
        cpys    f17, f17, f3            // x
        stt     f2, Temp(sp)            // spill y so its high word can be read as an integer
        ldl     v0, Temp + HighPart(sp)
        and     v0, s0, v0              // isolate the exponent field of y
        mov     v0, t0                  // t0 = exponent field of y
        xor     t0, s0, t1              // t1 == 0 when the exponent is all ones
        beq     t1, spec_y              // y is infinity or NaN
        beq     t0, spec_y              // y is zero or denormal

        stt     f3, Temp(sp)            // y has an ordinary exponent; examine x the same way
        ldl     t2, Temp + HighPart(sp)
        and     t2, s0, t2              // t2 = exponent field of x
        xor     t2, s0, t1              // t1 == 0 when the exponent is all ones
        beq     t1, class_y             // x is infinity or NaN
        bne     t2, calc_atan2          // both exponents are ordinary: take the fast path
        br      zero, class_y           // x is zero or denormal

//
// Abnormal inputs
//
// Reached when y was rejected before x was examined, so compute the
// exponent field of x (t2) here; later paths rely on it.
//

spec_y: stt     f3, Temp(sp)
        ldl     t2, Temp + HighPart(sp)
        and     t2, s0, t2              // t2 = exponent field of x

//
// Classify y according to type
//
// The class is left in t5:
//
//    0 or 1     - NaN (the value is the top fraction bit)
//    2 + sign   - infinity
//    4 + sign   - exponent neither zero nor all ones
//    6 + sign   - zero exponent with a nonzero fraction (denormal)
//    8 + sign   - zero
//
// where sign is the sign bit of the argument (0 or 1).
//

class_y:
        stt     f2, Temp(sp)
        ldl     t3, Temp + HighPart(sp) // t3 = high word of y
        zapnot  t3, 0xf, t1             // drop the sign extension left by ldl
        and     t3, s0, t4              // t4 = exponent field
        srl     t1, 31, t1
        and     t1, 1, t1               // t1 = sign bit of y
        beq     t4, LL00d0              // zero exponent: zero or denormal
        cmpult  t4, s0, t4              // t4 = 1 unless the exponent is all ones
        beq     t4, LL0098              // all ones: infinity or NaN
        addl    t1, 4, t5               // ordinary exponent: class 4 + sign
        br      zero, class_x
LL0098: ldah    t6, 0x10(zero)          // exponent all ones
        ldl     t4, Temp(sp)            // t4 = low word of y
        lda     t6, -1(t6)              // t6 = 0x000fffff, high-word fraction mask
        and     t3, t6, t6              // t6 = top 20 fraction bits
        stl     t6, Temp + HighPart(sp) // Temp is overwritten at class_x before it is read again
        bis     t6, t4, t4              // t4 = nonzero if any fraction bit is set
        srl     t6, 19, t6              // t6 = top fraction bit
        beq     t4, LL00c8              // fraction is zero: infinity
        and     t6, 1, t6
        mov     t6, t5                  // NaN: class 0 or 1
        br      zero, class_x
LL00c8: addl    t1, 2, t5               // infinity: class 2 + sign
        br      zero, class_x
LL00d0: ldl     t7, Temp(sp)            // zero exponent; t7 = low word of y
        ldah    t4, 0x10(zero)
        lda     t4, -1(t4)              // t4 = 0x000fffff
        and     t3, t4, t3              // t3 = top 20 fraction bits
        bis     t3, t7, t7              // t7 = nonzero if any fraction bit is set
        stl     t3, Temp + HighPart(sp) // Temp is overwritten at class_x before it is read again
        mov     6, t6                   // assume denormal
        cmoveq  t7, 8, t6               // fraction is zero: the value is zero
        addl    t1, t6, t5              // class 6 + sign or 8 + sign

//
// Classify x according to type
//
// Same encoding as for y; the class is left in t6 and the sign bit of x
// in t4.
//

class_x:
        stt     f3, Temp(sp)
        ldl     t3, Temp + HighPart(sp) // t3 = high word of x
        zapnot  t3, 0xf, t4             // drop the sign extension left by ldl
        and     t3, s0, t1              // t1 = exponent field
        srl     t4, 31, t4
        and     t4, 1, t4               // t4 = sign bit of x
        beq     t1, LL0158              // zero exponent: zero or denormal
        cmpult  t1, s0, t1              // t1 = 1 unless the exponent is all ones
        beq     t1, LL0120              // all ones: infinity or NaN
        addl    t4, 4, t6               // ordinary exponent: class 4 + sign
        br      zero, switch
LL0120: ldah    t1, 0x10(zero)          // exponent all ones
        ldl     t7, Temp(sp)            // t7 = low word of x
        lda     t1, -1(t1)              // t1 = 0x000fffff, high-word fraction mask
        and     t3, t1, t1              // t1 = top 20 fraction bits
        bis     t1, t7, t7              // t7 = nonzero if any fraction bit is set
        stl     t1, Temp + HighPart(sp) // every later path stores to Temp before reading it
        beq     t7, LL0150              // fraction is zero: infinity
        srl     t1, 19, t1
        and     t1, 1, t1               // top fraction bit
        mov     t1, t6                  // NaN: class 0 or 1
        br      zero, switch
LL0150: addl    t4, 2, t6               // infinity: class 2 + sign
        br      zero, switch
LL0158: ldl     a0, Temp(sp)            // zero exponent; a0 = low word of x
        ldah    t7, 0x10(zero)
        lda     t7, -1(t7)              // t7 = 0x000fffff
        and     t3, t7, t3              // t3 = top 20 fraction bits
        bis     t3, a0, a0              // a0 = nonzero if any fraction bit is set
        stl     t3, Temp + HighPart(sp) // every later path stores to Temp before reading it
        mov     6, t1                   // assume denormal
        cmoveq  a0, 8, t1               // fraction is zero: the value is zero
        addl    t4, t1, t6              // class 6 + sign or 8 + sign

//
// switch on class(y) and class(x)
//
// Dropping the sign bit from each class gives a kind in 0..4, and the
// table index is 5 * kind(y) + kind(x). Because the classes computed above
// never exceed 9 the index never exceeds 24, so the range check below is
// only a guard.
//

switch: sra     t5, 1, a0               // a0 = kind of y
        sra     t6, 1, t3               // t3 = kind of x
        s4addl  a0, a0, a0              // a0 = 4 * a0 + a0 = 5 * kind(y)
        addl    a0, t3, t3              // t3 = table index
        cmpule  t3, 24, t12
        beq     t12, cpys_y_class       // index out of range

        lda     t12, Switch_table
        s4addl  t3, t12, t12            // address of the 4-byte table entry
        ldl     t12, 0(t12)
        jmp     zero, (t12)             // dispatch to the handler

ret_y:  cpys    f2, f2, f0              // y is NaN: return y unchanged
        br      zero, done

ret_x:  cpys    f3, f3, f0              // x is NaN: return x unchanged
        br      zero, done

//
// Both arguments are infinite. Fill in the exception record and let
// __dpml_exception supply the result. The record address is passed in v0
// and the result is loaded through the pointer that comes back in v0.
// The meaning of the error code is defined by __dpml_exception, which is
// outside this file.
//

infs:
        lda     t1, atan2Name
        stl     t1, ExRec + ErName(sp)
        ldah    t3, 0x800(zero)
        stt     f2, ExRec + ErArg0(sp)
        stt     f3, ExRec + ErArg1(sp)
        lda     t3, 9(t3)               // error code 0x08000009
        stl     t3, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)
        br      zero, done

//
// Both arguments are zero. Same protocol as infs with a different code.
//

zeros:
        lda     t6, atan2Name
        stl     t6, ExRec + ErName(sp)
        ldah    a0, 0x800(zero)
        stt     f2, ExRec + ErArg0(sp)
        stt     f3, ExRec + ErArg1(sp)
        lda     a0, 8(a0)               // error code 0x08000008
        stl     a0, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)
        br      zero, done

//
// y is infinite with finite x, or x is zero with nonzero finite y:
// the result is the ATAN_INF table entry carrying the sign of y.
//

ret_inf:
        ldt     f0, __atan_t_table + ATAN_INF

cpys_y_class:
        blbc    t5, done                // low bit of the class is the sign of y
        cpysn   f0, f0, f0              // y is negative: negate the result
        br      zero, done

//
// x is infinite with finite y, or y is zero with nonzero x: the result is
// zero for positive x and the TWICE_ATAN_INF table entry for negative x,
// in both cases carrying the sign of y.
//

ret_tw_inf:
        blbc    t6, x_pos               // low bit of the class is the sign of x

        ldt     f16, __atan_t_table + TWICE_ATAN_INF
        cpys    f16, f16, f0

        blbc    t5, done
        cpysn   f0, f0, f0              // y is negative: negate the result
        br      zero, done

x_pos:  cpys    f31, f31, f16           // f31 reads as zero, so this is +0.0
        cpys    f16, f16, f0

        blbc    t5, done
        cpysn   f0, f0, f0              // y is negative: return -0.0
        br      zero, done

//
// y is denormal and x has an ordinary exponent.
//

de_o_norm:
        ldah    t4, 0x4350(zero)        // underflow check
        cmpult  t2, t4, t4              // t4 = 1 when the exponent field of x is below 0x435
        bne     t4, scale_up_denorm
        br      zero, underflow

//
// y has an ordinary exponent and x is denormal.
//

n_o_de: ldah    t1, 0x360(zero)         // check for const range
        cmplt   t0, t1, t1              // t1 = 1 when the exponent field of y is below 0x036
        beq     t1, const_range         // y is large enough that the answer is the constant


// Scale x and y up by 2^F_PRECISION and adjust exp_x and exp_y accordingly.
// With x and y scaled into the normal range, we can rejoin the main logic
// flow for computing atan(y/x)
//
// As coded, an argument with a nonzero exponent field has 0x433 added to
// that field. A denormal argument is normalized by overlaying its fraction
// on 2^53 and subtracting 2^53 back off, after which its exponent field is
// read back from the result. ra, v0, a0 and f17 serve as scratch here; x
// was copied from f17 to f3 on entry and ra is reloaded from the frame at
// done.

scale_up_denorm:

        beq     t0, LL02c0              // y is denormal
        stt     f2, Temp(sp)
        ldl     ra, Temp + HighPart(sp) // ra = high word of y
        ldah    v0, 0x4330(zero)
        ldah    t3, -0x7ff0(zero)
        addl    t0, v0, v0              // exponent field of y plus 0x433
        lda     t3, -1(t3)              // low 32 bits of t3 = 0x800fffff (sign and fraction)
        and     ra, t3, t3              // keep the sign and fraction of y
        mov     v0, t0                  // t0 = new exponent field of y
        bis     t3, t0, t3              // insert the new exponent
        stl     t3, Temp + HighPart(sp)
        ldt     f2, Temp(sp)            // f2 = scaled y
        br      zero, LL02e4
LL02c0: ldt     f17, Two53
        cpys    f2, f17, f16            // f16 = 2^53 with the sign of y
        cpyse   f16, f2, f0             // f0 = sign and exponent of f16, fraction of y
        subt    f0, f16, f2             // f2 = y normalized and scaled
        stt     f2, Temp(sp)
        ldl     t4, Temp + HighPart(sp)
        and     t4, s0, t4
        mov     t4, t0                  // t0 = exponent field of the scaled y

//
// NOTE(review): when this point is reached by way of de_o_norm, t2 can be
// as large as 0x434 << 20. Adding 0x433 << 20 then exceeds the 11-bit
// exponent field and carries into bit 31 of the high word that is stored
// back, which is the sign bit of x. Confirm that this is intended.
//

LL02e4: beq     t2, LL0318              // x is denormal
        stt     f3, Temp(sp)
        ldl     a0, Temp + HighPart(sp) // a0 = high word of x
        ldah    v0, -0x7ff0(zero)
        ldah    ra, 0x4330(zero)
        lda     v0, -1(v0)              // low 32 bits of v0 = 0x800fffff (sign and fraction)
        addl    t2, ra, t2              // t2 = exponent field of x plus 0x433
        and     a0, v0, v0              // keep the sign and fraction of x
        bis     v0, t2, v0              // insert the new exponent
        stl     v0, Temp + HighPart(sp)
        ldt     f3, Temp(sp)            // f3 = scaled x
        br      zero, calc_atan2
LL0318: ldt     f17, Two53
        cpys    f3, f17, f0             // f0 = 2^53 with the sign of x
        cpyse   f0, f3, f16             // f16 = sign and exponent of f0, fraction of x
        subt    f16, f0, f3             // f3 = x normalized and scaled
        stt     f3, Temp(sp)
        ldl     t1, Temp + HighPart(sp)
        and     t1, s0, t1
        mov     t1, t2                  // t2 = exponent field of the scaled x

//
//  OK. Calculate atan2.
//
//  s1 = exponent field of y minus exponent field of x, still shifted left
//  by 20. It selects one of three strategies:
//
//     s1 >= 0x036 << 20      result is the ATAN_INF constant (const_range)
//     s1 <= -(0x01c << 20)   y/x is tiny (ident_range)
//     otherwise              atan(y/x), corrected when x is not positive
//

calc_atan2:
        subl    t0, t2, s1
        ldah    t4, 0x360(zero)         // check for const range
        ldah    t5, -0x1c0(zero)        // check for identity range
        cmplt   s1, t4, t4
        cmple   s1, t5, t5
        beq     t4, const_range
        bne     t5, ident_range
        divt    f2, f3, f16             // f16 = y / x, the argument handed to atan
        bsr     ra, atan                // f2 and f3 are read after the call, so atan must preserve them
        cpys    f0, f0, f1              // f1 = atan(y / x)
        cmptlt  f31, f3, f3             // f3 = nonzero when 0 < x (x itself is no longer needed)
        cpys    f1, f1, f0
        fbeq    f3, post_proc           // x is not positive: adjust the result
        br      zero, done

//
// The quotient is small. When x is negative and the exponent difference
// is below -(0x036 << 20), the result is the TWICE_ATAN_INF constant with
// the sign of y; otherwise continue at poss_under. s1 is negative on
// entry, so the unsigned compare orders it the same way a signed one
// would.
//

ident_range:
        ldah    v0, -0x360(zero)        // check for possible underflow
        cmpult  s1, v0, v0
        fbge    f3, poss_under          // x is not negative
        beq     v0, poss_under          // difference is not small enough for the constant

        ldt     f10, __atan_t_table + TWICE_ATAN_INF
        br      zero, fix_sign

poss_under:
        ldah    t1, -0x3fe0(zero)       // check for certain underflow or denorm
        cmpule  s1, t1, t1
        bne     t1, under_or_de

        divt    f2, f3, f1              // the result is the quotient itself
        cmptlt  f31, f3, f3             // f3 = nonzero when 0 < x
        fbeq    f3, post_proc           // x is not positive: adjust the result
        cpys    f1, f1, f0
        br      zero, done

//
// x is not positive: add the TWICE_ATAN_INF constant, given the sign of
// y, to the value in f1.
//

post_proc:
        ldt     f11, __atan_t_table + TWICE_ATAN_INF
        cpys    f2, f11, f12            // constant with the sign of y
        addt    f1, f12, f0
        br      zero, done

//
// The quotient may be denormal or may underflow. Unless underflow is
// certain, raise the exponent field of y by 0x035, divide, then take the
// same amount back out of the exponent of the quotient. If nothing is
// left, report underflow. s0 is overwritten here; it is reloaded from the
// frame at done.
//

under_or_de:
        ldah    t3, -0x4350(zero)       // check for underflow
        cmpult  s1, t3, t3
        bne     t3, underflow

        ldah    t6, 0x350(zero)         // fixup denorm check
        cpys    f2, f2, f13
        stt     f13, Temp(sp)
        ldl     t5, Temp + HighPart(sp)
        addl    t5, t6, t5              // add 0x035 to the exponent field of y
        stl     t5, Temp + HighPart(sp)
        ldt     f14, Temp(sp)           // f14 = scaled y
        divt    f14, f3, f14            // f14 = scaled quotient
        stt     f14, Temp(sp)
        ldl     a2, Temp + HighPart(sp)
        and     a2, s0, s0              // s0 = exponent field of the scaled quotient
        subl    s0, t6, t6              // t6 = exponent field with the scaling removed
        ble     t6, underflow           // no exponent left for a normal result

        stt     f14, Temp(sp)           // Temp already holds f14; stored again as in the original
        ldl     a4, Temp + HighPart(sp)
        ldah    a5, -0x7ff0(zero)
        lda     a5, -1(a5)              // low 32 bits of a5 = 0x800fffff (sign and fraction)
        and     a4, a5, a4              // keep the sign and fraction of the quotient
        bis     a4, t6, t6              // insert the corrected exponent
        stl     t6, Temp + HighPart(sp)
        ldt     f0, Temp(sp)
        br      zero, done

//
// quotient underflows
//
// Same protocol as infs and zeros: fill in the exception record, pass its
// address in v0 and load the result through the pointer returned in v0.
//

underflow:
        lda     t10, atan2Name
        ldah    v0, 0x800(zero)
        stl     t10, ExRec + ErName(sp)
        stt     f2, ExRec + ErArg0(sp)
        lda     v0, 0xa(v0)             // error code 0x0800000a
        stt     f3, ExRec + ErArg1(sp)
        stl     v0, ExRec + ErErr(sp)
        lda     v0, ExRec(sp)
        bsr     ra, __dpml_exception
        ldt     f0, 0(v0)
        br      zero, done

//
// y dominates x: the result is the ATAN_INF constant with the sign of y.
//

const_range:
        ldt     f10, __atan_t_table + ATAN_INF

fix_sign:
        cpys    f2, f10, f0             // give the constant in f10 the sign of y

//
// Restore registers and return with result in f0.
//

done:
        ldq     s0, SaveS0(sp)
        ldq     s1, SaveS1(sp)
        ldq     ra, SaveRa(sp)
        ldt     f2, SaveF2(sp)
        ldt     f3, SaveF3(sp)
        lda     sp, FrameLength(sp)     // deallocate stack frame
        ret     zero, (ra)              // return

        .end    atan2

        .rdata
        .align  3

//
// Define floating point constants.
//

One:    .double 1.0                     // not referenced in the visible code of this file

Two53:  .quad   0x4340000000000000      // 2^53 (9007199254740992)

//
// switch on class of y and x
//
// Indexed by 5 * kind(y) + kind(x), where kind is the class with its sign
// bit dropped: 0 = NaN, 1 = infinity, 2 = ordinary exponent, 3 = denormal,
// 4 = zero. Each group of five entries below is one kind of y, and the
// entries within a group are ordered by kind of x. Entries are 4-byte
// code addresses.
//
Switch_table:
        .long   ret_y                   // y NaN,      x NaN
        .long   ret_y                   // y NaN,      x infinity
        .long   ret_y                   // y NaN,      x ordinary
        .long   ret_y                   // y NaN,      x denormal
        .long   ret_y                   // y NaN,      x zero
        .long   ret_x                   // y infinity, x NaN
        .long   infs                    // y infinity, x infinity
        .long   ret_inf                 // y infinity, x ordinary
        .long   ret_inf                 // y infinity, x denormal
        .long   ret_inf                 // y infinity, x zero
        .long   ret_x                   // y ordinary, x NaN
        .long   ret_tw_inf              // y ordinary, x infinity
        .long   cpys_y_class            // y ordinary, x ordinary (the entry code branches to calc_atan2 before dispatch)
        .long   n_o_de                  // y ordinary, x denormal
        .long   ret_inf                 // y ordinary, x zero
        .long   ret_x                   // y denormal, x NaN
        .long   ret_tw_inf              // y denormal, x infinity
        .long   de_o_norm               // y denormal, x ordinary
        .long   scale_up_denorm         // y denormal, x denormal
        .long   ret_inf                 // y denormal, x zero
        .long   ret_x                   // y zero,     x NaN
        .long   ret_tw_inf              // y zero,     x infinity
        .long   ret_tw_inf              // y zero,     x ordinary
        .long   ret_tw_inf              // y zero,     x denormal
        .long   zeros                   // y zero,     x zero

//
// Function name for dpml_exception.
//
// NUL-terminated; its address is stored in the ErName field of the
// exception record.
//

atan2Name:
       .ascii  "atan2\0"