mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1178 lines
42 KiB
1178 lines
42 KiB
|
|
# Copyright 1992, Digital Equipment Corporation
|
|
#
|
|
# This software is furnished under a license and may be used and copied
|
|
# only in accordance with the terms of such license and with the
|
|
# inclusion of the above copyright notice. This software or any other
|
|
# copies thereof may not be provided or otherwise made available to any
|
|
# other person. No title to and ownership of the software is hereby
|
|
# transferred.
|
|
#
|
|
# The information in this software is subject to change without notice
|
|
# and should not be construed as a commitment by Digital Equipment
|
|
# Corporation.
|
|
#
|
|
# Digital assumes no responsibility for the use or reliability of its
|
|
# software on equipment which is not supplied by Digital.
|
|
#
|
|
# 008 17 Jun 1992 KDG/wbn Most of initial tailored version. (See
|
|
# commentary below.)
|
|
#
|
|
# 009 4 Jul 1992 KDG Continue work on initial tailored version,
|
|
# including bugfixes and mod entry points
|
|
#
|
|
# 010 15 Jul 1992 KDG - Final touches for V1 (other than any bugfixes)
|
|
# - .aligns commented out to allow older assembler versions
|
|
#
|
|
# 011 16 Jul 1992 KDG - Bugfix for ots_div_l for -maxint dividend
|
|
# - OSF-only source changes for BL7
|
|
#
|
|
# 012 10 Aug 1992 KDG Fix overflow division entry points
|
|
#
|
|
# 013 23 Sep 1992 KDG Add case-sensitive entry names
|
|
#
|
|
# 014 4 Jan 1993 KDG Tweak for OSF assembler
|
|
#
|
|
# 015 26 Jan 1993 KDG Add underscore prefix, OSF uses CS names
|
|
#
|
|
# 016 5 Apr 1993 WBN Speed up core 64-bit, shrink table entry to 2 QWs
|
|
|
|
#++
|
|
# Entry points defined in this module:
|
|
#
|
|
# -- 32 bit division/remainder support
|
|
# unsigned ots_rem_ui(unsigned dividend, unsigned divisor)
|
|
# unsigned ots_div_ui(unsigned dividend, unsigned divisor)
|
|
# int ots_mod_i(int dividend, int modulus)
|
|
# int ots_rem_i(int dividend, int divisor)
|
|
# int ots_div_i_o(int dividend, int divisor)
|
|
# int ots_div_i(int dividend, int divisor) | "hot spot"
|
|
# {core routine - div32} |
|
|
#
|
|
# -- 64 bit division support
|
|
# {core routine - div64} | (uses div32 for 32b cases)
|
|
# long ots_div_l_o(long dividend, long divisor)
|
|
# long ots_div_l(long dividend, long divisor) | "hot spot"
|
|
# long ots_rem_l(long dividend, long divisor)
|
|
# long ots_mod_l(long dividend, long modulus)
|
|
# unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)
|
|
# unsigned long ots_rem_ul(unsigned long dividend, unsigned long divisor)
|
|
#
|
|
# Special conventions: No stack space, r0-r1, r16-r19 and r26-r28 ONLY.
|
|
# (Warning: The auto-loader potentially takes some regs across
|
|
# the call if this is being used in a shared lib. environment.)
|
|
#
|
|
# NOTE: This set of routines may start using stack space at some
|
|
# future point in time.
|
|
#
|
|
# -- Possible future entry points include:
|
|
# (These all return results in r0/r1)
|
|
# {int quotient, int remainder} ots_div_mod_i(int dividend, int divisor)
|
|
# {int quotient, int remainder} ots_div_mod_i_o(int dividend, int divisor)
|
|
# {int quotient, int remainder} ots_div_rem_i(int dividend, int divisor)
|
|
# {int quotient, int remainder} ots_div_rem_i_o(int dividend, int divisor)
|
|
# {unsigned quotient, unsigned remainder} ots_div_rem_ui(unsigned dividend, unsigned divisor)
|
|
#
|
|
# {long quotient, long remainder} ots_div_mod_i(long dividend, long divisor)
|
|
# {long quotient, long remainder} ots_div_mod_i_o(long dividend, long divisor)
|
|
# {long quotient, long remainder} ots_div_rem_i(long dividend, long divisor)
|
|
# {long quotient, long remainder} ots_div_rem_i_o(long dividend, long divisor)
|
|
# {unsigned long quotient, unsigned long remainder}
|
|
# ots_div_rem_ui(unsigned long dividend, unsigned long divisor)
|
|
#
|
|
#
|
|
# General commentary:
|
|
#
|
|
# This is an attempt at a fairly high performance version using relatively
|
|
# straightforward algorithms. Note that the code is intended to be scheduled
|
|
# well for EV4, but still reasonably for LCA/EV5.
|
|
#
|
|
# Also, note that there was only so much time available for this, so it
|
|
# is far from "perfect". "Better is the enemy of done"...
|
|
#
|
|
# Possible future areas of improvement (and unfinished business):
|
|
#
|
|
# - Another possible way of doing things for the "slow" (divnn cases)
|
|
# is to use an approximate inverse and convergence. Given the speed
|
|
# of the multiplier on EV4, and given "time to market", this wasn't
|
|
# done for V1.) I have some mail with the algorithm from Bob Gries
|
|
# (through Scott Robinson).
|
|
#
|
|
# - When the divisor is too large for the table, but has n low-order zero
|
|
# bits, see if divisor/2^n fits in the table, and use that entry with
|
|
# dividend/2^n
|
|
#
|
|
# - Use UMULH for the 'mod' routines.
|
|
#
|
|
# This version can do a table lookup division (divisors with <=tablesize)
|
|
# in roughly 32 cycles on an EV4 (with cache hits for all loads), of which
|
|
# 21 are for the umulh. There is a strong bias toward the table-lookup case.
|
|
# Note that for many cases, the umulh is the last thing before the return,
|
|
# so the multiply can occur in parallel with the procedure return.
|
|
# (It is interesting that the R3000 hardware divide instruction takes 33
|
|
# cycles and the R4000 takes 76(!) ...) Small powers of 2 are retired in
|
|
# roughly 20 cycles. Larger divisors take considerably longer at this point.
|
|
#
|
|
|
|
#include "ots_defs.hs"
|
|
|
|
#ifdef OSF
|
|
# to get the PAL_gentrap literal
|
|
#include <alpha/pal.h>
|
|
#endif
|
|
|
|
# Data area description
|
|
#
|
|
# The data area "ots_div_data" is an array of structures, indexed
|
|
# by the divisor value, with each array entry being 16 bytes in size
|
|
# formatted as follows:
|
|
#
|
|
# 6
|
|
# 3 6 0
|
|
# +-------+-------+-------+-------+-------+-------+-------+-------+
|
|
# | 32 bit reciprocal (58 bits) |shift|
|
|
# +-------+-------+-------+-------+-------+-------+-------+-------+
|
|
# | 64 bit reciprocal |
|
|
# +---------------------------------------------------------------+
|
|
#
|
|
# The 64-bit reciprocal has the leading '1' bit omitted, so it provides
|
|
# 65 bits of precision -- enough to handle unsigned 64-bit values.
|
|
#
|
|
# The first longword contains the 6-bit shift amount needed to handle
|
|
# 64-bit cases and powers of two.
|
|
#
|
|
# The 32-bit reciprocal has the shift count built in, so a UMULH gives
|
|
# the correct quotient without shifting. The reciprocal needs 33 bits
|
|
# of precision. The 6-bit shift amount is noise in the reciprocal that
|
|
# can be ignored.
|
|
#
|
|
# (Oh, you want proof?) For divisors up to 2^k, we store k-1 leading
|
|
# zero bits, 33 bits of fraction, (25-k) more bits of fraction, and
|
|
# 6 bits of noise. The standard method would round at the 33rd fraction
|
|
# bit. We need to ensure that the value actually stored is geq the
|
|
# infinite reciprocal, but leq the standard value. For divisors up to
|
|
# 2^k, there will be a zero bit somewhere in the k bits below the 33rd,
|
|
# so as long as we round below the (33+k)th bit, the rounded value
|
|
# plus any noise is still less than the standard value. This requires
|
|
# k < 12.
|
|
#
|
|
# The actual data is declared in ots_div_data_alpha.
|
|
#
|
|
# Offsets to the various fields in the data structure
|
|
#
|
|
#define shift_o 0
|
|
#define recip64_o 8
|
|
#define recip32_o 0
|
|
#
|
|
# Note that the shift/add ops used to compute the table entries
|
|
# "know" that the table size is 16. (i.e. addq -> s8addq -> ldq)
|
|
# By changing the first instruction, it's fairly easy to change the
|
|
# table entry size to 24, 32, or 40 bytes (using s4add/sub), or
|
|
# 56/64/72 bytes using s8add/sub, should that be desirable.
|
|
|
|
# Maximum divisor present in the table
|
|
#
|
|
#define table_max 512
|
|
|
|
# Division by zero gentrap code
|
|
#
|
|
#define GEN_INTDIV -2
|
|
|
|
# Address of division data area (shared by all entry points)
|
|
#
|
|
#ifdef VMS
|
|
.psect ots_link
|
|
ots_div_addr:
|
|
.address ots_div_data
|
|
.psect ots_code
|
|
#endif
|
|
|
|
# Dummy entry point for the module
|
|
#
|
|
.globl _OtsDivide
|
|
.ent _OtsDivide
|
|
_OtsDivide:
|
|
.set noat
|
|
.set noreorder
|
|
#ifdef OSF
|
|
.frame sp, 0, r26
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
#endif
|
|
|
|
|
|
# unsigned ots_rem_ui(unsigned dividend, unsigned divisor)
|
|
# unsigned 32 bit remainder support
|
|
#
|
|
#.align 4
|
|
.globl _OtsRemainder32Unsigned
|
|
.aent _OtsRemainder32Unsigned
|
|
_OtsRemainder32Unsigned:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_rem_ui>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
subl r17, 1, r1 # first part of power-of-2 check
|
|
blt r17, rui_big # big divisors can (must) be handled by a simple comparison
|
|
and r17, r1, r18 # second part of power-of-2 check
|
|
bgt r28, rui_lrgdiv # branch if large divisor
|
|
addq r17, r17, r0 # compute divisor*2 for table lookup
|
|
beq r18, rui_pwr2 # if zero, divisor is a power of 2
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
ldq r1, recip32_o(r27) # load approximate reciprocal
|
|
cmpult r16, r17, r18 # is the dividend < divisor?
|
|
zap r16, 0xF0, r0 # kill the propagated sign bit
|
|
bne r18, rui_lss # if dividend < divisor, fast exit
|
|
umulh r0, r1, r0 # multiplication for division step
|
|
mull r0, r17, r0 # multiply back to get value to subtract
|
|
subl r16, r0, r0
|
|
ret r31, (r26) # and return
|
|
|
|
rui_pwr2:
|
|
beq r17, divzer # check for 0
|
|
and r16, r1, r0 # use x-1 to mask
|
|
ret r31, (r26)
|
|
|
|
rui_lss:
|
|
mov r16, r0
|
|
ret r31, (r26)
|
|
|
|
rui_lrgdiv:
|
|
zap r16, 0xf0, r16 # zero-extend the dividend
|
|
bsr r28, div32 # use the core routine getting the remainder in r1
|
|
sextl r1, r0
|
|
ret r31, (r26)
|
|
|
|
# divisors with the sign bit set. two possible results,
|
|
# dividend if dividend < divisor, or dividend-divisor otherwise
|
|
rui_big:
|
|
cmpult r16, r17, r1
|
|
subl r16, r17, r0
|
|
cmovne r1, r16, r0
|
|
ret r31, (r26)
|
|
|
|
|
|
# unsigned ots_div_ui(unsigned dividend, unsigned divisor)
|
|
# unsigned 32 bit division support
|
|
#
|
|
|
|
#.align 4
|
|
.globl _OtsDivide32Unsigned
|
|
.aent _OtsDivide32Unsigned
|
|
_OtsDivide32Unsigned:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_ui>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
blt r17, dui_big # big divisors can (must) be handled by a simple comparison
|
|
addq r17, r17, r18 # compute divisor*2
|
|
cmpule r17, r16, r0 # is the dividend < divisor?
|
|
beq r17, divzer # check for 0
|
|
s8addq r18, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
beq r0, dui_end # fast out for divisor > dividend
|
|
bgt r28, dui_lrgdiv # branch if large divisor
|
|
ldq r1, recip32_o(r27) # load approximate reciprocal
|
|
zap r16, 0xF0, r16 # kill the propagated sign bit
|
|
blt r1, dui_smpwr2 # go handle powers of 2 specially
|
|
umulh r16, r1, r0 # start multiplication for division step
|
|
dui_end:ret r31, (r26) # and return
|
|
nop
|
|
dui_smpwr2:
|
|
srl r16, r1, r0 # shift the result into place
|
|
sextl r0, r0 # reinsert sign if dividing by 1
|
|
ret r31, (r26) #
|
|
dui_lrgdiv:
|
|
zap r16, 0xf0, r16 # zero-extend the dividend
|
|
bsr r28, div32 # use the core routine getting the remainder in r1
|
|
sextl r0, r0 # make sure the result is in normal form for uint32
|
|
ret r31, (r26)
|
|
|
|
# divisor with the sign bit set. two possible results,
|
|
# 1 if divisor <= dividend, or 0 otherwise
|
|
dui_big:
|
|
cmpule r17, r16, r0
|
|
ret r31, (r26)
|
|
|
|
|
|
# int ots_mod_i(int dividend, int modulus)
|
|
# signed 32 bit modulus support
|
|
#
|
|
# This entry could be MUCH more optimized. It doesn't even try to use
|
|
# UMULH division currently... (A casualty of time-to-market.)
|
|
# Note that mod is only used by Ada and PL/I.
|
|
#
|
|
#.align 4
|
|
.globl _OtsModulus32
|
|
.aent _OtsModulus32
|
|
_OtsModulus32:
|
|
negq r17, r18 # first part of abs(divisor)
|
|
cmovge r17, r17, r18 # second part of abs(divisor)
|
|
subq r18, 1, r1 # start checking for power of 2
|
|
beq r17, divzer # check for 0
|
|
and r18, r1, r0 # second part of power-of-2 check
|
|
beq r0, mi_p2 # for powers of two, simply do a mask
|
|
# (note that the power-of-2 case MUST be used to handle
|
|
# the -maxint case due to the way the fix-up info is
|
|
# saved across the core routine call)
|
|
xor r16, r17, r28 # get xor of signs
|
|
clr r19 # don't need a bias if dividend and divisor have same sign
|
|
cmovlt r28, r17, r19 # bias is original divisor for different sign case
|
|
and r16, r17, r27 # if both dividend & divisor were neg. need to negate result
|
|
mov r18, r17 # move abs(divisor) into r17
|
|
negq r16, r18 # first part of abs(dividend)
|
|
cmovlt r16, r18, r16 # second part of abs(dividend)
|
|
cmplt r27, r31, r0 # get 1 if both operands were <0
|
|
sll r0, 63, r0 # get bit as the high bit
|
|
bis r0, r19, r19 # and MERGE with bias (0 -> no fixup, -maxint -> negate result,
|
|
# divisor > 0 - subtract remainder if non-zero, divisor < 0 -
|
|
# add remainder if non-zero)
|
|
bsr r28, div32 # use the core routine getting the remainder in r1
|
|
cmoveq r1, r31, r19 # don't do any fix-up if the remainder was zero
|
|
addq r19, r19, r18 # check to see if this is the negative/negative case, which just gets a negated remainder
|
|
subq r19, 1, r28 # wrap -maxint to positive
|
|
negl r1, r0 # move negated value, may abort later
|
|
cmovlt r28, r1, r0 # if both positive, or negative divisor, keep positive remainder
|
|
cmoveq r18, r31, r19 # now that negation is done, treat -maxint case as 0
|
|
addl r19, r0, r0 # add any bias (original divisor or 0)
|
|
ret r31, (r26) # and return
|
|
|
|
mi_p2: cmovge r17, r31, r17 # no bias if divisor was >= 0
|
|
and r16, r1, r1 # use the divisor-1 mask that's already in r1
|
|
cmoveq r1, r31, r17 # use zero if result was zero
|
|
addl r17, r1, r0 # do any biasing, and ensure the result is sign ext
|
|
ret r31, (r26) # and return
|
|
|
|
# int ots_rem_i(int dividend, int divisor)
|
|
# signed 32 bit remainder support
|
|
#
|
|
#.align 4
|
|
.globl _OtsRemainder32
|
|
.aent _OtsRemainder32
|
|
_OtsRemainder32:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_rem_i>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
negq r17, r18 # first part of abs(divisor)
|
|
cmovlt r17, r18, r17 # second part of abs(divisor)
|
|
subq r17, 1, r1 # start checking for power of 2
|
|
and r17, r1, r0 # finish check for power of 2
|
|
sra r16, 63, r19 # get -1/0 if dividend was negative
|
|
negq r16, r18 # first part of abs(dividend)
|
|
cmovlt r16, r18, r16 # second part of abs(dividend)
|
|
beq r0, ri_pwr2 # for powers of two, simply do a mask (not power of 2 include 0 and 80000000)
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
bgt r28, ri_lrgdiv # branch if large divisor
|
|
addq r17, r17, r0 # compute divisor*2 for table lookup
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
ldq r1, recip32_o(r27) # load approximate reciprocal
|
|
umulh r16, r1, r0 # multiplication for division step
|
|
mull r0, r17, r0 # multiply back to get value to subtract
|
|
subl r16, r0, r0 # get abs of final result
|
|
xor r0, r19, r0 # start compliment if original dividend was <0
|
|
subl r0, r19, r0 # finish compliement
|
|
ret r31, (r26) # and return
|
|
|
|
# Handle powers of 2, including 0 and 80000000
|
|
ri_pwr2:
|
|
and r16, r1, r0 # use the divisor-1 mask in r1
|
|
beq r17, divzer # division by zero
|
|
xor r0, r19, r0 # start compliment if original dividend was <0
|
|
subl r0, r19, r0 # finish compliement
|
|
ret r31, (r26)
|
|
|
|
nop
|
|
ri_lrgdiv:
|
|
bsr r28, div32 # use the core routine getting the remainder in r1
|
|
xor r1, r19, r0 # start compliment if original dividend was <0
|
|
subl r0, r19, r0 # finish compliement
|
|
ret r31, (r26)
|
|
|
|
|
|
# int ots_div_i_o(int dividend, int divisor)
|
|
# signed 32 bit division support, overflow detection
|
|
#
|
|
#.align 4
|
|
.globl _OtsDivide32Overflow
|
|
.aent _OtsDivide32Overflow
|
|
_OtsDivide32Overflow:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_i_o>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
not r17, r1 # is the divisor -1?
|
|
bne r1, di_skip # continue if not
|
|
neglv r16, r0 # quotient = -dividend, overflow on ^x800000000
|
|
ret r31, (r26)
|
|
|
|
# int ots_div_i(int dividend, int divisor)
|
|
# signed 32 bit division support, no overflow detection
|
|
#
|
|
nop #.align 4
|
|
.globl _OtsDivide32
|
|
.aent _OtsDivide32
|
|
_OtsDivide32:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_i>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
di_skip:
|
|
di_retry:
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
ble r17, di_notpos # not a positive divisor case
|
|
addq r17, r17, r0 # compute divisor*2
|
|
negq r16, r18 # part 1 of abs(dividend) -> r18. (Note 0xffffffff 80000000 => 0x00000000 80000000)
|
|
bgt r28, di_lrgdiv # branch if large divisor
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
cmpule r17, r18, r0 # divisor <= dividend?
|
|
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
|
|
beq r0, di_end # if not, result is zero
|
|
ldq r1, recip32_o(r27) # load approximate reciprocal
|
|
blt r1, di_smpwr2 # go handle powers of 2 specially
|
|
umulh r18, r1, r0 # start multiplication
|
|
blt r16, di_negres # negate result? (done as a branch to allow umulh to "hang out" over end for common case)
|
|
di_end: ret r31, (r26) # return for same-sign (common) case
|
|
di_negres:
|
|
negl r0, r0 # different signs - compliment result
|
|
ret r31, (r26) # return for different-sign (uncommon) case
|
|
di_smpwr2:
|
|
srl r18, r1, r18 # shift the result into place
|
|
sra r16, 63, r16 # get 0/-1 based on sign of dividend
|
|
xor r18, r16, r18 # conditionally compliment
|
|
subl r18, r16, r0 # and increment for the final value
|
|
ret r31, (r26) # (note subl is required for sign ext for %x80000000/1 case)
|
|
|
|
# Zero or negative divisor case. If just a negative divisor,
|
|
# compliment both dividend and divisor and do things again.
|
|
di_notpos:
|
|
beq r17, divzer # division by zero
|
|
negl r17, r17 # |divisor|, note that 0x80000000 still appears negative
|
|
negq r16, r16 # compliment dividend (negq so that 0xffffffff 80000000 => 0x00000000 80000000
|
|
bgt r17, di_retry # dispatch back for normal case (not 0x80000000 or 0)
|
|
sextl r16, r16 #
|
|
cmpeq r16, r17, r0 # -maxint/-maxint = 1, all others 0
|
|
ret r31, (r26) # done
|
|
|
|
# Large divisor for signed 32/32 case
|
|
#
|
|
nop #.align 3
|
|
di_lrgdiv:
|
|
sra r16, 63, r19 # get 0/-1 based on sign of dividend
|
|
cmovlt r16, r18, r16 #
|
|
bsr r28, div32 # go use core routine
|
|
xor r0, r19, r0 # conditionally compliment
|
|
subl r0, r19, r0 # and increment for the final value (subl ensures normalized result)
|
|
ret r31, (r26) # done
|
|
|
|
|
|
# Large divisor case core routine for 32b
|
|
# (wbn)
|
|
#
|
|
# r0 - quotient (output)
|
|
# r1 - remainder (output)
|
|
# r16 - dividend (range 0..2^32-1, zero extended)
|
|
# r17 - divisor (range 1..2^31-1 - overwritten)
|
|
# r18 - scratch
|
|
# r19 - not used (one temp for 'caller')
|
|
# r26 - not used (expected to contain main return address)
|
|
# [r27 - scratch] (not currently written)
|
|
# r28 - this "subroutine" return address
|
|
#
|
|
# Some tightened up bit-at-a-time code for dividing 32-bit integers.
|
|
# It uses two tricks: keep the running remainder and the quotient in
|
|
# the same 64-bit register (MQ?), and add 1 while subtracting the divisor,
|
|
# so that a single CMOV sets both the new remainder and the new quotient.
|
|
# I start off by trying to skip 8 bits at a time; should this skip a
|
|
# smaller amount, so the main loop iterates less often? If the divisor
|
|
# is already known to be large enough, the last case in this test is never
|
|
# used...
|
|
#
|
|
# This code expects as input two integers in the range 0 <= x < 2^31
|
|
# (that is, it doesn't work for general unsigned longwords, and doesn't
|
|
# include sign manipulation.)
|
|
#
|
|
# The code here takes about 34n+11 cycles for a quotient occupying n bytes.
|
|
#
|
|
# Inputs: dividend in r16, divisor in r17
|
|
# Outputs: quotient in r0, remainder in r1
|
|
# Destroys: [r16,]r17,r18,[r27]
|
|
#
|
|
# How many quotient bytes will there be: 0, 1, 2, 3, 4?
|
|
#
|
|
#.align 4
|
|
div32: cmpule r17, r16, r0 # Divisor leq dividend?
|
|
sll r17, 32, r18 # Position divisor for loop
|
|
sll r17, 8, r1 # Prepare for next compare
|
|
beq r0, d32end # Dividend less, quotient is zero.
|
|
ediv32: mov 8-3, r17 # Hope to skip 3 bytes of loop
|
|
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
|
|
sll r1, 8, r1 # Prepare for next compare
|
|
beq r0, d32ent # Go loop over just one byte
|
|
mov 8-2, r17 # Hope to skip 2 bytes of loop
|
|
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
|
|
sll r1, 8, r1 # Prepare for next compare
|
|
beq r0, d32ent # Go loop over just two bytes
|
|
mov 8-1, r17 # Hope to skip 1 byte of loop
|
|
cmpule r1, r16, r0 # Shifted divisor still leq dividend?
|
|
nop # stall - align d32ent and d32loop
|
|
cmovne r0, 8, r17 # If we can't skip any bytes
|
|
|
|
# start loop generating quotient bits. NOTE: The loop setup requires
|
|
# an even number of iterations.
|
|
#
|
|
d32ent: extqh r16, r17, r0 # Shift dividend left for skipped bytes
|
|
subq r18, 1, r1 # Divisor in high LW - 1 in low LW
|
|
s8subq r17, 34, r17 # Convert bytes to bits and adjust
|
|
|
|
addq r0, r0, r0 # Shift left to start first iteration
|
|
d32loop:subq r0, r1, r18 # Can we subtract divisor from it?
|
|
cmovge r18, r18, r0 # If so, set new remainder & quotient
|
|
# stall
|
|
addq r0, r0, r0 # Shift remainder and quotient left
|
|
subq r0, r1, r18 # Can we subtract divisor from it?
|
|
cmovge r18, r18, r0 # If so, set new remainder & quotient
|
|
subq r17, 2, r17 # Loop counter
|
|
addq r0, r0, r0 # Shift remainder and quotient left
|
|
bgt r17, d32loop # Repeat
|
|
subq r0, r1, r18 # Can we subtract divisor from it?
|
|
cmovge r18, r18, r0 # If so, set new remainder & quotient
|
|
# stall
|
|
addq r0, r0, r0 # Shift remainder and quotient left
|
|
subq r0, r1, r18 # Finish last iteration
|
|
cmovge r18, r18, r0
|
|
# stall
|
|
srl r0, 32, r1 # Get remainder in r1
|
|
zap r0, 0xf0, r0 # Keep only quotient in r0
|
|
nop # for alignment
|
|
d32end: cmoveq r0, r16, r1 # Move remainder to r1 for quotient=0 case
|
|
ret r31, (r28) # Not a real software procedure return
|
|
|
|
|
|
# Large divisor case core routine for 64b
|
|
#
|
|
# r0 - quotient (output)
|
|
# r1 - remainder (output)
|
|
# r16 - dividend (range 0..2^64-1 - overwritten)
|
|
# r17 - divisor (range 1..2^63-1 - overwritten)
|
|
# r18 - scratch
|
|
# r19 - not used (one temp for 'caller')
|
|
# r26 - not used (expected to contain main return address)
|
|
# r27 - points to table of inverses (overwritten)
|
|
# r28 - this "subroutine" return address
|
|
#
|
|
# Inputs: dividend in r16, divisor in r17
|
|
# Outputs: quotient in r0, remainder in r1
|
|
# Destroys: r16,r17,r18,r27
|
|
#
|
|
# Note- this routine could save a few cycles if we could use
|
|
# another scratch register -- perhaps by pushing one on the stack?
|
|
#
|
|
#.align 4
|
|
div64: sll r17, 32, r18 # Position for ediv32
|
|
cmpule r17, r16, r0 # Is divisor leq dividend?
|
|
srl r17, 31, r1 # Is divisor geq 2^31?
|
|
beq r0, d64end # If divisor > dividend, quotient=0
|
|
cmpule r18, r16, r0 # Is divisor*2^32 leq dividend?
|
|
sll r17, 8, r1 # Position for ediv32 checking
|
|
or r1, r0, r0 # 0 if divisor & quotient fit in 32 bits
|
|
beq r0, ediv32 # Use 32-bit routine if OK
|
|
|
|
# Full 64-bit divide needed. Use the table of shift amounts to compute
|
|
# the number of leading zero bits in the divisor. Find the leftmost
|
|
# nonzero byte, then the leftmost nonzero bit in that byte. Table entry
|
|
# #n+1 contains the number of bits needed to hold n (1..8). We know the
|
|
# divisor is nonzero here.
|
|
#
|
|
cmpbge r31, r17, r0 # Get a zero bit for each nonzero byte
|
|
#stall
|
|
sll r0, 4, r0 # *16 bytes per table entry
|
|
#stall
|
|
subq r27, r0, r0 # table base plus complement...
|
|
#stall
|
|
ldq r1, 256*16(r0) # get position of first nonzero
|
|
#2 stalls
|
|
subq r1, 1, r1 # byte number of first nonzero
|
|
extbl r17, r1, r0 # get first nonzero byte
|
|
#stall
|
|
addq r0, r0, r0 # *2
|
|
s8addq r0, r27, r0 # *16 bytes per table entry
|
|
#stall
|
|
ldq r0, 16(r0) # bit number of first nonzero
|
|
negq r1, r1 # 1 + #leading zero bytes (mod 8)
|
|
#stall
|
|
s8subq r1, r0, r0 # number of leading zero bits
|
|
and r0, 0x3F, r0 # discard other junk
|
|
|
|
# The following code does a similar normalize calculation without the table.
|
|
#===
|
|
# extll r17, #4, r18 ; Normalize the divisor and
|
|
# mov #63, r0 ; count leading zeros
|
|
# cmovne r18, #31, r0
|
|
# cmoveq r18, r17, r18
|
|
# ;stall
|
|
# extwl r18, #2, r1
|
|
# ;stall
|
|
# cmovne r1, r1, r18
|
|
# cmovne r1, #16, r1
|
|
# ;stall
|
|
# subq r0, r1, r0
|
|
# extbl r18, #1, r1
|
|
# ;stall
|
|
# cmovne r1, r1, r18
|
|
# cmovne r1, #8, r1
|
|
# ;stall
|
|
# subq r0, r1, r0
|
|
# andnot r18, #^x0f, r1
|
|
# cmovne r1, r1, r18
|
|
# cmovne r1, #4, r1
|
|
# ;stall
|
|
# subq r0, r1, r0
|
|
# andnot r18, #^x33, r1
|
|
# cmovne r1, r1, r18
|
|
# cmovne r1, #2, r1
|
|
# ;stall
|
|
# subq r0, r1, r0
|
|
# andnot r18, #^x55, r1
|
|
# cmovne r1, #1, r1
|
|
# ;stall
|
|
# subq r0, r1, r0
|
|
#===
|
|
|
|
# R0 contains number of leading zero bits in the divisor.
|
|
|
|
sll r17, r0, r17 # Normalize: MSB is set.
|
|
|
|
# Now break divisor into pieces a+x, where a is the leading
|
|
# 9 bits, rounded, and x is the rest. Use a linear
|
|
# approximation for 1/divisor = 1/a - x/a^2 [+ x^2/a^3 -...]
|
|
#
|
|
srl r17, 64-10, r1 # Keep 10 bits of divisor
|
|
#stall
|
|
addq r1, 1, r1 # Round to form 'a'
|
|
andnot r1, 1, r1
|
|
s8addq r1, r27, r27 # Index table of 1/a and 1/a^2
|
|
sll r1, 64-10, r1 # shift 'a' to match divisor
|
|
ldq r18, (r27) # Load QW containing 1/a^2
|
|
subq r1, r17, r1 # -x = a - divisor
|
|
beq r1, d64_easy # Use table directly if x=0
|
|
inswl r18, 6, r18 # position 1/a^2
|
|
blt r1, d64_sub # correct for sign of -x
|
|
umulh r1, r18, r1 # -x/a^2
|
|
ldq r27, 8(r27) # Load QW containing 1/a - 1
|
|
br r31, d64_cont
|
|
|
|
d64_sub:umulh r1, r18, r1 # -x/a^2
|
|
ldq r27, 8(r27) # load QW containing 1/a - 1
|
|
# 2 stalls
|
|
s4addq r18, 0, r18
|
|
subq r27, r18, r27 # correct for sign of -x
|
|
d64_cont:
|
|
# many stalls
|
|
s4addq r1, r27, r18 # 1/divisor approx= 1/a - x/a^2
|
|
|
|
# Now one or two Newton iterations to get 24 or 56 good bits of the inverse.
|
|
# Each computes inv = inv * (2 - inv*divisor). We could skip out early
|
|
# here or above if the dividend and/or quotient is small enough for the
|
|
# amount of precision we've developed...
|
|
#
|
|
# We handle quadwords with the radix point on the left. The divisor has
|
|
# been normalized to the range 0.5 < divisor < 1.0; the inverses are in
|
|
# the range 1.0 < inverse < 2.0, and are represented without the leading 1.
|
|
#
|
|
umulh r18, r17, r1 # (inv0 - 1) * divisor
|
|
# many stalls
|
|
addq r1, r17, r1 # add hidden bit * divisor
|
|
negq r1, r1 # 2 - inv0*divisor, very near 1.0
|
|
umulh r18, r1, r27 # (inv0 - 1) * (2 - inv0*divisor)
|
|
cmovlt r1, 0, r18 # keep inv0 if (2-inv0*divisor) > 1.0
|
|
#stall
|
|
addq r18, r1, r1 # add it to hidden bit * (2-inv0*divisor)
|
|
# many stalls
|
|
addq r27, r1, r18 # inv1 = inv0 * (2 - inv0*divisor)
|
|
|
|
umulh r18, r17, r1 # (inv1 - 1) * divisor
|
|
# many stalls
|
|
addq r1, r17, r1 # add hidden bit * divisor
|
|
negq r1, r1 # 2 - inv1*divisor, very near 1.0
|
|
umulh r18, r1, r27 # (inv1 - 1) * (2 - inv1*divisor)
|
|
cmovlt r1, 0, r18 # keep inv1 if (2-inv1*divisor) > 1.0
|
|
addq r18, r1, r1 # add it to hidden bit * (2-inv1*divisor)
|
|
# many stalls
|
|
addq r27, r1, r1 # inverse = inv1 * (2 - inv1*divisor)
|
|
umulh r1, r16, r18 # dividend * (1/divisor - 1)
|
|
srl r17, r0, r17 # un-normalize divisor
|
|
negq r0, r0
|
|
subq r0, 8, r0 # how far right after first byte
|
|
# many stalls
|
|
addq r18, r16, r18 # add hidden bit * dividend
|
|
cmpult r18, r16, r1 # did it carry?
|
|
srl r18, 8, r18 # start to shift
|
|
sll r1, 56, r1 # position the carry
|
|
#stall
|
|
addq r1, r18, r1 # add the carry
|
|
srl r1, r0, r0 # final shift
|
|
mulq r17, r0, r1 # try out this quotient
|
|
# many stalls
|
|
subq r16, r1, r1 # form remainder
|
|
cmpule r17, r1, r18 # must be less than divisor
|
|
subq r1, r17, r27
|
|
cmovne r18, r27, r1 # if not, decrement remainder
|
|
addq r0, r18, r0 # and increment quotient
|
|
ret r31, (r28) # done
|
|
|
|
d64_easy:
|
|
ldq r1, 8(r27) # get 1/divisor, except hidden bit
|
|
srl r17, r0, r17 # un-normalize divisor again
|
|
blt r18, d64_pow2 # skip if power of 2
|
|
umulh r1, r16, r18 # dividend/divisor
|
|
negq r0, r0 # how far right to shift
|
|
and r16, 0x0ff, r1 # pieces of dividend
|
|
subq r0, 8, r0 # how far right after first byte
|
|
srl r16, 8, r27
|
|
# many stalls
|
|
addq r18, r1, r1 # add low piece of dividend, no carry
|
|
srl r1, 8, r1 # make room for high piece
|
|
#stall
|
|
addq r1, r27, r1 # finish adding hidden bit * dividend
|
|
srl r1, r0, r0 # final shift
|
|
mulq r17, r0, r1 # need to compute remainder too
|
|
# many stalls
|
|
subq r16, r1, r1
|
|
ret r31, (r28) # done
|
|
|
|
d64_pow2:
|
|
not r0, r0 # how far right to shift quotient
|
|
subq r17, 1, r1 # mask for remainder
|
|
srl r16, r0, r0 # shift for quotient
|
|
and r16, r1, r1 # get remainder
|
|
ret r31, (r28) # done
|
|
|
|
d64end: mov r16, r1 # Remainder to r1 for small quotient case
|
|
ret r31, (r28) # Not a real software procedure return
|
|
|
|
# long ots_div_l_o(long dividend, long divisor)
|
|
# signed 64 bit division support, overflow detection
|
|
#
|
|
#.align 4
|
|
.globl _OtsDivide64Overflow
|
|
.aent _OtsDivide64Overflow
|
|
_OtsDivide64Overflow:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_l_o>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
not r17, r1 # is the divisor -1?
|
|
bne r1, dl_skip # continue if not
|
|
negqv r16, r0 # q = -dividend, oflow on ^x800000000 00000000
|
|
ret r31, (r26)
|
|
nop
|
|
|
|
# long ots_div_l(long dividend, long divisor)
|
|
# signed 64 bit division support, no overflow detection
|
|
#
|
|
.globl _OtsDivide64
|
|
.aent _OtsDivide64
|
|
_OtsDivide64:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_l>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
dl_skip:
|
|
xor r16, r17, r19 # sign bit = result needs to be complimented (here to handle -maxint correctly)
|
|
dl_retry:
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
ble r17, dl_notpos # not a positive divisor case
|
|
addq r17, r17, r0 # compute divisor*2
|
|
negq r16, r18 # part 1 of abs(dividend) -> r18
|
|
bgt r28, dl_lrgdiv # branch if large divisor
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
srl r16, 33, r1 # can this be handled via a 32 bit case?
|
|
cmpule r17, r18, r0 # divisor <= dividend?
|
|
bne r1, dl_64bit # does this need to be a real 64 bit case?
|
|
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
|
|
beq r0, dl_end # if not, result is zero
|
|
ldq r27, recip32_o(r27) # load 32b approximate reciprocal
|
|
sra r19, 63, r19 # get 0/-1
|
|
blt r27, dl_smpwr2 # skip umulh for powers of 2 specially
|
|
umulh r18, r27, r0 # start multiplication
|
|
beq r19, dl_end # if compliment not required, let umulh "hang out"
|
|
negq r0, r0 # compliment case
|
|
ret r31, (r26) #
|
|
dl_64bit:
|
|
cmovge r16, r16, r18 # part 2 abs. val of the dividend -> r18
|
|
beq r0, dl_end # if not, result is zero
|
|
ldq r1, recip64_o(r27) # load approximate reciprocal
|
|
sra r19, 63, r19 # get 0/-1
|
|
ldq r27, shift_o(r27) # load shift count (low 6 bits are all that matters)
|
|
beq r1, dl_smpwr2 # skip umulh for powers of 2 specially
|
|
umulh r18, r1, r0 # start multiplication
|
|
addq r0, r18, r18 # add hidden bit
|
|
dl_smpwr2:
|
|
srl r18, r27, r18 # shift the result into place
|
|
xor r18, r19, r18 # conditionally compliment
|
|
subq r18, r19, r0 # and increment for the final value
|
|
dl_end: ret r31, (r26) #
|
|
|
|
# Zero or negative divisor case. If just a negative divisor,
|
|
# compliment both dividend and divisor and do things again.
|
|
dl_notpos:
|
|
beq r17, divzer # division by zero
|
|
negq r17, r17 # |divisor|, note that 0x80000000 00000000 still appears negative
|
|
negq r16, r16 # compliment dividend
|
|
bgt r17, dl_retry # dispatch back for normal case (not 0x80000000 00000000 or 0)
|
|
cmpeq r16, r17, r0 # -maxint/-maxint = 1, all others 0
|
|
ret r31, (r26) # done
|
|
|
|
# Large divisor for signed 64/64 case
|
|
#
|
|
dl_lrgdiv:
|
|
sra r19, 63, r19 # get 0/-1
|
|
cmovlt r16, r18, r16 #
|
|
bsr r28, div64 # go use core routine
|
|
xor r0, r19, r0 # conditionally compliment
|
|
subq r0, r19, r0 # and increment for the final value
|
|
ret r31, (r26) # done
|
|
|
|
|
|
# long ots_rem_l(long dividend, long divisor)
|
|
# signed 64 bit remainder support
|
|
#
|
|
#.align 4
|
|
.globl _OtsRemainder64
|
|
.aent _OtsRemainder64
|
|
_OtsRemainder64:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_rem_l>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
negq r17, r18 # first part of abs(divisor)
|
|
cmovlt r17, r18, r17 # second part of abs(divisor)
|
|
subq r17, 1, r1 # start checking for power of 2
|
|
and r17, r1, r0 # finish check for power of 2
|
|
sra r16, 63, r19 # get -1/0 if dividend was negative
|
|
negq r16, r18 # first part of abs(dividend)
|
|
cmovlt r16, r18, r16 # second part of abs(dividend)
|
|
beq r0, rl_pwr2 # for powers of two, simply do a mask (not power of 2 include 0 and 80000000)
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
bgt r28, rl_lrgdiv # branch if large divisor
|
|
addq r17, r17, r0 # compute divisor*2 for table lookup
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
ldq r1, recip64_o(r27) # load approximate reciprocal
|
|
ldq r18, shift_o(r27) # load shift amount
|
|
umulh r16, r1, r0 # multiplication for division step
|
|
addq r0, r16, r0 # add hidden bit
|
|
srl r0, r18, r0
|
|
mulq r0, r17, r0 # multiply back to get value to subtract
|
|
subq r16, r0, r0 # get abs of final result
|
|
xor r0, r19, r0 # start compliment if original dividend was <0
|
|
subq r0, r19, r0 # finish compliement
|
|
ret r31, (r26) # and return
|
|
|
|
# Handle powers of 2, including 0 and 80000000 00000000
|
|
rl_pwr2:
|
|
negq r16, r18 # first part of abs(dividend)
|
|
cmovlt r16, r18, r16 # second part of abs(dividend)
|
|
and r16, r1, r0 # use the divisor-1 mask in r1
|
|
beq r17, divzer # division by zero
|
|
xor r0, r19, r0 # start compliment if original dividend was <0
|
|
subq r0, r19, r0 # finish compliement
|
|
ret r31, (r26)
|
|
|
|
rl_lrgdiv:
|
|
bsr r28, div64 # use the core routine getting the remainder in r1
|
|
xor r1, r19, r0 # start compliment if original dividend was <0
|
|
subq r0, r19, r0 # finish complement
|
|
ret r31, (r26)
|
|
|
|
# long ots_mod_l(long dividend, long modulus)
|
|
# signed 64 bit modulus support
|
|
#
|
|
# This entry could be MUCH more optimized. It doesn't even try to use
|
|
# UMULH division currently... (A casualty of time-to-market.)
|
|
# Note that mod is only used by Ada and PL/I.
|
|
#
|
|
#.align 4
|
|
.globl _OtsModulus64
|
|
.aent _OtsModulus64
|
|
_OtsModulus64:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_rem_l>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
negq r17, r18 # first part of abs(divisor)
|
|
cmovge r17, r17, r18 # second part of abs(divisor)
|
|
subq r18, 1, r1 # start checking for power of 2
|
|
beq r17, divzer # check for 0
|
|
and r18, r1, r0 # second part of power-of-2 check
|
|
beq r0, ml_p2 # for powers of two, simply do a mask
|
|
# (note that the power-of-2 case MUST be used to handle
|
|
# the -maxint case due to the way the fix-up info is
|
|
# saved across the core routine call)
|
|
xor r16, r17, r28 # get xor of signs
|
|
clr r19 # don't need a bias if dividend and divisor have same sign
|
|
cmovlt r28, r17, r19 # bias is original divisor for different sign case
|
|
and r16, r17, r28 # if both dividend & divisor were neg. need to negate result
|
|
mov r18, r17 # move abs(divisor) into r17
|
|
negq r16, r18 # first part of abs(dividend)
|
|
cmovlt r16, r18, r16 # second part of abs(dividend)
|
|
cmplt r28, r31, r0 # get 1 if both operands were <0
|
|
sll r0, 63, r0 # get bit as the high bit
|
|
bis r0, r19, r19 # and MERGE with bias (0 -> no fixup, -maxint -> negate result,
|
|
# divisor > 0 - subtract remainder if non-zero, divisor < 0 -
|
|
# add remainder if non-zero)
|
|
bsr r28, div64 # use the core routine getting the remainder in r1
|
|
cmoveq r1, r31, r19 # don't do any fix-up if the remainder was zero
|
|
addq r19, r19, r18 # check to see if this is the negative/negative case, which just gets a negated remainder
|
|
subq r19, 1, r28 # wrap -maxint to positive
|
|
negq r1, r0 # move negated value, may abort later
|
|
cmovlt r28, r1, r0 # if both positive, or negative divisor, keep positive remainder
|
|
cmoveq r18, r31, r19 # now that negation is done, treat -maxint case as 0
|
|
addq r19, r0, r0 # add any bias (original divisor or 0)
|
|
ret r31, (r26) # and return
|
|
|
|
ml_p2: cmovge r17, r31, r17 # no bias if divisor was >= 0
|
|
and r16, r1, r1 # use the divisor-1 mask that's already in r1
|
|
cmoveq r1, r31, r17 # use zero if result was zero
|
|
addq r17, r1, r0 # do any biasing
|
|
ret r31, (r26) # and return
|
|
|
|
|
|
# unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)
|
|
# unsigned 64 bit division support
|
|
#
|
|
nop #.align 4
|
|
.globl _OtsDivide64Unsigned
|
|
.aent _OtsDivide64Unsigned
|
|
_OtsDivide64Unsigned:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_div_ul>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
blt r17, dul_big # big divisors can (must) be handled by a simple comparison
|
|
addq r17, r17, r18 # compute divisor*2
|
|
srl r16, 33, r19 # can this be handled via the fast path for 31 bit dividends?
|
|
beq r17, divzer # check for 0
|
|
s8addq r18, r27, r18 # finish computing table entry addr (table addr+divisor*16)
|
|
bgt r28, dul_lrgdiv # branch if large divisor
|
|
cmpule r17, r16, r0 # is the dividend < divisor?
|
|
bne r19, dul_64bit # if the dividend doesn't fit in 31 bits, use the larger umulh form
|
|
ldq r27, recip32_o(r18) # load approximate 32b reciprocal & shift count
|
|
beq r0, dul_end # fast out for divisor > dividend
|
|
blt r27, dul_smpwr2 # go handle powers of 2 specially
|
|
umulh r16, r27, r0 # 32b recip
|
|
ret r31, (r26) #
|
|
|
|
# the 64 bit case is at a disadvantage to the 32b case because it needs
|
|
# a fix-up at the end, which prevents the latency of the umulh from
|
|
# being partially absorbed by the procedure return and anything that
|
|
# immediately follows that doesn't interlock.
|
|
nop
|
|
dul_64bit:
|
|
ldq r1, recip64_o(r18) # load approximate 64b reciprocal
|
|
ldq r27, shift_o(r18) # load shift count (low 6 bits are all that matters)
|
|
beq r0, dul_end # fast out for divisor > dividend
|
|
beq r1, dul_smpwr2 # go handle powers of 2 specially
|
|
umulh r16, r1, r0 # start multiplication for division step
|
|
zap r16, 0x0f, r18 # split dividend into two parts
|
|
zapnot r16, 0x0f, r16
|
|
srl r18, r27, r18 # position the high part
|
|
addq r0, r16, r0 # add hidden * low dividend (no carry)
|
|
srl r0, r27, r0 # shift into place
|
|
addq r0, r18, r0 # add hidden * high dividend
|
|
ret r31, (r26)
|
|
|
|
dul_smpwr2:
|
|
srl r16, r27, r0 # shift the result into place
|
|
dul_end: ret r31, (r26) #
|
|
|
|
dul_lrgdiv:
|
|
bsr r28, div64 # use the core routine
|
|
ret r31, (r26)
|
|
|
|
# divisor with the sign bit set. two possible results,
|
|
# 1 if divisor <= dividend, or 0 otherwise
|
|
dul_big:
|
|
cmpule r17, r16, r0
|
|
ret r31, (r26)
|
|
|
|
|
|
# long unsigned ots_rem_ul(long unsigned dividend, long unsigned divisor)
|
|
# unsigned 64 bit remainder support
|
|
#
|
|
#.align 4
|
|
.globl _OtsRemainder64Unsigned
|
|
.aent _OtsRemainder64Unsigned
|
|
_OtsRemainder64Unsigned:
|
|
#ifdef VMS
|
|
ldq r27, <ots_div_addr-ots_rem_ul>(r27)# start loading address of division data area
|
|
#endif
|
|
#ifdef OSF
|
|
ldgp gp, 0(r27) # load the global pointer
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # start loading address of the division data area
|
|
#endif
|
|
#ifdef WNT
|
|
.frame sp, 0, r26
|
|
lda r27, _OtsDivData # load the division data table address
|
|
#endif
|
|
lda r28, -table_max(r17) # test for table lookup
|
|
subq r17, 1, r1 # first part of power-of-2 check
|
|
blt r17, rul_big # big divisors can (must) be handled by a simple comparison
|
|
and r17, r1, r18 # second part of power-of-2 check
|
|
bgt r28, rul_lrgdiv # branch if large divisor
|
|
addq r17, r17, r0 # compute divisor*2 for table lookup
|
|
beq r18, rul_pwr2 # if zero, divisor is a power of 2
|
|
s8addq r0, r27, r27 # finish computing table entry addr (table addr+divisor*16)
|
|
ldq r1, recip64_o(r27) # load approximate reciprocal
|
|
cmpult r16, r17, r18 # is the dividend < divisor?
|
|
bne r18, rul_lss # if so, fast exit
|
|
ldq r19, shift_o(r27) # load the shift count
|
|
umulh r16, r1, r0 # multiplication for division step
|
|
blt r16, rul_carry # careful handling if >= 2^63
|
|
addq r0, r16, r0 # add hidden bit * dividend
|
|
srl r0, r19, r0
|
|
mulq r0, r17, r0 # multiply back to get value to subtract
|
|
subq r16, r0, r0
|
|
ret r31, (r26) # and return
|
|
|
|
rul_carry:
|
|
zap r16, 0x0f, r18 # split dividend into two parts
|
|
zapnot r16, 0x0f, r1
|
|
srl r18, r19, r18 # position the high part
|
|
addq r0, r1, r0 # add hidden * low dividend (no carry)
|
|
srl r0, r19, r0 # shift into place
|
|
addq r0, r18, r0 # add hidden * high dividend
|
|
mulq r0, r17, r0 # multiply back to get value to subtract
|
|
subq r16, r0, r0
|
|
ret r31, (r26)
|
|
|
|
rul_pwr2:
|
|
beq r17, divzer # check for 0
|
|
and r16, r1, r0 # use x-1 to mask
|
|
ret r31, (r26)
|
|
|
|
rul_lss:
|
|
mov r16, r0
|
|
ret r31, (r26)
|
|
|
|
# divisors with the sign bit set. two possible results,
|
|
# dividend if dividend < divisor, or dividend-divisor otherwise
|
|
rul_big:
|
|
cmpult r16, r17, r1
|
|
subq r16, r17, r0
|
|
cmovne r1, r16, r0
|
|
ret r31, (r26)
|
|
|
|
nop
|
|
rul_lrgdiv:
|
|
bsr r28, div64 # use the core routine getting the remainder in r1
|
|
mov r1, r0 # return remainder as the result in r0
|
|
ret r31, (r26)
|
|
|
|
|
|
# Division-by-zero handling
|
|
# (forward branch from all routines, out of the way here as well.)
|
|
#
|
|
divzer: lda r16, GEN_INTDIV(r31) # load GENTRAP code for division by zero
|
|
clr r0 # return 0 for the result
|
|
clr r1 #
|
|
#ifdef VMS
|
|
gentrap # signal the error
|
|
#endif
|
|
#ifdef OSF
|
|
call_pal PAL_gentrap
|
|
#endif
|
|
#ifdef WNT
|
|
# Since I couldn't find this in a header file anywhere for NT...
|
|
#define PAL_gentrap 0xaa
|
|
call_pal PAL_gentrap
|
|
#endif
|
|
ret r31, (r26) # return (in case someone tries to continue)
|
|
|
|
.set at
|
|
.set reorder
|
|
.end _OtsDiv
|