windows-nt-4.0/private/sdktools/vctools/crt/crtw32/misc/alpha/otsdiv.s



								 #	Copyright 1992, Digital Equipment Corporation

								 #

								 # This software is furnished under a license and may be used and  copied

								 # only  in  accordance  with  the  terms  of  such  license and with the

								 # inclusion of the above copyright notice.  This software or  any  other

								 # copies  thereof may not be provided or otherwise made available to any

								 # other person.  No title to and ownership of  the  software  is  hereby

								 # transferred.

								 #

								 # The information in this software is subject to change  without  notice

								 # and  should  not  be  construed  as  a commitment by Digital Equipment

								 # Corporation.

								 #

								 # Digital assumes no responsibility for the use or  reliability  of  its

								 # software on equipment which is not supplied by Digital.

								 #

								 #   008	  17 Jun 1992  KDG/wbn	Most of initial tailored version.  (See

								 #				commentary below.)

								 #

								 #   009	   4 Jul 1992	KDG	Continue work on initial tailored version,

								 #				including bugfixes and mod entry points

								 #

								 #   010	  15 Jul 1992	KDG	- Final touches for V1 (other than any bugfixes)

								 #				- .aligns commented out to allow older assembler versions

								 #

								 #   011	  16 Jul 1992	KDG	- Bugfix for ots_div_l for -maxint dividend

								 #				- OSF-only source changes for BL7

								 #

								 #   012	  10 Aug 1992	KDG	Fix overflow division entry points

								 #

								 #   013	  23 Sep 1992	KDG	Add case-sensitive entry names

								 #

								 #   014	   4 Jan 1993	KDG	Tweak for OSF assembler

								 #

								 #   015	  26 Jan 1993	KDG	Add underscore prefix, OSF uses CS names

								 #

								 #   016    5 Apr 1993	WBN	Speed up core 64-bit, shrink table entry to 2 QWs


								 #++

								 #   Entry points defined in this module:

								 #

								 #   -- 32 bit division/remainder support

								 #	unsigned ots_rem_ui(unsigned dividend, unsigned divisor)

								 #	unsigned ots_div_ui(unsigned dividend, unsigned divisor)

								 #	int ots_mod_i(int dividend, int modulus)

								 #	int ots_rem_i(int dividend, int divisor)

								 #	int ots_div_i_o(int dividend, int divisor)

								 #	int ots_div_i(int dividend, int divisor)    | "hot spot"

								 #	{core routine - div32}			    |

								 #

								 #   -- 64 bit division support

								 #	{core routine - div64}			    | (uses div32 for 32b cases)

								 #	long ots_div_l_o(long dividend, long divisor)

								 #	long ots_div_l(long dividend, long divisor) | "hot spot"

								 #	long ots_rem_l(long dividend, long divisor)

								 #	long ots_mod_l(long dividend, long modulus)

								 #	unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)

								 #	unsigned long ots_rem_ul(unsigned long dividend, unsigned long divisor)

								 #

								 #	Special conventions: No stack space, r0-r1, r16-r19 and r26-r28 ONLY.

								 #	(Warning: The auto-loader potentially takes some regs across

								 #	the call if this is being used in a shared lib. environment.)

								 #

								 #	    NOTE: This set of routines may start using stack space at some

								 #	    future point in time.

								 #

								 #   -- Possible future entry points include:

								 #	(These all return results in r0/r1)

								 #	{int quotient, int remainder} ots_div_mod_i(int dividend, int divisor)

								 #	{int quotient, int remainder} ots_div_mod_i_o(int dividend, int divisor)

								 #	{int quotient, int remainder} ots_div_rem_i(int dividend, int divisor)

								 #	{int quotient, int remainder} ots_div_rem_i_o(int dividend, int divisor)

								 #	{unsigned quotient, unsigned remainder} ots_div_rem_ui(unsigned dividend, unsigned divisor)

								 #

								 #	{long quotient, long remainder} ots_div_mod_l(long dividend, long divisor)

								 #	{long quotient, long remainder} ots_div_mod_l_o(long dividend, long divisor)

								 #	{long quotient, long remainder} ots_div_rem_l(long dividend, long divisor)

								 #	{long quotient, long remainder} ots_div_rem_l_o(long dividend, long divisor)

								 #	{unsigned long quotient, unsigned long remainder}

								 #		ots_div_rem_ul(unsigned long dividend, unsigned long divisor)

								 #

								 #

								 # General commentary:

								 #

								 #   This is an attempt at a fairly high performance version using relatively

								 #   straightforward algorithms.  Note that the code is intended to be scheduled

								 #   well for EV4, but still reasonably for LCA/EV5.

								 #

								 #   Also, note that there was only so much time available for this, so it

								 #   is far from "perfect".  "Better is the enemy of done"...

								 #

								 #   Possible future areas of improvement (and unfinished business):

								 #

								 #	- Another possible way of doing things for the "slow" (divnn cases)

								 #	  is to use an approximate inverse and convergence.  (Given the speed

								 #	  of the multiplier on EV4, and given "time to market", this wasn't

								 #	  done for V1.)  I have some mail with the algorithm from Bob Gries

								 #	  (through Scott Robinson).

								 #

								 #	- When the divisor is too large for the table, but has n low-order zero

								 #	  bits, see if divisor/2^n fits in the table, and use that entry with

								 #	  dividend/2^n

								 #

								 #	- Use UMULH for the 'mod' routines.

								 #

								 #   This version can do a table lookup division (divisors <= table_max)

								 #   in roughly 32 cycles on an EV4 (with cache hits for all loads), of which

								 #   21 are for the umulh.  There is a strong bias toward the table-lookup case.

								 #   Note that for many cases, the umulh is the last thing before the return,

								 #   so the multiply can occur in parallel with the procedure return.

								 #   (It is interesting that the R3000 hardware divide instruction takes 33

								 #   cycles and the R4000 takes 76(!) ...)  Small powers of 2 are retired in

								 #   roughly 20 cycles.  Larger divisors take considerably longer at this point.

								 #


								#include	"ots_defs.hs"


								#ifdef	OSF

									# to get the PAL_gentrap literal

								#include	<alpha/pal.h>

								#endif


								 # Data area description

								 #

								 #	The data area "ots_div_data" is an array of structures, indexed

								 #	by the divisor value, with each array entry being 16 bytes in size

								 #	formatted as follows:

								 #

								 #	 6

								 #	 3                                                       6     0

								 #	+-------+-------+-------+-------+-------+-------+-------+-------+

								 #	|               32 bit reciprocal (58 bits)               |shift|

								 #	+-------+-------+-------+-------+-------+-------+-------+-------+

								 #	|                       64 bit reciprocal                       |

								 #	+---------------------------------------------------------------+

								 #

								 #	The 64-bit reciprocal has the leading '1' bit omitted, so it provides

								 #	65 bits of precision -- enough to handle unsigned 64-bit values.

								 #

								 #	The first longword contains the 6-bit shift amount needed to handle

								 #	64-bit cases and powers of two.

								 #

								 #	The 32-bit reciprocal has the shift count built in, so a UMULH gives

								 #	the correct quotient without shifting.  The reciprocal needs 33 bits

								 #	of precision.  The 6-bit shift amount is noise in the reciprocal that

								 #	can be ignored.

								 #

								 #	(Oh, you want proof?)  For divisors up to 2^k, we store k-1 leading

								 #	zero bits, 33 bits of fraction, (25-k) more bits of fraction, and

								 #	6 bits of noise.  The standard method would round at the 33rd fraction

								 #	bit.  We need to ensure that the value actually stored is geq the

								 #	infinite reciprocal, but leq the standard value.  For divisors up to

								 #	2^k, there will be a zero bit somewhere in the k bits below the 33rd,

								 #	so as long as we round below the (33+k)th bit, the rounded value

								 #	plus any noise is still less than the standard value.  This requires

								 #	k < 12.

								 #

								 #   The actual data is declared in ots_div_data_alpha.

								 #

								 # Offsets to the various fields in the data structure

								 #

								#define shift_o 0

								#define recip64_o 8

								#define recip32_o 0

								 #

								 # shift_o and recip32_o are both 0 because the 6-bit shift count sits in the

								 # low bits of the same quadword that holds the 32-bit reciprocal.

								 #

								 # Note that the shift/add ops used to compute the table entry addresses

								 # "know" that the table entry size is 16 bytes.  (i.e. addq -> s8addq -> ldq)

								 # By changing the first instruction, it's fairly easy to change the

								 # table entry size to 24, 32, or 40 bytes (using s4add/sub), or

								 # 56/64/72 bytes using s8add/sub, should that be desirable.


								 # Maximum divisor present in the table.  The 32-bit entry points compute

								 # divisor - table_max and send positive results (larger divisors) to the

								 # bit-at-a-time core routine instead of the table.

								 #

								#define table_max 512


								 # Division by zero gentrap code

								 # (presumably consumed by the divzer handler, which is not in this part

								 # of the file - confirm there)

								 #

								#define GEN_INTDIV -2


								 # Address of division data area (shared by all entry points)

								 #

								#ifdef	VMS

									# VMS only: a quadword in the ots_link psect that holds the address of

									# ots_div_data.  The VMS variants of the entry points fetch it with an

									# ldq whose displacement is taken relative to r27.

									.psect	ots_link

								ots_div_addr:

									.address ots_div_data

									# Everything that follows is assembled into the ots_code psect.

									.psect	ots_code

								#endif


								 # Dummy entry point for the module

								 #

									.globl	_OtsDivide

									.ent	_OtsDivide

									# _OtsDivide has no instructions of its own: it opens the procedure that

									# all the .aent alternate entry points below belong to, and control

									# arriving here simply runs into the first of them.

								_OtsDivide:

									# noat:      r28 is used explicitly as a temporary and as the link

									#            register of the internal bsr calls.

									# noreorder: the instruction order below is hand scheduled and must

									#            be emitted exactly as written.

									.set noat

									.set noreorder

								#ifdef	OSF

									# zero-size frame, return address in r26

									.frame	sp, 0, r26

								#endif

								#ifdef	WNT

									# zero-size frame, return address in r26

									.frame	sp, 0, r26

								#endif


								 # unsigned ots_rem_ui(unsigned dividend, unsigned divisor)

								 # unsigned 32 bit remainder support

								 #

									#.align	4

									.globl	_OtsRemainder32Unsigned

									.aent	_OtsRemainder32Unsigned

									# Entry: r16 = dividend, r17 = divisor, r26 = return address.

									# Exit:  r0 = dividend rem divisor, computed as unsigned 32-bit values.

									# Writes r0, r1, r18, r27 and r28; the rui_lrgdiv path also overwrites

									# r16 and runs div32.

									# Dispatch order: divisor negative as a quadword -> rui_big,

									# divisor > table_max -> rui_lrgdiv, divisor 0 or a power of 2 ->

									# rui_pwr2, dividend < divisor -> rui_lss, otherwise the table

									# reciprocal is used.

								_OtsRemainder32Unsigned:

								#ifdef	VMS

									# NOTE(review): ots_rem_ui is not defined in the visible part of this

									# file (the label here is _OtsRemainder32Unsigned) - confirm for VMS.

									ldq	r27, <ots_div_addr-ots_rem_ui>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									lda	r28, -table_max(r17)	# r28 = divisor - table_max; > 0 means no table entry

									subl	r17, 1, r1		# first part of power-of-2 check (r1 = divisor-1, reused as a mask at rui_pwr2)

									blt	r17, rui_big		# big divisors can (must) be handled by a simple comparison

									and	r17, r1, r18		# second part of power-of-2 check (r18 = 0 for 0 and powers of 2)

									bgt	r28, rui_lrgdiv		# branch if large divisor (tested before the power-of-2 branch)

									addq	r17, r17, r0		# compute divisor*2 for table lookup

									beq	r18, rui_pwr2		# if zero, divisor is a power of 2 (or 0)

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									ldq	r1, recip32_o(r27)	# load approximate reciprocal

									cmpult	r16, r17, r18		# is the dividend < divisor?

									zap	r16, 0xF0, r0		# kill the propagated sign bit (r0 = dividend zero-extended to 64 bits)

									bne	r18, rui_lss		# if dividend < divisor, fast exit

									umulh	r0, r1, r0		# multiplication for division step (r0 = quotient)

									mull	r0, r17, r0		# multiply back to get value to subtract

									subl	r16, r0, r0		# remainder = dividend - quotient*divisor

									ret	r31, (r26)		# and return


									# Divisor is 0 or a power of 2 no larger than table_max.

								rui_pwr2:

									beq	r17, divzer		# check for 0 (0 also passes the power-of-2 test)

									and	r16, r1, r0		# use x-1 to mask

									ret	r31, (r26)


									# Dividend smaller than divisor: the remainder is the dividend itself.

								rui_lss:

									mov	r16, r0

									ret	r31, (r26)


									# Divisor larger than table_max: use the bit-at-a-time core routine.

								rui_lrgdiv:

									zap	r16, 0xf0, r16		# zero-extend the dividend

									bsr	r28, div32		# use the core routine getting the remainder in r1

									sextl	r1, r0			# return the remainder as a sign-extended longword

									ret	r31, (r26)


									# divisors with the sign bit set.  two possible results,

									# dividend if dividend < divisor, or dividend-divisor otherwise

								rui_big:

									cmpult	r16, r17, r1		# r1 = 1 if dividend < divisor (unsigned)

									subl	r16, r17, r0		# assume dividend - divisor ...

									cmovne	r1, r16, r0		# ... but keep the dividend when it was the smaller one

									ret	r31, (r26)


								 # unsigned ots_div_ui(unsigned dividend, unsigned divisor)

								 # unsigned 32 bit division support

								 #


									#.align	4

									.globl	_OtsDivide32Unsigned

									.aent	_OtsDivide32Unsigned

									# Entry: r16 = dividend, r17 = divisor, r26 = return address.

									# Exit:  r0 = dividend / divisor, computed as unsigned 32-bit values.

									# Writes r0, r1, r16, r18, r27 and r28; the dui_lrgdiv path also runs div32.

									# Dispatch order: divisor negative as a quadword -> dui_big, divisor 0 ->

									# divzer, divisor > dividend -> quotient 0, divisor > table_max ->

									# dui_lrgdiv, negative table entry (power of 2) -> dui_smpwr2,

									# otherwise a single umulh by the table reciprocal.

								_OtsDivide32Unsigned:

								#ifdef	VMS

									# NOTE(review): ots_div_ui is not defined in the visible part of this

									# file (the label here is _OtsDivide32Unsigned) - confirm for VMS.

									ldq	r27, <ots_div_addr-ots_div_ui>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									lda	r28, -table_max(r17)	# r28 = divisor - table_max; > 0 means no table entry

									blt	r17, dui_big		# big divisors can (must) be handled by a simple comparison

									addq	r17, r17, r18		# compute divisor*2

									cmpule	r17, r16, r0		# r0 = 1 if divisor <= dividend, else 0

									beq	r17, divzer		# check for 0

									s8addq	r18, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									beq	r0, dui_end		# fast out for divisor > dividend (r0 is already the zero quotient)

									bgt	r28, dui_lrgdiv		# branch if large divisor

									ldq	r1, recip32_o(r27)	# load approximate reciprocal

									zap	r16, 0xF0, r16		# kill the propagated sign bit

									blt	r1, dui_smpwr2		# go handle powers of 2 specially (their table entry is negative)

									umulh	r16, r1, r0		# start multiplication for division step

								dui_end:ret	r31, (r26)		# and return

									nop				# padding; presumably keeps the following code aligned

								dui_smpwr2:

									srl	r16, r1, r0		# shift the result into place (srl uses only the low 6 bits of r1)

									sextl	r0, r0			# reinsert sign if dividing by 1

									ret	r31, (r26)		#

								dui_lrgdiv:

									zap	r16, 0xf0, r16		# zero-extend the dividend

									bsr	r28, div32		# use the core routine; the quotient comes back in r0

									sextl	r0, r0			# make sure the result is in normal form for uint32

									ret	r31, (r26)


									# divisor with the sign bit set.  two possible results,

									# 1 if divisor <= dividend, or 0 otherwise

								dui_big:

									cmpule	r17, r16, r0		# the compare result is the quotient

									ret	r31, (r26)


								 # int ots_mod_i(int dividend, int modulus)

								 # signed 32 bit modulus support

								 #

								 # This entry could be MUCH more optimized.  It doesn't even try to use

								 # UMULH division currently...  (A casualty of time-to-market.)

								 # Note that mod is only used by Ada and PL/I.

								 #

									#.align	4

									.globl	_OtsModulus32

									.aent	_OtsModulus32

									# Entry: r16 = dividend, r17 = modulus, r26 = return address.

									# Exit:  r0 = dividend mod modulus; a nonzero result has the sign of

									#        the modulus.

									# Writes r0, r1, r16, r17, r18, r19, r27 and r28.  The reciprocal table

									# is not used here, so r27 is just a temporary.

									# A modulus whose magnitude is a power of 2 is handled by masking at

									# mi_p2; every other modulus goes through div32 on the absolute values,

									# followed by a fix-up selected by the code kept in r19.

								_OtsModulus32:

									negq	r17, r18		# first part of abs(divisor)

									cmovge	r17, r17, r18		# second part of abs(divisor)

									subq	r18, 1, r1		# start checking for power of 2

									beq	r17, divzer		# check for 0

									and	r18, r1, r0		# second part of power-of-2 check

									beq	r0, mi_p2		# for powers of two, simply do a mask

													# (note that the power-of-2 case MUST be used to handle

													# the -maxint case due to the way the fix-up info is

													# saved across the core routine call)

									xor	r16, r17, r28		# get xor of signs

									clr	r19			# no bias needed if dividend and divisor have same sign

									cmovlt	r28, r17, r19		# bias is original divisor for different sign case

									and	r16, r17, r27		# if both dividend & divisor were neg. need to negate result

									mov	r18, r17		# move abs(divisor) into r17

									negq	r16, r18		# first part of abs(dividend)

									cmovlt	r16, r18, r16		# second part of abs(dividend)

									cmplt	r27, r31, r0		# get 1 if both operands were <0

									sll	r0, 63, r0		# get bit as the high bit

									bis	r0, r19, r19		# and MERGE with bias (0 -> no fixup, -maxint -> negate result,

													# divisor > 0 - subtract remainder if non-zero, divisor < 0 -

													# add remainder if non-zero)

									bsr	r28, div32		# use the core routine getting the remainder in r1

									cmoveq	r1, r31, r19		# skip every fix-up if the remainder was zero

									addq	r19, r19, r18		# check to see if this is the negative/negative case, which just gets a negated remainder

									subq	r19, 1, r28		# wrap -maxint to positive (r28 < 0 only for r19 = 0 or a negative divisor)

									negl	r1, r0			# move negated value, may abort later

									cmovlt	r28, r1, r0		# if both positive, or negative divisor, keep positive remainder

									cmoveq	r18, r31, r19		# now that negation is done, treat -maxint case as 0

									addl	r19, r0, r0		# add any bias (original divisor or 0)

									ret	r31, (r26)		# and return


									# Power-of-2 modulus: r1 = abs(modulus)-1 is the mask, r17 = original modulus.

								mi_p2:	cmovge	r17, r31, r17		# no bias if divisor was >= 0

									and	r16, r1, r1		# use the divisor-1 mask that is already in r1

									cmoveq	r1, r31, r17		# use zero if result was zero

									addl	r17, r1, r0		# do any biasing, and ensure the result is sign ext

									ret	r31, (r26)		# and return


								 # int ots_rem_i(int dividend, int divisor)

								 # signed 32 bit remainder support

								 #

									#.align	4

									.globl	_OtsRemainder32

									.aent	_OtsRemainder32

									# Entry: r16 = dividend, r17 = divisor, r26 = return address.

									# Exit:  r0 = dividend rem divisor; the magnitude is computed from the

									#        absolute values and the sign of the dividend is then reapplied.

									# Writes r0, r1, r16, r17, r18, r19, r27 and r28.

									# r19 holds 0 or -1 (dividend negative) and drives the final

									# conditional negate (xor then subtract) on every path.

								_OtsRemainder32:

								#ifdef	VMS

									# NOTE(review): ots_rem_i is not defined in the visible part of this

									# file (the label here is _OtsRemainder32) - confirm for VMS.

									ldq	r27, <ots_div_addr-ots_rem_i>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									negq	r17, r18		# first part of abs(divisor)

									cmovlt	r17, r18, r17		# second part of abs(divisor)

									subq	r17, 1, r1		# start checking for power of 2

									and	r17, r1, r0		# finish check for power of 2

									sra	r16, 63, r19		# get -1/0 if dividend was negative

									negq	r16, r18		# first part of abs(dividend)

									cmovlt	r16, r18, r16		# second part of abs(dividend)

									beq	r0, ri_pwr2		# for powers of two, simply do a mask (this test also catches 0 and 80000000)

									lda	r28, -table_max(r17)	# test for table lookup

									bgt	r28, ri_lrgdiv		# branch if large divisor

									addq	r17, r17, r0		# compute divisor*2 for table lookup

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									ldq	r1, recip32_o(r27)	# load approximate reciprocal

									umulh	r16, r1, r0		# multiplication for division step

									mull	r0, r17, r0		# multiply back to get value to subtract

									subl	r16, r0, r0		# get abs of final result

									xor	r0, r19, r0		# start complement if original dividend was <0

									subl	r0, r19, r0		# finish complement

									ret	r31, (r26)		# and return


									# Handle powers of 2, including 0 and 80000000

								ri_pwr2:

									and	r16, r1, r0		# use the divisor-1 mask in r1

									beq	r17, divzer		# division by zero

									xor	r0, r19, r0		# start complement if original dividend was <0

									subl	r0, r19, r0		# finish complement

									ret	r31, (r26)


									nop				# padding; presumably keeps ri_lrgdiv aligned

								ri_lrgdiv:

									bsr	r28, div32		# use the core routine getting the remainder in r1

									xor	r1, r19, r0		# start complement if original dividend was <0

									subl	r0, r19, r0		# finish complement

									ret	r31, (r26)


								 # int ots_div_i_o(int dividend, int divisor)

								 # signed 32 bit division support, overflow detection

								 #

									#.align	4

									.globl	_OtsDivide32Overflow

									.aent	_OtsDivide32Overflow

									# Entry: r16 = dividend, r17 = divisor, r26 = return address.

									# Exit:  r0 = dividend / divisor.

									# Only a divisor of -1 is handled here, with neglv so that negating

									# 0x80000000 raises an integer overflow.  Every other divisor branches

									# to di_skip, which lies inside _OtsDivide32 after that entry point has

									# loaded the table address; that is why r27 is loaded here as well.

								_OtsDivide32Overflow:

								#ifdef	VMS

									# NOTE(review): ots_div_i_o is not defined in the visible part of this

									# file (the label here is _OtsDivide32Overflow) - confirm for VMS.

									ldq	r27, <ots_div_addr-ots_div_i_o>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									not	r17, r1			# is the divisor -1?  (r1 = 0 only for -1)

									bne	r1, di_skip		# continue if not

									neglv	r16, r0			# quotient = -dividend, overflow on ^x80000000

									ret	r31, (r26)


								 # int ots_div_i(int dividend, int divisor)

								 # signed 32 bit division support, no overflow detection

								 #

									nop	#.align	4

									.globl	_OtsDivide32

									.aent	_OtsDivide32

									# Entry: r16 = dividend, r17 = divisor, r26 = return address.

									# Exit:  r0 = dividend / divisor, truncated toward zero, with no

									#        overflow detection.

									# Writes r0, r1, r16, r17, r18, r27 and r28; the di_lrgdiv path also

									# writes r19 and runs div32.

									# A positive divisor is handled directly.  A negative divisor is

									# turned into the positive case by negating both operands and

									# looping back to di_retry; 0 goes to divzer and 0x80000000 is

									# answered by a compare.

								_OtsDivide32:

								#ifdef	VMS

									# NOTE(review): ots_div_i is not defined in the visible part of this

									# file (the label here is _OtsDivide32) - confirm for VMS.

									ldq	r27, <ots_div_addr-ots_div_i>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									# di_skip:  target of _OtsDivide32Overflow, which has already loaded r27.

									# di_retry: re-entry point after both operands have been negated.

								di_skip:

								di_retry:

									lda	r28, -table_max(r17)	# test for table lookup

									ble	r17, di_notpos		# not a positive divisor case

									addq	r17, r17, r0		# compute divisor*2

									negq	r16, r18		# part 1 of abs(dividend) -> r18.  (Note 0xffffffff 80000000 => 0x00000000 80000000)

									bgt	r28, di_lrgdiv		# branch if large divisor

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									# abs(dividend) has to be complete before the compare.  With only

									# the negated value in r18, a positive dividend looks like a huge

									# unsigned number and the zero-quotient exit below is never taken.

									cmovge	r16, r16, r18		# part 2 abs. val of the dividend -> r18

									cmpule	r17, r18, r0		# divisor <= abs(dividend)?

									beq	r0, di_end		# if not, result is zero (r0 is already 0)

									ldq	r1, recip32_o(r27)	# load approximate reciprocal

									blt	r1, di_smpwr2		# go handle powers of 2 specially (their table entry is negative)

									umulh	r18, r1, r0		# start multiplication

									blt	r16, di_negres		# negate result? (done as a branch to allow umulh to "hang out" over end for common case)

								di_end:	ret	r31, (r26)		# return for same-sign (common) case

								di_negres:

									negl	r0, r0			# different signs - complement result

									ret	r31, (r26)		# return for different-sign (uncommon) case

								di_smpwr2:

									srl	r18, r1, r18		# shift the result into place (srl uses only the low 6 bits of r1)

									sra	r16, 63, r16		# get 0/-1 based on sign of dividend

									xor	r18, r16, r18		# conditionally complement

									subl	r18, r16, r0		# and increment for the final value

									ret	r31, (r26)		# (note subl is required for sign ext for %x80000000/1 case)


									# Zero or negative divisor case.  If just a negative divisor,

									# complement both dividend and divisor and do things again.

								di_notpos:

									beq	r17, divzer		# division by zero

									negl	r17, r17		# |divisor|, note that 0x80000000 still appears negative

									negq	r16, r16		# complement dividend (negq so that 0xffffffff 80000000 => 0x00000000 80000000

									bgt	r17, di_retry		# dispatch back for normal case (not 0x80000000 or 0)

									sextl	r16, r16		# back to longword form so it can be compared with the divisor

									cmpeq	r16, r17, r0		# -maxint/-maxint = 1, all others 0

									ret	r31, (r26)		# done


									# Large divisor for signed 32/32 case

									#

									nop	#.align	3

								di_lrgdiv:

									sra	r16, 63, r19		# get 0/-1 based on sign of dividend

									cmovlt	r16, r18, r16		# r16 = abs(dividend); r18 holds -dividend from the negq at di_retry

									bsr	r28, div32		# go use core routine

									xor	r0, r19, r0		# conditionally complement

									subl	r0, r19, r0		# and increment for the final value (subl ensures normalized result)

									ret	r31, (r26)		# done


								 # Large divisor case core routine for 32b

								 # (wbn)

								 #

								 #   r0	- quotient (output)

								 #   r1	- remainder (output)

								 #   r16	- dividend (range 0..2^32-1, zero extended)

								 #   r17 - divisor (range 1..2^31-1 - overwritten)

								 #   r18	- scratch

								 #   r19	- not used (one temp for 'caller')

								 #   r26 - not used (expected to contain main return address)

								 #  [r27 - scratch] (not currently written)

								 #   r28	- this "subroutine" return address

								 #

								 # Some tightened up bit-at-a-time code for dividing 32-bit integers.

								 # It uses two tricks: keep the running remainder and the quotient in

								 # the same 64-bit register (MQ?), and add 1 while subtracting the divisor,

								 # so that a single CMOV sets both the new remainder and the new quotient.

								 # I start off by trying to skip 8 bits at a time; should this skip a

								 # smaller amount, so the main loop iterates less often?  If the divisor

								 # is already known to be large enough, the last case in this test is never

								 # used...

								 #

								 # This code expects a zero-extended dividend in the range 0 <= x < 2^32

								 # and a divisor in the range 0 < x < 2^31 (that is, the divisor cannot be

								 # a general unsigned longword, and no sign manipulation is included.)

								 #

								 # The code here takes about 34n+11 cycles for a quotient occupying n bytes.

								 #

								 # Inputs: dividend in r16, divisor in r17

								 # Outputs: quotient in r0, remainder in r1

								 # Destroys: [r16,]r17,r18,[r27]

								 #

									# How many quotient bytes will there be: 0, 1, 2, 3, 4?

									#

									#.align	4

								div32:	cmpule	r17, r16, r0		# Divisor leq dividend?

									sll	r17, 32, r18		# Position divisor for loop

									sll	r17, 8, r1		# Prepare for next compare

									beq	r0, d32end		# Dividend less, quotient is zero.

									# ediv32 is a second way in: div64 branches here once it has set up

									# r18 and r1 the same way and knows the divisor is leq the dividend.

									# From here on r17 no longer holds the divisor.  It first becomes the

									# byte index handed to extqh (5, 6, 7 or 8) and then the loop counter.

								ediv32:	mov	8-3, r17		# Hope to skip 3 bytes of loop

									cmpule	r1, r16, r0		# Shifted divisor still leq dividend?

									sll	r1, 8, r1		# Prepare for next compare

									beq	r0, d32ent		# Go loop over just one byte

									mov	8-2, r17		# Hope to skip 2 bytes of loop

									cmpule	r1, r16, r0		# Shifted divisor still leq dividend?

									sll	r1, 8, r1		# Prepare for next compare

									beq	r0, d32ent		# Go loop over just two bytes

									mov	8-1, r17		# Hope to skip 1 byte of loop

									cmpule	r1, r16, r0		# Shifted divisor still leq dividend?

									nop				# stall - align d32ent and d32loop

									cmovne	r0, 8, r17		# If we can't skip any bytes


									# start loop generating quotient bits.  NOTE: The loop setup requires

									# an even number of iterations.

									#

									# Counter arithmetic: r17 = 5, 6, 7 or 8 becomes 8*r17-34 = 6, 14, 22

									# or 30.  Each pass of d32loop performs two subtract steps and takes

									# 2 off the counter, and two more steps follow the loop, so 8, 16, 24

									# or 32 quotient bits are produced in total.

									#

								d32ent:	extqh	r16, r17, r0		# Shift dividend left for skipped bytes

									subq	r18, 1, r1		# Divisor in high LW - 1 in low LW

									s8subq	r17, 34, r17		# Convert bytes to bits and adjust


									addq	r0, r0, r0		# Shift left to start first iteration

									# r0 holds the running remainder in its high longword and the

									# quotient bits in its low longword.  Subtracting r1 takes the

									# divisor off the remainder and adds 1 to the quotient in one step.

								d32loop:subq	r0, r1, r18		# Can we subtract divisor from it?

									cmovge	r18, r18, r0		# If so, set new remainder & quotient

									# stall

									addq	r0, r0, r0		# Shift remainder and quotient left

									subq	r0, r1, r18		# Can we subtract divisor from it?

									cmovge	r18, r18, r0		# If so, set new remainder & quotient

									subq	r17, 2, r17		# Loop counter

									addq	r0, r0, r0		# Shift remainder and quotient left

									bgt	r17, d32loop		# Repeat

									subq	r0, r1, r18		# Can we subtract divisor from it?

									cmovge	r18, r18, r0		# If so, set new remainder & quotient

									# stall

									addq	r0, r0, r0		# Shift remainder and quotient left

									subq	r0, r1, r18		# Finish last iteration

									cmovge	r18, r18, r0

									# stall

									srl	r0, 32, r1		# Get remainder in r1

									zap	r0, 0xf0, r0		# Keep only quotient in r0

									nop	# for alignment

									# Both paths meet here.  The early exit arrives with r0 = 0, so the

									# dividend becomes the remainder.  The loop path only runs when the

									# divisor is leq the dividend, so its quotient is nonzero and the

									# cmoveq leaves r1 alone.

								d32end:	cmoveq	r0, r16, r1		# Move remainder to r1 for quotient=0 case

									ret	r31, (r28)		# Not a real software procedure return


								 # Large divisor case core routine for 64b

								 #

								 #   r0	- quotient (output)

								 #   r1	- remainder (output)

								 #   r16	- dividend (range 0..2^64-1 - overwritten)

								 #   r17 - divisor (range 1..2^63-1 - overwritten)

								 #   r18	- scratch

								 #   r19 - not used (one temp for 'caller')

								 #   r26 - not used (expected to contain main return address)

								 #   r27	- points to table of inverses (overwritten)

								 #   r28	- this "subroutine" return address

								 #

								 # Inputs: dividend in r16, divisor in r17

								 # Outputs: quotient in r0, remainder in r1

								 # Destroys: r16,r17,r18,r27

								 #

								 # Note- this routine could save a few cycles if we could use

								 # another scratch register -- perhaps by pushing one on the stack?

								 #

									#.align	4

								div64:	sll	r17, 32, r18		# Position for ediv32

									cmpule	r17, r16, r0		# Is divisor leq dividend?

									srl	r17, 31, r1		# Is divisor geq 2^31?

									 beq	r0, d64end		# If divisor > dividend, quotient=0

									cmpule	r18, r16, r0		# Is divisor*2^32 leq dividend?

									sll	r17, 8, r1		# Position for ediv32 checking

									or	r1, r0, r0		# 0 if divisor & quotient fit in 32 bits

									 beq	r0, ediv32		# Use 32-bit routine if OK


								 # Full 64-bit divide needed.  Use the table of shift amounts to compute

								 # the number of leading zero bits in the divisor.  Find the leftmost

								 # nonzero byte, then the leftmost nonzero bit in that byte.  Table entry

								 # #n+1 contains the number of bits needed to hold n (1..8).  We know the

								 # divisor is nonzero here.

								 #

									cmpbge	r31, r17, r0		# Get a zero bit for each nonzero byte

									#stall

									sll	r0, 4, r0		# *16 bytes per table entry

									#stall

									subq	r27, r0, r0		# table base plus complement...

									#stall

									ldq	r1, 256*16(r0)		# get position of first nonzero

									#2 stalls

									subq	r1, 1, r1		# byte number of first nonzero

									extbl	r17, r1, r0		# get first nonzero byte

									#stall

									addq	r0, r0, r0		# *2

									s8addq	r0, r27, r0		# *16 bytes per table entry

									#stall

									ldq	r0, 16(r0)		# bit number of first nonzero

									negq	r1, r1			# 1 + #leading zero bytes (mod 8)

									#stall

									s8subq	r1, r0, r0		# number of leading zero bits

									and	r0, 0x3F, r0		# discard other junk


								 # The following code does a similar normalize calculation without the table.

								 #===

								 #	extll	r17, #4, r18		; Normalize the divisor and

								 #	mov	#63, r0			; count leading zeros

								 #	cmovne	r18, #31, r0

								 #	cmoveq	r18, r17, r18

								 #	;stall

								 #	extwl	r18, #2, r1

								 #	;stall

								 #	cmovne	r1, r1, r18

								 #	cmovne	r1, #16, r1

								 #	;stall

								 #	subq	r0, r1, r0

								 #	extbl	r18, #1, r1

								 #	;stall

								 #	cmovne	r1, r1, r18

								 #	cmovne	r1, #8, r1

								 #	;stall

								 #	subq	r0, r1, r0

								 #	andnot	r18, #^x0f, r1

								 #	cmovne	r1, r1, r18

								 #	cmovne	r1, #4, r1

								 #	;stall

								 #	subq	r0, r1, r0

								 #	andnot	r18, #^x33, r1

								 #	cmovne	r1, r1, r18

								 #	cmovne	r1, #2, r1

								 #	;stall

								 #	subq	r0, r1, r0

								 #	andnot	r18, #^x55, r1

								 #	cmovne	r1, #1, r1

								 #	;stall

								 #	subq	r0, r1, r0

								 #===


								 # R0 contains number of leading zero bits in the divisor.


									sll	r17, r0, r17		# Normalize: MSB is set.


									# Now break divisor into pieces a+x, where a is the leading

									# 9 bits, rounded, and x is the rest.  Use a linear

									# approximation for 1/divisor = 1/a - x/a^2 [+ x^2/a^3 -...]

									#

									srl	r17, 64-10, r1		# Keep 10 bits of divisor

									#stall

									addq	r1, 1, r1		# Round to form 'a'

									andnot	r1, 1, r1

									s8addq	r1, r27, r27		# Index table of 1/a and 1/a^2

									sll	r1, 64-10, r1		# shift 'a' to match divisor

									ldq	r18, (r27)		# Load QW containing 1/a^2

									subq	r1, r17, r1		# -x = a - divisor

									beq	r1, d64_easy		# Use table directly if x=0

									inswl	r18, 6, r18		# position 1/a^2

									blt	r1, d64_sub		# correct for sign of -x

									umulh	r1, r18, r1		# -x/a^2

									ldq	r27, 8(r27)		# Load QW containing 1/a - 1

									br	r31, d64_cont


								d64_sub:umulh	r1, r18, r1		# -x/a^2

									ldq	r27, 8(r27)		# load QW containing 1/a - 1

									# 2 stalls

									s4addq	r18, 0, r18

									subq	r27, r18, r27		# correct for sign of -x

								d64_cont:

									# many stalls

									s4addq	r1, r27, r18		# 1/divisor approx= 1/a - x/a^2


									# Now one or two Newton iterations to get 24 or 56 good bits of the inverse.

									# Each computes  inv = inv * (2 - inv*divisor).  We could skip out early

									# here or above if the dividend and/or quotient is small enough for the

									# amount of precision we've developed...

									#

									# We handle quadwords with the radix point on the left.  The divisor has

									# been normalized to the range 0.5 < divisor < 1.0; the inverses are in

									# the range 1.0 < inverse < 2.0, and are represented without the leading 1.

									#

									umulh	r18, r17, r1		# (inv0 - 1) * divisor

									# many stalls

									addq	r1, r17, r1		# add hidden bit * divisor

									negq	r1, r1			# 2 - inv0*divisor, very near 1.0

									umulh	r18, r1, r27		# (inv0 - 1) * (2 - inv0*divisor)

									cmovlt	r1, 0, r18		# keep inv0 if (2-inv0*divisor) > 1.0

									#stall

									addq	r18, r1, r1		# add it to hidden bit * (2-inv0*divisor)

									# many stalls

									addq	r27, r1, r18		# inv1 = inv0 * (2 - inv0*divisor)


									umulh	r18, r17, r1		# (inv1 - 1) * divisor

									# many stalls

									addq	r1, r17, r1		# add hidden bit * divisor

									negq	r1, r1			# 2 - inv1*divisor, very near 1.0

									umulh	r18, r1, r27		# (inv1 - 1) * (2 - inv1*divisor)

									cmovlt	r1, 0, r18		# keep inv1 if (2-inv1*divisor) > 1.0

									addq	r18, r1, r1		# add it to hidden bit * (2-inv1*divisor)

									# many stalls

									addq	r27, r1, r1		# inverse = inv1 * (2 - inv1*divisor)

									umulh	r1, r16, r18		# dividend * (1/divisor - 1)

									srl	r17, r0, r17		# un-normalize divisor

									negq	r0, r0

									subq	r0, 8, r0		# how far right after first byte

									# many stalls

									addq	r18, r16, r18		# add hidden bit * dividend

									cmpult	r18, r16, r1		# did it carry?

									srl	r18, 8, r18		# start to shift

									sll	r1, 56, r1		# position the carry

									#stall

									addq	r1, r18, r1		# add the carry

									srl	r1, r0, r0		# final shift

									mulq	r17, r0, r1		# try out this quotient

									# many stalls

									subq	r16, r1, r1		# form remainder

									cmpule	r17, r1, r18		# must be less than divisor

									subq	r1, r17, r27

									cmovne	r18, r27, r1		# if not, decrement remainder

									addq	r0, r18, r0		# and increment quotient

									ret	r31, (r28)		# done


								d64_easy:

									ldq	r1, 8(r27)		# get 1/divisor, except hidden bit

									srl	r17, r0, r17		# un-normalize divisor again

									blt	r18, d64_pow2		# skip if power of 2

									umulh	r1, r16, r18		# dividend/divisor

									negq	r0, r0			# how far right to shift

									and	r16, 0x0ff, r1		# pieces of dividend

									subq	r0, 8, r0		# how far right after first byte

									srl	r16, 8, r27

									# many stalls

									addq	r18, r1, r1		# add low piece of dividend, no carry

									srl	r1, 8, r1		# make room for high piece

									#stall

									addq	r1, r27, r1		# finish adding hidden bit * dividend

									srl	r1, r0, r0		# final shift

									mulq	r17, r0, r1		# need to compute remainder too

									# many stalls

									subq	r16, r1, r1

									ret	r31, (r28)		# done


								d64_pow2:

									not	r0, r0			# how far right to shift quotient

									subq	r17, 1, r1		# mask for remainder

									srl	r16, r0, r0		# shift for quotient

									and	r16, r1, r1		# get remainder

									ret	r31, (r28)		# done


								d64end:	mov	r16, r1			# Remainder to r1 for small quotient case

									  ret	r31, (r28)		# Not a real software procedure return


								 # long ots_div_l_o(long dividend, long divisor)

								 # signed 64 bit division support, overflow detection

								 #

								 # In:  r16 = dividend, r17 = divisor.  Out: r0 = quotient.  Returns via r26.

								 # Only a divisor of -1 is handled here, by negating the dividend with the

								 # overflow-checking negqv.  Every other divisor branches to dl_skip, which

								 # lies inside the non-overflow entry after that entry's own load of r27.

								 #

									#.align	4

									.globl	_OtsDivide64Overflow

									.aent	_OtsDivide64Overflow

								_OtsDivide64Overflow:

								#ifdef	VMS

									ldq	r27, <ots_div_addr-ots_div_l_o>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									not	r17, r1			# r1 = ~divisor, which is zero only when the divisor is -1

									bne	r1, dl_skip		# divisor != -1: continue in the common signed-divide code

									negqv	r16, r0			# q = -dividend, overflow on ^x80000000 00000000

									ret	r31, (r26)

									nop				# NOTE(review): presumably padding ahead of the next entry point - confirm


								 # long ots_div_l(long dividend, long divisor)

								 # signed 64 bit division support, no overflow detection

								 #

								 # In:  r16 = dividend, r17 = divisor.  Out: r0 = quotient.  Returns via r26.

								 # Positive divisors up to table_max use the reciprocal table addressed

								 # through r27; larger divisors call the div64 core routine with the return

								 # address in r28.  Negative divisors negate both operands and retry.

								 # Scratch registers written here: r0, r1, r18, r19, r27, r28.

								 #

									.globl	_OtsDivide64

									.aent	_OtsDivide64

								_OtsDivide64:

								#ifdef	VMS

									ldq	r27, <ots_div_addr-ots_div_l>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

								 # dl_skip is also the branch target of the overflow-checking entry above.

								dl_skip:

									xor	r16, r17, r19		# sign bit = result needs to be complemented (done here, before any negation, to handle -maxint correctly)

								dl_retry:

									lda	r28, -table_max(r17)	# r28 = divisor - table_max, tested below for table lookup

									ble	r17, dl_notpos		# not a positive divisor case

									addq	r17, r17, r0		# compute divisor*2

									negq	r16, r18		# part 1 of abs(dividend) -> r18 (r18 = -dividend for now)

									bgt	r28, dl_lrgdiv		# branch if large divisor (divisor > table_max)

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									srl	r16, 33, r1		# non-zero if any of bits 33..63 are set (always so for a negative dividend)

									# NOTE(review): the compare below reads r18 while it still holds -dividend;

									# abs() is only completed by the cmovge after the branch.  For a positive

									# non-zero dividend r18 is a huge unsigned value, so the "result is zero"

									# fast exit is skipped and the reciprocal path is used instead.

									# Confirm whether this scheduling is intentional.

									cmpule	r17, r18, r0		# r0 = (divisor <= r18), unsigned

									bne	r1, dl_64bit		# does this need to be a real 64 bit case?

									cmovge	r16, r16, r18		# part 2 abs. val of the dividend -> r18

									beq	r0, dl_end		# compare was false: return the 0 already in r0

									ldq	r27, recip32_o(r27)	# load 32b approximate reciprocal

									sra	r19, 63, r19		# get 0/-1

									blt	r27, dl_smpwr2		# sign bit set in the entry: power of 2, skip the umulh (r27 is then used as the shift count)

									umulh	r18, r27, r0		# start multiplication

									beq	r19, dl_end		# if complement not required, let umulh "hang out"

									negq	r0, r0			# complement case

									ret	r31, (r26)		#

								dl_64bit:

									cmovge	r16, r16, r18		# part 2 abs. val of the dividend -> r18

									beq	r0, dl_end		# compare was false: return the 0 already in r0

									ldq	r1, recip64_o(r27)	# load approximate reciprocal

									sra	r19, 63, r19		# get 0/-1

									ldq	r27, shift_o(r27)	# load shift count (low 6 bits are all that matters)

									beq	r1, dl_smpwr2		# zero reciprocal entry: power of 2, skip the umulh

									umulh	r18, r1, r0		# start multiplication

									addq	r0, r18, r18		# add hidden bit

								dl_smpwr2:

									srl	r18, r27, r18		# shift the result into place

									xor	r18, r19, r18		# conditionally complement

									subq	r18, r19, r0		# and increment for the final value

								dl_end:	ret	r31, (r26)		# return with the quotient in r0


									# Zero or negative divisor case.  If just a negative divisor,

									# complement both dividend and divisor and do things again.

									# r19 was computed from the original signs and is left alone.

								dl_notpos:

									beq	r17, divzer		# division by zero

									negq	r17, r17		# |divisor|, note that 0x80000000 00000000 still appears negative

									negq	r16, r16		# complement dividend

									bgt	r17, dl_retry		# dispatch back for normal case (not 0x80000000 00000000 or 0)

									cmpeq	r16, r17, r0		# -maxint/-maxint = 1, all others 0

									ret	r31, (r26)		# done


									# Large divisor for signed 64/64 case

									#

								dl_lrgdiv:

									sra	r19, 63, r19		# get 0/-1

									cmovlt	r16, r18, r16		# finish abs(dividend) in r16 (r18 = -dividend)

									bsr	r28, div64		# go use core routine

									xor	r0, r19, r0		# conditionally complement

									subq	r0, r19, r0		# and increment for the final value

									ret	r31, (r26)		# done


								 # long ots_rem_l(long dividend, long divisor)

								 # signed 64 bit remainder support

								 #

								 # In:  r16 = dividend, r17 = divisor.  Out: r0 = remainder.  Returns via r26.

								 # The work is done on abs(dividend) and abs(divisor); the result is then

								 # negated when the original dividend was negative (r19 = 0 or -1).

								 # Scratch registers written here: r0, r1, r18, r19, r27, r28.

								 #

									#.align	4

									.globl	_OtsRemainder64

									.aent	_OtsRemainder64

								_OtsRemainder64:

								#ifdef	VMS

									ldq	r27, <ots_div_addr-ots_rem_l>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									negq	r17, r18		# first part of abs(divisor)

									cmovlt	r17, r18, r17		# second part of abs(divisor)

									subq	r17, 1, r1		# start checking for power of 2 (r1 = abs(divisor)-1, reused as the mask)

									and	r17, r1, r0		# finish check for power of 2

									sra	r16, 63, r19		# get -1/0 if dividend was negative

									negq	r16, r18		# first part of abs(dividend)

									cmovlt	r16, r18, r16		# second part of abs(dividend)

									beq	r0, rl_pwr2		# for powers of two, simply do a mask (this test also catches 0 and 80000000 00000000)

									lda	r28, -table_max(r17)	# test for table lookup

									bgt	r28, rl_lrgdiv		# branch if large divisor

									addq	r17, r17, r0		# compute divisor*2 for table lookup

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									ldq	r1, recip64_o(r27)	# load approximate reciprocal

									ldq	r18, shift_o(r27)	# load shift amount

									umulh	r16, r1, r0		# multiplication for division step

									addq	r0, r16, r0		# add hidden bit

									srl	r0, r18, r0		# r0 = quotient of the absolute values

									mulq	r0, r17, r0		# multiply back to get value to subtract

									subq	r16, r0, r0		# get abs of final result

									xor	r0, r19, r0		# start complement if original dividend was <0

									subq	r0, r19, r0		# finish complement

									ret	r31, (r26)		# and return


									# Handle powers of 2, including 0 and 80000000 00000000.

									# r16 already holds abs(dividend) and r1 holds abs(divisor)-1; both were

									# computed before the branch that reaches this label, so abs() is not redone.

								rl_pwr2:

									and	r16, r1, r0		# use the divisor-1 mask in r1

									beq	r17, divzer		# division by zero

									xor	r0, r19, r0		# start complement if original dividend was <0

									subq	r0, r19, r0		# finish complement

									ret	r31, (r26)


								rl_lrgdiv:

									bsr	r28, div64		# use the core routine getting the remainder in r1

									xor	r1, r19, r0		# start complement if original dividend was <0

									subq	r0, r19, r0		# finish complement

									ret	r31, (r26)


								 # long ots_mod_l(long dividend, long modulus)

								 # signed 64 bit modulus support

								 #

								 # This entry could be MUCH more optimized.  It doesn't even try to use

								 # UMULH division currently...  (A casualty of time-to-market.)

								 # Note that mod is only used by Ada and PL/I.

								 #

								 # In:  r16 = dividend, r17 = modulus (divisor).  Out: r0 = result.

								 # Returns via r26.  A non-zero result carries the sign of the divisor:

								 # the remainder of the absolute values is negated and/or biased by the

								 # original divisor according to the fix-up word kept in r19.

								 # Scratch registers written here: r0, r1, r18, r19, r27, r28.

								 #

									#.align	4

									.globl	_OtsModulus64

									.aent	_OtsModulus64

								_OtsModulus64:

								#ifdef	VMS

									# The offset is taken from this entry's own label, as every other entry

									# point in this file does, since r27 is used as the base of the load.

									ldq	r27, <ots_div_addr-ots_mod_l>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									negq	r17, r18		# first part of abs(divisor)

									cmovge	r17, r17, r18		# second part of abs(divisor), result in r18

									subq	r18, 1, r1		# start checking for power of 2 (r1 doubles as the mask for ml_p2)

									beq	r17, divzer		# check for 0

									and	r18, r1, r0		# second part of power-of-2 check

									beq	r0, ml_p2		# for powers of two, simply do a mask

													# (note that the power-of-2 case MUST be used to handle

													# the -maxint case due to the way the fix-up info is

													# saved across the core routine call)

									xor	r16, r17, r28		# get xor of signs

									clr	r19			# don't need a bias if dividend and divisor have same sign

									cmovlt	r28, r17, r19		# bias is original divisor for different sign case

									and	r16, r17, r28		# sign bit set if both dividend & divisor were neg. (need to negate result)

									mov	r18, r17		# move abs(divisor) into r17

									negq	r16, r18		# first part of abs(dividend)

									cmovlt	r16, r18, r16		# second part of abs(dividend)

									cmplt	r28, r31, r0		# get 1 if both operands were <0

									sll	r0, 63, r0		# get bit as the high bit

									bis	r0, r19, r19		# and MERGE with bias (0 -> no fixup, -maxint -> negate result,

													# divisor > 0 - subtract remainder if non-zero, divisor < 0 -

													# add remainder if non-zero)

									bsr	r28, div64		# use the core routine getting the remainder in r1

									cmoveq	r1, r31, r19		# don't do any fix-up if the remainder was zero

									addq	r19, r19, r18		# check to see if this is the negative/negative case, which just gets a negated remainder

									subq	r19, 1, r28		# wrap -maxint to positive

									negq	r1, r0			# move negated value, may be replaced below

									cmovlt	r28, r1, r0		# if both positive, or negative divisor, keep positive remainder

									cmoveq	r18, r31, r19		# now that negation is done, treat -maxint case as 0

									addq	r19, r0, r0		# add any bias (original divisor or 0)

									ret	r31, (r26)		# and return


								 # Power-of-2 modulus: r17 = original divisor, r1 = abs(divisor)-1.

								ml_p2:	cmovge	r17, r31, r17		# no bias if divisor was >= 0

									and	r16, r1, r1		# use the divisor-1 mask that's already in r1

									cmoveq	r1, r31, r17		# use zero if result was zero

									addq	r17, r1, r0		# do any biasing

									ret	r31, (r26)		# and return


								 # unsigned long ots_div_ul(unsigned long dividend, unsigned long divisor)

								 # unsigned 64 bit division support

								 #

								 # In:  r16 = dividend, r17 = divisor.  Out: r0 = quotient.  Returns via r26.

								 # Divisors with the sign bit set are settled by one unsigned compare,

								 # divisors up to table_max use the reciprocal table, and the rest call

								 # the div64 core routine with the return address in r28.

								 # Scratch registers written here: r0, r1, r16, r18, r19, r27, r28.

								 #

									nop	#.align	4

									.globl	_OtsDivide64Unsigned

									.aent	_OtsDivide64Unsigned

								_OtsDivide64Unsigned:

								#ifdef	VMS

									ldq	r27, <ots_div_addr-ots_div_ul>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									lda	r28, -table_max(r17)	# r28 = divisor - table_max, tested below for table lookup

									blt	r17, dul_big		# big divisors can (must) be handled by a simple comparison

									addq	r17, r17, r18		# compute divisor*2

									# NOTE(review): the original comment spoke of "31 bit dividends", but a

									# shift of 33 leaves r19 zero for any dividend below 2^33 - confirm intent.

									srl	r16, 33, r19		# non-zero if any of dividend bits 33..63 are set (fast path not usable)

									beq	r17, divzer		# check for 0

									s8addq	r18, r27, r18		# finish computing table entry addr (table addr+divisor*16)

									bgt	r28, dul_lrgdiv		# branch if large divisor

									cmpule	r17, r16, r0		# r0 = 1 if divisor <= dividend, else 0

									bne	r19, dul_64bit		# if the dividend is too wide for the fast path, use the larger umulh form

									ldq	r27, recip32_o(r18)	# load approximate 32b reciprocal & shift count

									beq	r0, dul_end		# fast out for divisor > dividend (r0 is already 0)

									blt	r27, dul_smpwr2		# sign bit set in the entry: go handle powers of 2 specially

									umulh	r16, r27, r0		# 32b recip

									ret	r31, (r26)		#


									# the 64 bit case is at a disadvantage to the 32b case because it needs

									# a fix-up at the end, which prevents the latency of the umulh from

									# being partially absorbed by the procedure return and anything that

									# immediately follows that doesn't interlock.

									nop

								dul_64bit:

									ldq	r1, recip64_o(r18)	# load approximate 64b reciprocal

									ldq	r27, shift_o(r18)	# load shift count (low 6 bits are all that matters)

									beq	r0, dul_end		# fast out for divisor > dividend (r0 is already 0)

									beq	r1, dul_smpwr2		# zero reciprocal entry: go handle powers of 2 specially

									umulh	r16, r1, r0		# start multiplication for division step

									zap	r16, 0x0f, r18		# split dividend into two parts: r18 = upper four bytes

									zapnot	r16, 0x0f, r16		# r16 = lower four bytes

									srl	r18, r27, r18		# position the high part

									addq	r0, r16, r0		# add hidden * low dividend (no carry)

									srl	r0, r27, r0		# shift into place

									addq	r0, r18, r0		# add hidden * high dividend

									ret	r31, (r26)


								dul_smpwr2:

									srl	r16, r27, r0		# shift the result into place

								dul_end: ret	r31, (r26)		# return with the quotient in r0


								dul_lrgdiv:

									bsr	r28, div64		# use the core routine

									ret	r31, (r26)


									# divisor with the sign bit set.  two possible results,

									# 1 if divisor <= dividend, or 0 otherwise

								dul_big:

									cmpule	r17, r16, r0

									ret	r31, (r26)


								 # long unsigned ots_rem_ul(long unsigned dividend, long unsigned divisor)

								 # unsigned 64 bit remainder support

								 #

								 # In:  r16 = dividend, r17 = divisor.  Out: r0 = remainder.  Returns via r26.

								 # Divisors with the sign bit set need at most one subtraction, divisors

								 # above table_max call the div64 core routine (return address in r28),

								 # small powers of 2 are masked, and the rest use the reciprocal table.

								 # Scratch registers written here: r0, r1, r18, r19, r27, r28.

								 #

									#.align	4

									.globl	_OtsRemainder64Unsigned

									.aent	_OtsRemainder64Unsigned

								_OtsRemainder64Unsigned:

								#ifdef	VMS

									ldq	r27, <ots_div_addr-ots_rem_ul>(r27)# start loading address of division data area

								#endif

								#ifdef	OSF

									ldgp	gp, 0(r27)		# load the global pointer

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# start loading address of the division data area

								#endif

								#ifdef	WNT

									.frame	sp, 0, r26

									lda	r27, _OtsDivData	# load the division data table address

								#endif

									lda	r28, -table_max(r17)	# r28 = divisor - table_max, tested below for table lookup

									subq	r17, 1, r1		# first part of power-of-2 check (r1 = divisor-1, reused as the mask)

									blt	r17, rul_big		# big divisors can (must) be handled by a simple comparison

									and	r17, r1, r18		# second part of power-of-2 check

									bgt	r28, rul_lrgdiv		# branch if large divisor (taken before the power-of-2 test)

									addq	r17, r17, r0		# compute divisor*2 for table lookup

									beq	r18, rul_pwr2		# if zero, divisor is a power of 2 (or 0)

									s8addq	r0, r27, r27		# finish computing table entry addr (table addr+divisor*16)

									ldq	r1, recip64_o(r27)	# load approximate reciprocal

									cmpult	r16, r17, r18		# is the dividend < divisor?

									bne	r18, rul_lss		# if so, fast exit

									ldq	r19, shift_o(r27)	# load the shift count

									umulh	r16, r1, r0		# multiplication for division step

									blt	r16, rul_carry		# careful handling if >= 2^63

									addq	r0, r16, r0		# add hidden bit * dividend

									srl	r0, r19, r0		# r0 = quotient

									mulq	r0, r17, r0		# multiply back to get value to subtract

									subq	r16, r0, r0		# remainder = dividend - quotient*divisor

									ret	r31, (r26)		# and return


								 # Dividend has its top bit set: add the hidden-bit term in two halves.

								rul_carry:

									zap	r16, 0x0f, r18		# split dividend into two parts: r18 = upper four bytes

									zapnot	r16, 0x0f, r1		# r1 = lower four bytes

									srl	r18, r19, r18		# position the high part

									addq	r0, r1, r0		# add hidden * low dividend (no carry)

									srl	r0, r19, r0		# shift into place

									addq	r0, r18, r0		# add hidden * high dividend

									mulq	r0, r17, r0		# multiply back to get value to subtract

									subq	r16, r0, r0		# remainder = dividend - quotient*divisor

									ret	r31, (r26)


								rul_pwr2:

									beq	r17, divzer		# check for 0

									and	r16, r1, r0		# use x-1 to mask

									ret	r31, (r26)


								rul_lss:

									mov	r16, r0			# dividend < divisor: the remainder is the dividend itself

									ret	r31, (r26)


									# divisors with the sign bit set.  two possible results,

									# dividend if dividend < divisor, or dividend-divisor otherwise

								rul_big:

									cmpult	r16, r17, r1		# r1 = 1 if dividend < divisor

									subq	r16, r17, r0		# assume dividend - divisor

									cmovne	r1, r16, r0		# but keep the dividend when it was smaller

									ret	r31, (r26)


									nop

								rul_lrgdiv:

									bsr	r28, div64		# use the core routine getting the remainder in r1

									mov	r1, r0			# return remainder as the result in r0

									ret	r31, (r26)


								 # Division-by-zero handling

								 #   (forward branch from all routines, out of the way here as well.)

								 #

								 # Loads the GEN_INTDIV code into r16, zeroes r0 and r1, raises the trap

								 # through the platform's gentrap mechanism, and then returns via r26 so

								 # that a caller which continues after the trap receives 0.

								 #

								divzer:	lda	r16, GEN_INTDIV(r31)	# load GENTRAP code for division by zero

									clr	r0			# return 0 for the result

									clr	r1			# also zero r1 (the core routine's remainder register)

								#ifdef	VMS

									gentrap				# signal the error

								#endif

								#ifdef	OSF

									call_pal PAL_gentrap

								#endif

								#ifdef	WNT

									# Since I couldn't find this in a header file anywhere for NT...

								#define PAL_gentrap 0xaa

									call_pal PAL_gentrap

								#endif

									ret	r31, (r26)		# return (in case someone tries to continue)


									.set at

									.set reorder

									.end	_OtsDiv