// Windows NT 4.0 source code leak
// TITLE("High-Performance Division")
// Copyright (c) 1993 Digital Equipment Corporation
// Module Name:
// fastdiv.s
// Abstract:
// This module implements a high-performance integer division routine
// whose source is included from each of the division and remainder
// functions.
// Author:
// Thomas Van Baak (tvb) 12-Jan-1993
// Ken Lesniak (lesniak) 04-Nov-1992
// Environment:
// Any mode.
// Revision History:
// Implementation Notes:
// This code is the main "guts" of the eight divide and remainder routines
// in the C library. It is intended to be included by a wrapper function
// that defines options to control 32 vs. 64-bit, division vs. remainder,
// and signed vs. unsigned. The wrapper function is responsible for the
// prologue and epilogue sequences, as well as overflow checking and sign
// adjustment on both input arguments and the return value.
// The algorithm used here is based on long division and a table for
// approximating inverses as discussed in the paper "Division by a Constant"
// by Mary Payne and Robert Gries. If the divisor inverse can be constructed
// from the table without an error, the division is performed as described
// in the paper with a multiplication and a shift.
// If the inverse cannot be found in the table, we improve the inverse with
// a linear approximation, "I". A multiplication by "I" and a shift by
// log2(y) is used to obtain an approximate quotient, "Q". Now, like long
// division, the most significant bits are correct, therefore if we
// calculate the remainder R = x-Q*y, R will be smaller than x, and R will
// contain the true remainder and the error in Q, "e" multiplied by y.
// So if we do the same multiplication and shift, we will get an
// approximation for e. This is just long division, and it will finish
// when R, the remainder, is less than y.
// Both the division algorithm code and the large division tables used by
// the code are contained in this source file (to keep them together).
// The code below requires that the wrapper function define register numbers
// for each of the following symbolic register names:
// Nu dividend (numerator)
// Di divisor (denominator)
// Qu quotient
// Re remainder (may be the same as Qu)
// T0 temp
// T1 temp
// T2 temp
// T3 temp
// T4 temp
// T5 temp
// T6 temp (may be shared with Qu)
// The code below requires that the wrapper function define the symbol
// FASTDIV_OPTIONS as the logical sum of the following names:
// Option flags for FASTDIV_OPTIONS. SIXTY_FOUR_BIT and SIGNED are both
// defined as 0, so they are the choices in effect when the THIRTY_TWO_BIT
// or UNSIGNED bit, respectively, is left clear.
#define THIRTY_TWO_BIT 1 // perform 32-bit computations, otherwise
#define SIXTY_FOUR_BIT 0 // perform 64-bit computations
#define UNSIGNED 2 // treat operands as unsigned, otherwise
#define SIGNED 0 // treat operands as signed
#define DIVISION 4 // return quotient in Qu register, and/or
#define REMAINDER 8 // return remainder in Re register
// Define the symbols that actually control the code generation.
// These constants give the algorithm the best performance.
//
// BIT_LENGTH + 1 is the number of leading bits of the normalized divisor
// that the short case tolerates: the code shifts the normalized divisor left
// by BIT_LENGTH + 1 and takes the long case if anything remains.
// BIT_VALUE + BIT_VALUE is subtracted from the short-case table index, and
// 16 * BIT_VALUE is subtracted from the long-case table displacement.
// BIT_MASK is not referenced by the code visible in this file.
#define BIT_LENGTH 8
#define BIT_VALUE (1 << BIT_LENGTH)
#define BIT_MASK (BIT_VALUE - 1)
// K_BIT_LENGTH is the number of additional divisor bits (below the table
// index bits) that the long case extracts with K_BIT_MASK and multiplies by
// the INV_M table value to refine the inverse.
// K_BIT_VALUE is only used here to derive K_BIT_MASK.
#define K_BIT_LENGTH 8
#define K_BIT_VALUE (1 << K_BIT_LENGTH)
#define K_BIT_MASK (K_BIT_VALUE - 1)
// Table addressing. The code keeps __2divdata + TABLE_BIAS in T3 and reaches
// each sub-table through one of the negative displacements below. Adding
// TABLE_BIAS back gives the byte offset from __2divdata itself:
//   LOG2TAB  -> 0     (read with ldl, index scaled by 4)
//   INV_FLAG -> 1024  (read one byte at a time via ldq_u/extbl)
//   INV      -> 1536  (read with ldq)
//   INV_M    -> 1544  (INV + 8, read with ldq from the same scaled index)
// NOTE(review): the reason for biasing the base pointer is not stated in
// this file - confirm before changing any of these values.
#define TABLE_BIAS 16384
#define LOG2TAB -16384
#define INV_FLAG -15360
#define INV -14848
#define INV_M -14840
// Entry: screen out the trivial cases, then compute AT = log2(divisor) and
// a normalized copy of the divisor (leading one bit moved to bit 63) in T0.
// AT is used as an ordinary working register until the matching ".set at".
.set noat
lda T3, __2divdata + TABLE_BIAS // set address of table
beq Di, 30f // if zero divisor, generate trap
cmpule Nu, Di, T0 // check if dividend <= divisor
// NOTE(review): a longword sign test and a quadword sign test appear back to
// back here. Presumably each was selected by a THIRTY_TWO_BIT / 64-bit
// conditional that is not visible in this copy - confirm against the
// original source.
addl Di, 0, T1 // sign extend longword to quadword
blt T1, 20f // high bit set so quotient <= 1
blt Di, 20f // high bit set so quotient <= 1
cmpbge zero, Di, T2 // perform 8 byte parallel compare
bne T0, 20f // dividend <= divisor so quotient <= 1
// T2 has bit i set for every byte i of the divisor that is zero. Inverting
// the low 8 bits gives a mask of the non-zero bytes, which indexes the
// longword table at LOG2TAB.
xor T2, 0xff, T0 // T0 = mask of non-zero divisor bytes
s4addq T0, T3, T0 // T0 = table base + 4 * mask
ldl AT, LOG2TAB(T0) // AT = LOG2TAB[mask] (used below as a byte number)
extbl Di, AT, T0 // T0 = divisor byte selected by AT
s4addq T0, T3, T0 // T0 = table base + 4 * that byte
ldl T0, LOG2TAB(T0) // T0 = LOG2TAB[byte]
s8addq AT, T0, AT // AT = 8 * byte number + LOG2TAB[byte] = log2(divisor)
// Shift instructions use only the low 6 bits of the count, so shifting by
// ~AT shifts by 63 - AT and moves the divisor's leading one bit to bit 63.
ornot zero, AT, T0 // T0 = ~AT
sll Di, T0, T0 // T0 = normalized divisor
sll T0, BIT_LENGTH + 1, T1 // T1 = normalized divisor bits below the top BIT_LENGTH + 1
bne T1, 40f // any such bits set: take the long case
// Short case.
// Reached when the normalized divisor has no bits set below its top
// BIT_LENGTH + 1 bits. The quotient is formed with a single multiply by a
// table-derived inverse followed by a shift right by AT = log2(divisor).
// On entry: T0 = normalized divisor, T3 = biased table base, AT = log2(Di).
subq Di, 1, T1 // compute divisor - 1
and Di, T1, T2 // divisor & (divisor - 1)
srl T0, 63 - (BIT_LENGTH + 1), T6 // T6 = top BIT_LENGTH + 2 bits of normalized divisor
beq T2, 10f // divisor is a power of two
subq T6, BIT_VALUE + BIT_VALUE, T6 // rebase T6 to a table index
bic T6, 1, T4 // T4 = index with low bit cleared
s8addq T4, T3, T4 // T4 = table base + 8 * even index
ldq T5, INV(T4) // T5 = quadword at INV for this index pair
ldq T4, INV_M(T4) // T4 = companion quadword at INV_M (INV + 8)
subq T5, T4, T4 // T4 = INV value - INV_M value
cmovlbs T6, T4, T5 // odd index: use the difference as the inverse
// Fetch the per-index byte at INV_FLAG. INV_FLAG is a multiple of 8, so the
// low three bits of T2 alone select the correct byte for extbl.
addq T6, T3, T2 // T2 = table base + index
ldq_u T0, INV_FLAG(T2) // T0 = aligned quadword containing the flag byte
extbl T0, T2, T2 // T2 = flag byte for this index
addq Nu, T2, T1 // T1 = dividend + flag byte
umulh T1, T5, T3 // T3 = high 64 bits of T1 * inverse
cmoveq T1, T5, T3 // if T1 is zero, use the inverse itself as the product
srl T3, AT, Qu // quotient = product >> log2(divisor)
mulq Qu, Di, T0 // T0 = quotient * divisor
subq Nu, T0, Re // remainder = dividend - quotient * divisor
br zero, 90f // all done, branch to epilogue code
// The divisor is now known to be a power of two.
// - The quotient is the dividend shifted right by exponent bits and
// the remainder is the low order exponent bits of the dividend.
// AT = log2(divisor)
// T1 = divisor - 1
// NOTE(review): the short case branches to local label 10 for this path, but
// no "10:" label is visible in this copy of the file. It presumably belongs
// on the first instruction below - confirm against the original source.
and Nu, T1, Re // remainder = dividend & (divisor - 1)
srl Nu, AT, Qu // quotient = dividend >> log2(divisor)
br zero, 90f // all done, branch to epilogue code
// The quotient is now known to be either 0 or 1.
// - if the high bit of the divisor is set, the quotient must be less
// than 2, so the quotient is 0 or 1.
// - if the dividend is less than the divisor, the quotient is 0 and
// the remainder is the dividend.
// - if the dividend is equal to the divisor, the quotient is 1 and
// the remainder is 0.
// - if the dividend is greater than the divisor, the quotient is 1
// and the remainder is the dividend minus the divisor.
// NOTE(review): the entry code branches to local label 20 for this path, but
// no "20:" label is visible in this copy of the file. It presumably belongs
// on the first instruction below - confirm against the original source.
// NOTE(review): Re is written before Qu here. The header allows Re and Qu
// to be the same register, in which case the final cmpule would overwrite
// the remainder - presumably conditionals not visible here select only the
// needed result. Confirm.
cmpule Di, Nu, T0 // if divisor <= dividend
subq Nu, Di, Re // then remainder is dividend - divisor,
cmoveq T0, Nu, Re // else remainder is dividend
cmpule Di, Nu, Qu // if divisor <= dividend, 1, else 0
br zero, 90f // all done, branch to epilogue code
// Generate divide by zero exception. If execution is continued, return
// a zero result.
// NOTE(review): the entry code branches to local label 30 when the divisor
// is zero, but neither a "30:" label nor any trap-raising instruction is
// visible in this copy of the file. Only the zero-result load and the branch
// to the epilogue remain - confirm against the original source.
// Only Qu is written here; Re is left as it was on entry.
ldiq Qu, 0 // supply 0 result if continued
br zero, 90f // branch to epilogue code
// Long case.
// Reached when the normalized divisor has bits set below its top
// BIT_LENGTH + 1 bits. An approximate inverse is built from the INV table
// value minus a correction proportional to the next K_BIT_LENGTH divisor
// bits, and used to form a first quotient estimate.
// On entry: T0 = normalized divisor, T3 = biased table base, AT = log2(Di).
// Register roles from here to the end of the long case:
//   T6 = approximate inverse      T4 = running quotient estimate
//   T5 = magnitude of remainder   T3 = non-zero when the estimate is too
//                                      large (true remainder is negative)
40: srl T0, 63-(BIT_LENGTH+K_BIT_LENGTH), T6 // T6 = top BIT_LENGTH + K_BIT_LENGTH + 1 bits
srl T6, K_BIT_LENGTH, T0 // T0 = upper bits: table index
and T6, K_BIT_MASK, T6 // T6 = lower K_BIT_LENGTH bits: interpolation factor
addq T0, T0, T0 // double the index ...
s8addq T0, T3, T1 // ... so T1 = table base + 16 * index
ldq T2, INV-(16*BIT_VALUE)(T1) // T2 = INV value; displacement rebases the index by BIT_VALUE
ldq T0, INV_M-(16*BIT_VALUE)(T1) // T0 = companion INV_M value
mulq T0, T6, T0 // scale the INV_M value by the low divisor bits
srl T0, K_BIT_LENGTH-1, T0 // and shift it back down
subq T2, T0, T6 // T6 = approximate inverse
umulh Nu, T6, T4 // high 64 bits of dividend * inverse
srl T4, AT, T4 // T4 = first quotient estimate
mulq T4, Di, T5 // T5 = low 64 bits of estimate * divisor
mov zero, T3 // assume the estimate is not too large
xor Nu, T5, T0 // compare the sign bits of dividend and product
bge T0, 50f // sign bits agree: skip the high-half check
// NOTE(review): no "50:" label is visible in this copy of the file. It
// presumably sits on the subq that follows the umulh, so that the branch
// above skips only the umulh - confirm against the original source.
umulh Di, T4, T3 // T3 = high 64 bits of estimate * divisor
subq Nu, T5, T5 // T5 = dividend - estimate * divisor (low 64 bits)
beq T4, 70f // estimate is zero: go straight to the final correction
cmpult Nu, T5, T0 // T0 = 1 if the subtraction borrowed
or T0, T3, T3 // T3 non-zero if the product exceeded the dividend
negq T5, T1 // T1 = -T5
cmovne T3, T1, T5 // if so, T5 = magnitude of the (negative) remainder
addq Di, Di, T0 // T0 = 2 * divisor
cmpule T0, T5, T0 // T0 = 1 if 2 * divisor <= remainder magnitude
beq T0, 70f // remainder already small: skip the refinement loop
// do {
// Each pass divides the current remainder magnitude (T5) by the divisor
// using the same approximate inverse (T6) and shift (AT), applies the
// resulting partial quotient to T4 with the sign indicated by T3, and
// recomputes the remainder magnitude and its sign flag.
60: umulh T5, T6, T2 // high 64 bits of remainder * inverse
srl T2, AT, T2 // T2 = partial quotient for this pass
negq T2, T0 // T0 = -partial quotient
cmoveq T3, T2, T0 // if T3 is zero use +partial quotient instead
addq T4, T0, T4 // adjust the running quotient
mulq T2, Di, T1 // T1 = partial quotient * divisor
subq T5, T1, T5 // T5 = remainder - partial quotient * divisor
ldiq T0, 1 // T0 = 1 ...
cmovne T3, 0, T0 // ... unless T3 was non-zero: T0 = inverted sign flag
negq T5, T2 // T2 = -T5
cmovlt T5, T0, T3 // new remainder negative: flip the sign flag
cmovlt T5, T2, T5 // and keep its magnitude in T5 (must follow the T3 update)
addq Di, Di, T0 // T0 = 2 * divisor
cmpule T5, T0, T0 // T0 = 1 if remainder magnitude <= 2 * divisor
cmovne T0, 0, T1 // if so clear T1 to end the loop
bne T1, 60b // repeat while T1 (partial quotient * divisor) is non-zero
// } while
// Final correction for the long case.
// On entry: T4 = quotient estimate, T5 = remainder magnitude,
// T3 = non-zero when the estimate is too large (true remainder negative).
// NOTE(review): the branch below targets local label 80, and earlier paths
// branch to local label 90, but neither label is visible in this copy of
// the file. "80:" presumably sits on the cmoveq after the first cmovne,
// and "90:" at the end of this sequence or in the wrapper - confirm against
// the original source.
// NOTE(review): the header allows Re and Qu to be the same register, in
// which case the Qu writes below would overwrite Re - presumably
// conditionals not visible here select only the needed result. Confirm.
70: cmpule Di, T5, T0 // T0 = 1 if divisor <= remainder magnitude
beq T0, 80f // remainder magnitude < divisor: no extra step
subq T5, Di, T5 // take one more divisor out of the remainder
subq T4, 1, T0 // T0 = quotient - 1
addq T4, 1, T4 // T4 = quotient + 1
cmovne T3, T0, T4 // estimate was too large: use quotient - 1 instead
cmoveq T5, 0, T3 // an exact (zero) remainder needs no sign fix-up
subq Di, T5, Re // remainder = divisor - magnitude (negative case)
cmoveq T3, T5, Re // flag clear: remainder = magnitude
subq T4, 1, Qu // quotient = estimate - 1 (negative case)
cmoveq T3, T4, Qu // flag clear: quotient = estimate
.set at
// The following data was machine generated. DO NOT EDIT MANUALLY.
.globl __2divdata
.align 4
.align 2
