title   adj_fdiv   - routines to compensate for incorrect Pentium FDIV
;*** 
;adj_fdiv - routines to compensate for incorrect Pentium FDIV
;
;   Copyright (c) 1994-2001, Microsoft Corporation. All rights reserved.
;
;Purpose:
;   Workarounds to correct for broken FDIV
;
;Revision History:
;
;   12/06/94	Jamie MacCalman
;		initial version, based on Intel fix
;   12/09/94	Jamie MacCalman
;		added _adj_fpremX & _safe_fdivX entry points
;   12/13/94	Jamie MacCalman
;		upgraded to V.3 of Intel's workarounds
;   12/19/94	Jamie MacCalman
;		upgraded to V.4 of Intel's workarounds
;   12/27/94	Jamie MacCalman
;		upgraded to V.5 (aka "V1.0") of Intel's workarounds
;    1/13/95	Jamie MacCalman
;		added underscores to fdivp_sti_st & fdivrp_sti_st for ANSI conformance
;
;  The following code is a PRELIMINARY IMPLEMENTATION of a
;  software patch for the floating point divide instructions.
;
;


	include cruntime.inc
	include mrt386.inc
	include elem87.inc

;
;  Stack variables for divide routines.
;

DENOM		EQU	0
NUMER		EQU	12
PREV_CW		EQU	28
PATCH_CW 	EQU	32

DENOM_SAVE	EQU	32

MAIN_DENOM	EQU	4
MAIN_NUMER	EQU	16

SPILL_SIZE	EQU	12
MEM_OPERAND	EQU	8
STACK_SIZE	EQU	44
SPILL_MEM_OPERAND	EQU	20

ONESMASK	EQU	0e000000h

SINGLE_NAN	EQU	07f800000h
DOUBLE_NAN	EQU	07ff00000h

ILLEGAL_OPC	EQU	6

;
; FPREM constants
;

FPREM_FLT_SIZE		EQU	12
FPREM_DENOM			EQU 0
FPREM_DENOM_SAVE	EQU	FPREM_DENOM + FPREM_FLT_SIZE
FPREM_NUMER			EQU FPREM_DENOM_SAVE + FPREM_FLT_SIZE
FPREM_PREV_CW		EQU FPREM_NUMER + FPREM_FLT_SIZE
FPREM_PATCH_CW		EQU FPREM_PREV_CW + 4
FPREM_SW			EQU	FPREM_PATCH_CW + 4
FPREM_STACK_SIZE	EQU FPREM_SW + 4
FPREM_RET_SIZE		EQU	4
FPREM_PUSH_SIZE		EQU	4

FPREM_MAIN_FUDGE	EQU	FPREM_RET_SIZE + FPREM_PUSH_SIZE + FPREM_PUSH_SIZE + FPREM_PUSH_SIZE

FPREM_MAIN_DENOM		EQU FPREM_DENOM + FPREM_MAIN_FUDGE
FPREM_MAIN_DENOM_SAVE	EQU	FPREM_DENOM_SAVE + FPREM_MAIN_FUDGE
FPREM_MAIN_NUMER		EQU FPREM_NUMER + FPREM_MAIN_FUDGE
FPREM_MAIN_PREV_CW		EQU	FPREM_PREV_CW + FPREM_MAIN_FUDGE
FPREM_MAIN_PATCH_CW		EQU	FPREM_PATCH_CW + FPREM_MAIN_FUDGE
FPREM_MAIN_FPREM_SW		EQU	FPREM_SW + FPREM_MAIN_FUDGE

FPREM_ONESMASK	EQU     700h


.data

fdiv_risc_table	DB	0, 1, 0, 0, 4, 0, 0, 7, 0, 0, 10, 0, 0, 13, 0, 0
fdiv_scale_1  	DD	03f700000h		;0.9375
fdiv_scale_2	DD	03f880000h		;1.0625
one_shl_63  	DD	05f000000h

fprem_risc_table 	DB 	0, 1, 0, 0, 4, 0, 0, 7, 0, 0, 10, 0, 0, 13, 0, 0
fprem_scale 		DB 	0, 0, 0, 0, 0, 0, 0eeh, 03fh
one_shl_64 		DB 	0, 0, 0, 0, 0, 0, 0f0h, 043h
one_shr_64 		DB 	0, 0, 0, 0, 0, 0, 0f0h, 03bh
one 			DB 	0, 0, 0, 0, 0, 0, 0f0h, 03fh
half 			DB 	0, 0, 0, 0, 0, 0, 0e0h, 03fh
big_number		DB	0, 0, 0, 0, 0, 0, 0ffh, 0ffh, 0feh, 07fh

ifdef	DEBUG
	public	fpcw
	public	fpsw
fpcw	dw	0
fpsw	dw	0
endif

FPU_STATE	STRUC
	CONTROL_WORD	DW	?
	reserved_1	DW	?
	STATUS_WORD	DD	?
	TAG_WORD	DW	?
	reserved_3	DW	?
	IP_OFFSET	DD	?
	CS_SLCT		DW	?
	OPCODE		DW	?
	DATA_OFFSET	DD	?
	OPERAND_SLCT	DW	?
	reserved_4	DW	?
FPU_STATE	ENDS

ENV_SIZE	EQU	28



dispatch_table DD	offset FLAT:label0
	DD	offset FLAT:label1
	DD	offset FLAT:label2
	DD	offset FLAT:label3
	DD	offset FLAT:label4
	DD	offset FLAT:label5
	DD	offset FLAT:label6
	DD	offset FLAT:label7
	DD	offset FLAT:label8
	DD	offset FLAT:label9
	DD	offset FLAT:label10
	DD	offset FLAT:label11
	DD	offset FLAT:label12
	DD	offset FLAT:label13
	DD	offset FLAT:label14
	DD	offset FLAT:label15
	DD	offset FLAT:label16
	DD	offset FLAT:label17
	DD	offset FLAT:label18
	DD	offset FLAT:label19
	DD	offset FLAT:label20
	DD	offset FLAT:label21
	DD	offset FLAT:label22
	DD	offset FLAT:label23
	DD	offset FLAT:label24
	DD	offset FLAT:label25
	DD	offset FLAT:label26
	DD	offset FLAT:label27
	DD	offset FLAT:label28
	DD	offset FLAT:label29
	DD	offset FLAT:label30
	DD	offset FLAT:label31
	DD	offset FLAT:label32
	DD	offset FLAT:label33
	DD	offset FLAT:label34
	DD	offset FLAT:label35
	DD	offset FLAT:label36
	DD	offset FLAT:label37
	DD	offset FLAT:label38
	DD	offset FLAT:label39
	DD	offset FLAT:label40
	DD	offset FLAT:label41
	DD	offset FLAT:label42
	DD	offset FLAT:label43
	DD	offset FLAT:label44
	DD	offset FLAT:label45
	DD	offset FLAT:label46
	DD	offset FLAT:label47
	DD	offset FLAT:label48
	DD	offset FLAT:label49
	DD	offset FLAT:label50
	DD	offset FLAT:label51
	DD	offset FLAT:label52
	DD	offset FLAT:label53
	DD	offset FLAT:label54
	DD	offset FLAT:label55
	DD	offset FLAT:label56
	DD	offset FLAT:label57
	DD	offset FLAT:label58
	DD	offset FLAT:label59
	DD	offset FLAT:label60
	DD	offset FLAT:label61
	DD	offset FLAT:label62
	DD	offset FLAT:label63


fpcw	dw	0



CODESEG


;
;  PRELIMINARY VERSION for register-register divides.
;


					; In this implementation the
					; fdiv_main_routine is called,
					; therefore all the stack frame
					; locations are adjusted for the
					; return pointer.

fdiv_main_routine PROC	NEAR

	fld     tbyte ptr [esp+MAIN_NUMER]	; load the numerator
	fld     tbyte ptr [esp+MAIN_DENOM]	; load the denominator
retry:

;  The following three lines test for denormals and zeros.
;  A denormal or zero has a 0 in the explicit digit to the left of the
;  binary point.  Since that bit is the high bit of the word, adding
;  it to itself will produce a carry if and only if the number is not
;  denormal or zero.
;
	mov 	eax, [esp+MAIN_DENOM+4]	; get mantissa bits 32-64
	add 	eax,eax			; shift the one's bit onto carry
	jnc 	denormal		; if no carry, we're denormal

;  The following three lines test the three bits after the four bit 
;  pattern (1,4,7,a,d).  If these three bits are not all one, then
;  the denominator cannot expose the flaw.  This condition is tested by
;  inverting the bits and testing that all are equal to zero afterward.

	xor 	eax, ONESMASK		; invert the bits that must be ones
	test	eax, ONESMASK		; and make sure they are all ones
	jz  	scale_if_needed		; if all are one scale numbers
	fdivp	st(1), st		; use of hardware is OK.
	ret

;
;  Now we test the four bits for one of the five patterns.
;
scale_if_needed:
	shr	eax, 28			; keep first 4 bits after point
	cmp	byte ptr fdiv_risc_table[eax], 0	; check for (1,4,7,a,d)
	jnz	divide_scaled		; are in potential problem area
	fdivp	st(1), st		; use of hardware is OK.
	ret

divide_scaled:
	mov	eax, [esp + MAIN_DENOM+8]	; test denominator exponent
	and	eax, 07fffh             ; if pseudodenormal ensure that only
	jz	invalid_denom		; invalid exception flag is set
	cmp	eax, 07fffh             ; if NaN or infinity  ensure that only
	je	invalid_denom		; invalid exception flag is set
;
;  The following six lines turn off exceptions and set the
;  precision control to 80 bits.  The former is necessary to
;  force any traps to be taken at the divide instead of the scaling
;  code.  The latter is necessary in order to get full precision for
;  codes with incoming 32 and 64 bit precision settings.  If
;  it can be guaranteed that before reaching this point, the underflow
;  exception is masked and the precision control is at 80 bits, these
;  six lines can be omitted.
;
	fnstcw	[esp+PREV_CW]		; save caller's control word
	mov	eax, [esp+PREV_CW] 
	or	eax, 033fh		; mask exceptions, pc=80
	and	eax, 0f3ffh		; set rounding mode to nearest
	mov	[esp+PATCH_CW], eax
	fldcw	[esp+PATCH_CW]		; mask exceptions & pc=80

;  The following lines check the numerator exponent before scaling.
;  This in order to prevent undeflow when scaling the numerator,
;  which will cause a denormal exception flag to be set when the
;  actual divide is preformed. This flag would not have been set
;  normally. If there is a risk of underflow, the scale factor is
;  17/16 instead of 15/16.
;
 	mov	eax, [esp+MAIN_NUMER+8]	; test numerator exponent
 	and	eax, 07fffh
 	cmp	eax, 00001h
 	je	small_numer

	fmul	fdiv_scale_1		; scale denominator by 15/16
	fxch
	fmul	fdiv_scale_1		; scale numerator by 15/16
	fxch

;
;  The next line restores the users control word.  If the incoming
;  control word had the underflow exception masked and precision
;  control set to 80 bits, this line can be omitted.
;

	fldcw	[esp+PREV_CW]		; restore caller's control word
	fdivp	st(1), st		; use of hardware is OK.
	ret

small_numer:
	fmul	fdiv_scale_2		; scale denominator by 17/16
	fxch
	fmul	fdiv_scale_2		; scale numerator by 17/16
	fxch

;
;  The next line restores the users control word.  If the incoming
;  control word had the underflow exception masked and precision
;  control set to 80 bits, this line can be omitted.
;

	fldcw	[esp+PREV_CW]		; restore caller's control word
	fdivp	st(1), st		; use of hardware is OK.
	ret

denormal:
	mov	eax, [esp+MAIN_DENOM]	; test for whole mantissa == 0
	or	eax, [esp+MAIN_DENOM+4]	; test for whole mantissa == 0
	jnz	denormal_divide_scaled	; denominator is not zero
invalid_denom:				; zero or invalid denominator
	fdivp	st(1), st		; use of hardware is OK.
	ret

denormal_divide_scaled:
	mov	eax, [esp + MAIN_DENOM + 8]	; get exponent 
	and	eax, 07fffh		; check for zero exponent
	jnz	invalid_denom		; 
;
;  The following six lines turn off exceptions and set the
;  precision control to 80 bits.  The former is necessary to
;  force any traps to be taken at the divide instead of the scaling
;  code.  The latter is necessary in order to get full precision for
;  codes with incoming 32 and 64 bit precision settings.  If
;  it can be guaranteed that before reaching this point, the underflow
;  exception is masked and the precision control is at 80 bits, these
;  six lines can be omitted.
;

	fnstcw	[esp+PREV_CW]		; save caller's control word
	mov	eax, [esp+PREV_CW] 
	or	eax, 033fh		; mask exceptions, pc=80
	and	eax, 0f3ffh		; set rounding mode to nearest
	mov	[esp+PATCH_CW], eax
	fldcw	[esp+PATCH_CW]		; mask exceptions & pc=80

	mov	eax, [esp + MAIN_NUMER +8]	; test numerator exponent
	and	eax, 07fffh		; check for denormal numerator
	je	denormal_numer	
	cmp	eax, 07fffh		; NaN or infinity
	je	invalid_numer
	mov	eax, [esp + MAIN_NUMER + 4]	; get bits 32..63 of mantissa
	add	eax, eax		; shift the first bit into carry
	jnc	invalid_numer		; if there is no carry, we have an
					; invalid numer
	jmp	numer_ok

denormal_numer:
	mov	eax, [esp + MAIN_NUMER + 4]	; get bits 32..63 of mantissa
	add	eax, eax		; shift the first bit into carry
	jc	invalid_numer		; if there is a carry, we have an
					; invalid numer
	
numer_ok:
	fxch
	fstp	st			; pop numerator
	fld 	st			; make copy of denominator
	fmul	dword ptr[one_shl_63]	; make denominator not denormal
	fstp	tbyte ptr [esp+MAIN_DENOM]	; save modified denominator
	fld 	tbyte ptr [esp+MAIN_NUMER]	; load numerator
	fxch				; restore proper order
	fwait

;  The next line restores the users control word.  If the incoming
;  control word had the underflow exception masked and precision
;  control set to 80 bits, this line can be omitted.
;

	fldcw	[esp+PREV_CW]		; restore caller's control word
	jmp	retry			; start the whole thing over

invalid_numer:
;
;  The next line restores the users control word.  If the incoming
;  control word had the underflow exception masked and precision
;  control set to 80 bits, this line can be omitted.
;
	fldcw	[esp + PREV_CW]
	fdivp	st(1), st		; use of hardware is OK.
	ret

fdiv_main_routine	ENDP

fdivr_st	MACRO	reg_index, reg_index_minus1
	fstp	tbyte ptr [esp+DENOM]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fstp	tbyte ptr [esp+NUMER]
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	tbyte ptr [esp+NUMER]
	fxch	st(reg_index)
	add	esp, STACK_SIZE
ENDM

fdivr_sti	MACRO	reg_index, reg_index_minus1
	fstp	tbyte ptr [esp+NUMER]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fstp	tbyte ptr [esp+DENOM]
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	tbyte ptr [esp+NUMER]
	add	esp, STACK_SIZE
ENDM

fdivrp_sti	MACRO	reg_index, reg_index_minus1
	fstp	tbyte ptr [esp+NUMER]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fstp	tbyte ptr [esp+DENOM]
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	add	esp, STACK_SIZE
ENDM

fdiv_st		MACRO	reg_index, reg_index_minus1
	fstp	tbyte ptr [esp+NUMER]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	st
	fstp	tbyte ptr [esp+DENOM]
	fstp	tbyte ptr [esp+DENOM_SAVE]	; save original denom, 
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	tbyte ptr [esp+DENOM_SAVE]
	fxch	st(reg_index)
	add	esp, STACK_SIZE
ENDM

fdiv_sti	MACRO	reg_index, reg_index_minus1
	fxch	st(reg_index)
	fstp	tbyte ptr [esp+NUMER]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	st
	fstp	tbyte ptr [esp+DENOM]
	fstp	tbyte ptr [esp+DENOM_SAVE]	; save original denom, 
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fld	tbyte ptr [esp+DENOM_SAVE]
	add	esp, STACK_SIZE
ENDM

fdivp_sti	MACRO	reg_index, reg_index_minus1
	fstp	tbyte ptr [esp+DENOM]
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	fstp	tbyte ptr [esp+NUMER]
	call	fdiv_main_routine
IF	reg_index_minus1 GE 1
	fxch	st(reg_index_minus1)
ENDIF
	add	esp, STACK_SIZE
ENDM 


	public  _adj_fdiv_r
_adj_fdiv_r      PROC    NEAR

	sub	esp, STACK_SIZE			; added back at end of fdiv_x macros
	and eax, 0000003FH			; upper 26 bits could be anything
	jmp	dword ptr dispatch_table[eax*4]



label0::
	fdiv	st,st(0)		; D8 F0 	FDIV	ST,ST(0)
	add	esp, STACK_SIZE
	ret
label1::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label2::
	fdivr	st,st(0)		; D8 F8		FDIVR	ST,ST(0)
	add	esp, STACK_SIZE
	ret
label3::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label4::
	fdiv 	st(0),st		; DC F8/D8 F0	FDIV	ST(0),ST
	add	esp, STACK_SIZE
	ret
label5::
	fdivp 	st(0),st		; DE F8		FDIVP	ST(0),ST
	add	esp, STACK_SIZE
	ret
label6::
	fdivr 	st(0),st		; DC F0/DE F0	FDIVR	ST(0),ST
	add	esp, STACK_SIZE
	ret
label7::
	fdivrp 	st(0),st		; DE F0		FDIVRP	ST(0),ST
	add	esp, STACK_SIZE
	ret
label8::
	fdiv_st 1,0
	ret
label9::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label10::
	fdivr_st 1,0
	ret
label11::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label12::
	fdiv_sti 1,0
	ret
label13::
	fdivp_sti 1,0
	ret
label14::
	fdivr_sti 1,0
	ret
label15::
	fdivrp_sti 1,0
	ret
label16::
	fdiv_st 2,1
	ret
label17::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label18::
	fdivr_st 2,1
	ret
label19::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label20::
	fdiv_sti 2,1
	ret
label21::
	fdivp_sti 2,1
	ret
label22::
	fdivr_sti 2,1
	ret
label23::
	fdivrp_sti 2,1
	ret
label24::
	fdiv_st 3,2
	ret
label25::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label26::
	fdivr_st 3,2
	ret
label27::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label28::
	fdiv_sti 3,2
	ret
label29::
	fdivp_sti 3,2
	ret
label30::
	fdivr_sti 3,2
	ret
label31::
	fdivrp_sti 3,2
	ret
label32::
	fdiv_st 4,3
	ret
label33::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label34::
	fdivr_st 4,3
	ret
label35::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label36::
	fdiv_sti 4,3
	ret
label37::
	fdivp_sti 4,3
	ret
label38::
	fdivr_sti 4,3
	ret
label39::
	fdivrp_sti 4,3
	ret
label40::
	fdiv_st 5,4
	ret
label41::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label42::
	fdivr_st 5,4
	ret
label43::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label44::
	fdiv_sti 5,4
	ret
label45::
	fdivp_sti 5,4
	ret
label46::
	fdivr_sti 5,4
	ret
label47::
	fdivrp_sti 5,4
	ret
label48::
	fdiv_st 6,5
	ret
label49::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label50::
	fdivr_st 6,5
	ret
label51::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label52::
	fdiv_sti 6,5
	ret
label53::
	fdivp_sti 6,5
	ret
label54::
	fdivr_sti 6,5
	ret
label55::
	fdivrp_sti 6,5
	ret
label56::
	fdiv_st 7,6
	ret
label57::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label58::
	fdivr_st 7,6
	ret
label59::
	add	esp, STACK_SIZE
	int	ILLEGAL_OPC
label60::
	fdiv_sti 7,6
	ret
label61::
	fdivp_sti 7,6
	ret
label62::
	fdivr_sti 7,6
	ret
label63::
	fdivrp_sti 7,6
	ret
_adj_fdiv_r      ENDP



_fdivp_sti_st	PROC	NEAR
				; for calling from mem routines
	sub	esp, STACK_SIZE			; added back at end of fdivp_sti macro
	fdivp_sti 1, 0
	ret
_fdivp_sti_st	ENDP

_fdivrp_sti_st	PROC	NEAR
				; for calling from mem routines
	sub	esp, STACK_SIZE			; added back at end of fdivrp_sti macro
	fdivrp_sti 1, 0
	ret
_fdivrp_sti_st	ENDP


;;; _adj_fdiv_m32 - FDIV m32real FIX
;;
;; 	Input : Value of the m32real in the top of STACK
;;
;;	Output: Result of FDIV in ST

	PUBLIC	_adj_fdiv_m32
_adj_fdiv_m32	PROC	NEAR

	push	eax				; save eax
	mov	eax, [esp + MEM_OPERAND]	; check for
	and	eax, SINGLE_NAN			; NaN
	cmp	eax, SINGLE_NAN			;
	je	memory_divide_m32		;

	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack			; is FP stack full?
	fld	dword ptr[esp + MEM_OPERAND]	; load m32real in ST
	call	_fdivp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fld	dword ptr[esp + SPILL_MEM_OPERAND] ; load m32 real 
	call	_fdivp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivrp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4
memory_divide_m32:
	fdiv	dword ptr[esp + MEM_OPERAND]	; do actual divide
	pop	eax
	ret 4

_adj_fdiv_m32	ENDP
	

;;; _adj_fdiv_m64 - FDIV m64real FIX
;;
;; 	Input : Value of the m64real in the top of STACK
;;
;;	Output: Result of FDIV in ST

	PUBLIC	_adj_fdiv_m64
_adj_fdiv_m64	PROC	NEAR

	push	eax				; save eax
	mov	eax, [esp + MEM_OPERAND + 4]	; check for
	and	eax, DOUBLE_NAN			; NaN
	cmp	eax, DOUBLE_NAN			;
	je	memory_divide_m64		;

	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m64		; is FP stack full?
	fld	qword ptr[esp + MEM_OPERAND]	; load m64real in ST
	call	_fdivp_sti_st			; do actual divide
	pop	eax
	ret 8
spill_fpstack_m64:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp]			; save user's ST(1)
	fld	qword ptr[esp + SPILL_MEM_OPERAND] ; load m64real 
	call	_fdivp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivrp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 8
memory_divide_m64:
	fdiv	qword ptr[esp + MEM_OPERAND]	; do actual divide
	pop	eax
	ret 8

_adj_fdiv_m64	ENDP

;;; _adj_fdiv_m16i - FDIV m16int FIX
;;
;; 	Input : Value of the m16int in the top of STACK
;;
;;	Output: Result of FDIV in ST

	PUBLIC	_adj_fdiv_m16i
_adj_fdiv_m16i	PROC	NEAR
	push	eax				; save eax
	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m16i		; is FP stack full?
	fild	word ptr[esp + MEM_OPERAND]	; load m16int in ST
	call	_fdivp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack_m16i:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fild	word ptr[esp + SPILL_MEM_OPERAND] ; load m16int
	call	_fdivp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivrp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4

_adj_fdiv_m16i	ENDP

;;; _adj_fdiv_m32i - FDIV m32int FIX
;;
;; 	Input : Value of the m32int in the top of STACK
;;
;;	Output: Result of FDIV in ST

	PUBLIC	_adj_fdiv_m32i
_adj_fdiv_m32i	PROC	NEAR
	push	eax				; save eax
	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m32i		; is FP stack full?
	fild	dword ptr[esp + MEM_OPERAND]	; load m32int in ST
	call	_fdivp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack_m32i:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fild	dword ptr[esp + SPILL_MEM_OPERAND] ; load m32int
	call	_fdivp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivrp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4

_adj_fdiv_m32i	ENDP



;;; _adj_fdivr_m32 - FDIVR m32real FIX
;;
;; 	Input : Value of the m32real in the top of STACK
;;
;;	Output: Result of FDIVR in ST

	PUBLIC	_adj_fdivr_m32
_adj_fdivr_m32	PROC	NEAR
	push	eax				; save eax
	mov	eax, [esp + MEM_OPERAND]	; check for
	and	eax, SINGLE_NAN			; NaN
	cmp	eax, SINGLE_NAN			;
	je	memory_divide_m32r		;

	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m32r		; is FP stack full?
	fld	dword ptr[esp + MEM_OPERAND]	; load m32real in ST
	call	_fdivrp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack_m32r:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fld	dword ptr[esp + SPILL_MEM_OPERAND] ; load m32 real 
	call	_fdivrp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4
memory_divide_m32r:
	fdivr	dword ptr[esp + MEM_OPERAND]	; do actual divide
	pop	eax
	ret 4

_adj_fdivr_m32	ENDP


;;; _adj_fdivr_m64 - FDIVR m64real FIX
;;
;; 	Input : Value of the m64real in the top of STACK
;;
;;	Output: Result of FDIVR in ST

	PUBLIC	_adj_fdivr_m64
_adj_fdivr_m64	PROC	NEAR
	push	eax				; save eax
	mov	eax, [esp + MEM_OPERAND + 4]	; check for
	and	eax, DOUBLE_NAN			; NaN
	cmp	eax, DOUBLE_NAN			;
	je	memory_divide_m64r		;

	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m64r		; is FP stack full?
	fld	qword ptr[esp + MEM_OPERAND]	; load m64real in ST
	call	_fdivrp_sti_st			; do actual divide
	pop	eax
	ret 8
spill_fpstack_m64r:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fld	qword ptr[esp + SPILL_MEM_OPERAND] ; load m64real 
	call	_fdivrp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 8
memory_divide_m64r:
	fdivr	qword ptr[esp + MEM_OPERAND]	; do actual divide
	pop	eax
	ret 8

_adj_fdivr_m64	ENDP


;;; _adj_fdivr_m16i - FDIVR m16int FIX
;;
;; 	Input : Value of the m16int in the top of STACK
;;
;;	Output: Result of FDIVR in ST

	PUBLIC	_adj_fdivr_m16i
_adj_fdivr_m16i	PROC	NEAR
	push	eax				; save eax
	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m16ir		; is FP stack full?
	fild	word ptr[esp + MEM_OPERAND]	; load m16int in ST
	call	_fdivrp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack_m16ir:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fild	word ptr[esp + SPILL_MEM_OPERAND] ; load m16int
	call	_fdivrp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4

_adj_fdivr_m16i	ENDP


;;; _adj_fdivr_m32i - FDIVR m32int FIX
;;
;; 	Input : Value of the m32int in the top of STACK
;;
;;	Output: Result of FDIVR in ST

	PUBLIC	_adj_fdivr_m32i
_adj_fdivr_m32i	PROC	NEAR
	push	eax				; save eax
	fnstsw	ax				; get status word
	and	eax, 3800h			; get top of stack
	je	spill_fpstack_m32ir		; is FP stack full?
	fild	dword ptr[esp + MEM_OPERAND]	; load m32int in ST
	call	_fdivrp_sti_st			; do actual divide
	pop	eax
	ret 4
spill_fpstack_m32ir:
	fxch
	sub	esp, SPILL_SIZE 		; make temp space
	fstp	tbyte ptr[esp ]			; save user's ST(1)
	fild	dword ptr[esp + SPILL_MEM_OPERAND] ; load m32int
	call	_fdivrp_sti_st			; do actual divide
	fld	tbyte ptr[esp]			; restore user's ST(1)
						;esp is adjusted by fdivp fn
	fxch
	add	esp, SPILL_SIZE
	pop	eax
	ret 4

_adj_fdivr_m32i	ENDP


;;; _safe_fdiv - FDIV fix
;;
;;	Pentium-safe version of FDIV, aka FDIVP ST(1),ST(0)
;;
;; 	Input : Numerator in ST(1), Denominator in ST(0)
;;
;;	Output: Result of FDIV in ST(0)


	PUBLIC  _safe_fdiv
_safe_fdiv      PROC    NEAR

	push eax
	sub	esp, STACK_SIZE
	fstp	tbyte ptr [esp+DENOM]
	fstp	tbyte ptr [esp+NUMER]
	call	fdiv_main_routine
	add	esp, STACK_SIZE
	pop eax
	ret

_safe_fdiv	ENDP


;;; _safe_fdivr - FDIVR fix
;;
;;	Pentium-safe version of FDIVR, aka FDIVRP ST(1),ST(0)
;;
;; 	Input : Numerator in ST(0), Denominator in ST(1)
;;
;;	Output: Result of FDIVR in ST(0)

	public  _safe_fdivr
_safe_fdivr      PROC    NEAR

	push eax
	sub	esp, STACK_SIZE
	fstp	tbyte ptr [esp+NUMER]
	fstp	tbyte ptr [esp+DENOM]
	call	fdiv_main_routine
	add	esp, STACK_SIZE
	pop eax
	ret

_safe_fdivr	ENDP



;;; _adj_fprem - FPREM FIX
;;
;;	Based on PRELIMINARY Intel code.


_fprem_common	PROC	NEAR

	push	eax
	push	ebx
	push	ecx
    mov     eax, [FPREM_MAIN_DENOM+6+esp] ; exponent and high 16 bits of mantissa
    xor     eax, FPREM_ONESMASK           ; invert bits that have to be one
    test    eax, FPREM_ONESMASK           ; check bits that have to be one
	jnz     remainder_hardware_ok
    shr     eax, 11
    and     eax, 0fh
    cmp     byte ptr fprem_risc_table[eax], 0     ; check for (1,4,7,a,d)
    jz      remainder_hardware_ok

; The denominator has the bit pattern. Weed out the funny cases like NaNs
; before applying the software version. Our caller guarantees that the
; denominator is not a denormal. Here we check for:
;	denominator	inf, NaN, unnormal
;	numerator	inf, NaN, unnormal, denormal

	mov     eax, [FPREM_MAIN_DENOM+6+esp] ; exponent and high 16 bits of mantissa
    and     eax, 07fff0000h	        ; mask the exponent only
    cmp     eax, 07fff0000h         ; check for INF or NaN 
	je  	remainder_hardware_ok
	mov     eax, [FPREM_MAIN_NUMER+6+esp] ; exponent and high 16 bits of mantissa
    and     eax, 07fff0000h		; mask the exponent only
	jz  	remainder_hardware_ok	; jif numerator denormal
    cmp     eax, 07fff0000h         ; check for INF or NaN 
	je  	remainder_hardware_ok
	mov 	eax, [esp + FPREM_MAIN_NUMER + 4]	; high mantissa bits - numerator
	add 	eax, eax		; set carry if explicit bit set
	jnz 	remainder_hardware_ok	; jmp if numerator is unnormal
	mov 	eax, [esp + FPREM_MAIN_DENOM + 4] ; high mantissa bits - denominator
	add 	eax, eax		; set carry if explicit bit set
	jnz 	remainder_hardware_ok	; jmp if denominator is unnormal

rem_patch:
    mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    and     eax, 07fffh              ; clear sy
    add     eax, 63                  ; evaluate ey + 63 
    mov     ebx, [FPREM_MAIN_NUMER+8+esp]  ; sign and exponent of x (numerator)
    and     ebx, 07fffh              ; clear sx
    sub     ebx, eax                 ; evaluate the exponent difference (ex - ey)
    ja      rem_large	 	; if ex > ey + 63, case of large arguments 
rem_patch_loop:
	mov     eax, [FPREM_MAIN_DENOM+8+esp]  ; sign and exponent of y (denominator)
	and     eax, 07fffh		; clear sy
	add 	eax, 10			; evaluate ey + 10
	mov     ebx, [FPREM_MAIN_NUMER+8+esp]	; sign and exponent of x (numerator)
	and     ebx, 07fffh		; clear sx 	
	sub 	ebx, eax		; evaluate the exponent difference (ex - ey)
	js  	remainder_hardware_ok	; safe if ey + 10 > ex
	fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
	and     ebx, 07fffh             ; clear sx
	mov		ecx, ebx
	sub		ebx, eax
	and		ebx, 07h
	or		ebx, 04h
	sub		ecx, ebx
	mov		ebx, eax
	and     ebx, 08000h		; keep sy
	or		ecx, ebx		; merge the sign of y
	mov		dword ptr [FPREM_MAIN_DENOM+8+esp], ecx
	fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the shifted denominator
	mov     dword ptr [FPREM_MAIN_DENOM+8+esp], eax	; restore the initial denominator
	fxch
	fprem				; this rem is safe
	fstp	tbyte ptr [FPREM_MAIN_NUMER+esp]	; update the numerator
	fstp    st(0)                   ; pop the stack
	jmp     rem_patch_loop 
rem_large:
	test	edx, 02h		; is denominator already saved
	jnz 	already_saved
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM]
	fstp	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]	; save denominator
already_saved:
	; Save user's precision control and institute 80.  The fp ops in
	; rem_large_loop must not round to user's precision (if it is less
	; than 80) because the hardware would not have done so.  We are
	; aping the hardware here, which is all extended.

	fnstcw	[esp+FPREM_MAIN_PREV_CW]	; save caller's control word
	mov 	eax, dword ptr[esp + FPREM_MAIN_PREV_CW]
	or  	eax, 033fh		; mask exceptions, pc=80
	mov 	[esp + FPREM_MAIN_PATCH_CW], eax
	fldcw	[esp + FPREM_MAIN_PATCH_CW]	

    mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    and     eax, 07fffh             ; clear sy
    mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
    and     ebx, 07fffh             ; clear sx
	sub 	ebx, eax		; evaluate the exponent difference
	and 	ebx, 03fh
	or  	ebx, 020h
	add 	ebx, 1
	mov 	ecx, ebx
	mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
	mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
    and     ebx, 07fffh             ; clear sx
    and     eax, 08000h             ; keep sy
    or      ebx, eax                ; merge the sign of y
    mov     dword ptr[FPREM_MAIN_DENOM+8+esp], ebx	; make ey equal to ex (scaled denominator)
    fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the scaled denominator
	fabs
    fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	fabs
rem_large_loop:
	fcom
	fnstsw  ax	
	and     eax, 00100h		
	jnz 	rem_no_sub
	fsub	st, st(1)
rem_no_sub:
	fxch
	fmul	qword ptr half
	fxch
	sub	ecx, 1			; decrement the loop counter
	jnz 	rem_large_loop
	mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
	fstp	tbyte ptr[esp + FPREM_MAIN_NUMER]	; save result
	fstp	st			; toss modified denom
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]
	fld 	tbyte ptr[big_number]	; force C2 to be set
	fprem
	fstp	st
	fld 	tbyte ptr[esp + FPREM_MAIN_NUMER]	; restore saved result
	
	fldcw	[esp + FPREM_MAIN_PREV_CW]	; restore caller's control word
	and     ebx, 08000h             ; keep sx
	jz  	rem_done
	fchs
	jmp  	rem_done
remainder_hardware_ok:
    fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the denominator
    fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	fprem                           ; and finally do a remainder
; prem_main_routine end
rem_done:
	test	edx, 03h
	jz  	rem_exit
	fnstsw	[esp + FPREM_MAIN_FPREM_SW]	; save Q0 Q1 and Q2
	test	edx, 01h
	jz  	do_not_de_scale
; De-scale the result. Go to pc=80 to prevent from fmul
; from user precision (fprem does not round the result).
	fnstcw	[esp + FPREM_MAIN_PREV_CW]	; save callers control word
	mov 	eax, [esp + FPREM_MAIN_PREV_CW]
	or  	eax, 0300h		; pc = 80
	mov 	[esp + FPREM_MAIN_PATCH_CW], eax
	fldcw	[esp + FPREM_MAIN_PATCH_CW]
	fmul	qword ptr one_shr_64
	fldcw	[esp + FPREM_MAIN_PREV_CW]	; restore callers CW
do_not_de_scale:
	mov	eax, [esp + FPREM_MAIN_FPREM_SW]
	fxch
	fstp	st
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]
	fxch
	and 	eax, 04300h		; restore saved Q0, Q1, Q2
	sub 	esp, ENV_SIZE
	fnstenv	[esp]
	and 	[esp].STATUS_WORD, 0bcffh
	or  	[esp].STATUS_WORD, eax
	fldenv	[esp]
	add 	esp, ENV_SIZE
rem_exit:
	pop 	ecx
	pop 	ebx
	pop 	eax
	ret
_fprem_common	ENDP




    PUBLIC  _adj_fprem
_adj_fprem	PROC	NEAR
	push	edx
    sub     esp, FPREM_STACK_SIZE
    fstp    tbyte ptr [FPREM_NUMER+esp]
    fstp    tbyte ptr [FPREM_DENOM+esp]
	xor 	edx, edx
; prem_main_routine begin
    mov     eax,[FPREM_DENOM+6+esp]       ; exponent and high 16 bits of mantissa
    test    eax,07fff0000h          ; check for denormal
    jz      fprem_denormal
	call	_fprem_common
	add 	esp, FPREM_STACK_SIZE
	pop 	edx
	ret

fprem_denormal:
    fld     tbyte ptr [FPREM_DENOM+esp]   ; load the denominator
    fld     tbyte ptr [FPREM_NUMER+esp]   ; load the numerator
    mov     eax, [FPREM_DENOM+esp]        ; test for whole mantissa == 0
    or      eax, [FPREM_DENOM+4+esp]      ; test for whole mantissa == 0
    jz      remainder_hardware_ok_l ; denominator is zero
	fxch
	fstp	tbyte ptr[esp + FPREM_DENOM_SAVE]	; save org denominator
	fld 	tbyte ptr[esp + FPREM_DENOM]
	fxch
	or  	edx, 02h
;
; For this we need pc=80.  Also, mask exceptions so we don't take any
; denormal operand exceptions.  It is guaranteed that the descaling
; later on will take underflow, which is what the hardware would have done
; on a normal fprem.
;
    fnstcw  [FPREM_PREV_CW+esp]         ; save caller's control word
    mov     eax, [FPREM_PREV_CW+esp] 
    or      eax, 0033fh             	; mask exceptions, pc=80
    mov     [FPREM_PATCH_CW+esp], eax
    fldcw   [FPREM_PATCH_CW+esp]        ; mask exceptions & pc=80

; The denominator is a denormal.  For most numerators, scale both numerator
; and denominator to get rid of denormals.  Then execute the common code
; with the flag set to indicate that the result must be de-scaled.
; For large numerators this won't work because the scaling would cause
; overflow.  In this case we know the numerator is large, the denominator
; is small (denormal), so the exponent difference is also large.  This means
; the rem_large code will be used and this code depends on the difference
; in exponents modulo 64.  Adding 64 to the denominators exponent
; doesn't change the modulo 64 difference.  So we can scale the denominator
; by 64, making it not denormal, and this won't effect the result.
;
; To start with, figure out if numerator is large

	mov 	eax, [esp + FPREM_NUMER + 8]	; load numerator exponent
	and 	eax, 7fffh		; isolate numerator exponent
	cmp 	eax, 7fbeh		; compare Nexp to Maxexp-64
	ja  	big_numer_rem_de	; jif big numerator

; So the numerator is not large scale both numerator and denominator

	or  	edx, 1			; edx = 1, if denormal extended divisor
	fmul	qword ptr one_shl_64	; make numerator not denormal
	fstp	tbyte ptr[esp + FPREM_NUMER]
	fmul	qword ptr one_shl_64	; make denominator not denormal
	fstp	tbyte ptr[esp + FPREM_DENOM]
	jmp 	scaling_done

; The numerator is large.  Scale only the denominator, which will not
; change the result which we know will be partial.  Set the scale flag
; to false.
big_numer_rem_de:
; We must do this with pc=80 to avoid rounding to single/double.
; In this case we do not mask exceptions so that we will take
; denormal operand, as would the hardware.
	fnstcw  [FPREM_PREV_CW+esp]				; save caller's control word
	mov     eax, [FPREM_PREV_CW+esp] 
	or      eax, 00300h             		; pc=80
	mov     [FPREM_PATCH_CW+esp], eax
	fldcw   [FPREM_PATCH_CW+esp]			; pc=80

	fstp	st			; Toss numerator
	fmul	qword ptr one_shl_64	; make denominator not denormal
	fstp	tbyte ptr[esp + FPREM_DENOM]
	
; Restore the control word which was fiddled to scale at 80-bit precision.
; Then call the common code.
scaling_done:
	fldcw	[esp + FPREM_PREV_CW] 	; restore callers control word
	call	_fprem_common
	add 	esp, FPREM_STACK_SIZE
	pop 	edx
	ret
	
remainder_hardware_ok_l:
    fprem              		; and finally do a remainder 

    add     esp, FPREM_STACK_SIZE
	pop 	edx
    ret

_adj_fprem	ENDP



;
; FPREM1 code begins here
;


_fprem1_common	PROC	NEAR

	push	eax
	push	ebx
	push	ecx
    mov     eax, [FPREM_MAIN_DENOM+6+esp] ; exponent and high 16 bits of mantissa
    xor     eax, FPREM_ONESMASK           ; invert bits that have to be one
    test    eax, FPREM_ONESMASK           ; check bits that have to be one
	jnz     remainder1_hardware_ok
    shr     eax, 11
    and     eax, 0fh
    cmp     byte ptr fprem_risc_table[eax], 0     ; check for (1,4,7,a,d)
    jz      remainder1_hardware_ok

; The denominator has the bit pattern. Weed out the funny cases like NaNs
; before applying the software version. Our caller guarantees that the
; denominator is not a denormal. Here we check for:
;	denominator	inf, NaN, unnormal
;	numerator	inf, NaN, unnormal, denormal

	mov     eax, [FPREM_MAIN_DENOM+6+esp] ; exponent and high 16 bits of mantissa
    and     eax, 07fff0000h	        ; mask the exponent only
    cmp     eax, 07fff0000h         ; check for INF or NaN 
	je  	remainder1_hardware_ok
	mov     eax, [FPREM_MAIN_NUMER+6+esp] ; exponent and high 16 bits of mantissa
    and     eax, 07fff0000h		; mask the exponent only
	jz  	remainder1_hardware_ok	; jif numerator denormal
    cmp     eax, 07fff0000h         ; check for INF or NaN 
	je  	remainder1_hardware_ok
	mov 	eax, [esp + FPREM_MAIN_NUMER + 4]	; high mantissa bits - numerator
	add 	eax, eax		; set carry if explicit bit set
	jnz 	remainder1_hardware_ok	; jmp if numerator is unnormal
	mov 	eax, [esp + FPREM_MAIN_DENOM + 4] ; high mantissa bits - denominator
	add 	eax, eax		; set carry if explicit bit set
	jnz 	remainder1_hardware_ok	; jmp if denominator is unnormal

rem1_patch:
    mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    and     eax, 07fffh              ; clear sy
    add     eax, 63                  ; evaluate ey + 63 
    mov     ebx, [FPREM_MAIN_NUMER+8+esp]  ; sign and exponent of x (numerator)
    and     ebx, 07fffh              ; clear sx
    sub     ebx, eax                 ; evaluate the exponent difference (ex - ey)
    ja      rem1_large	 	; if ex > ey + 63, case of large arguments 
rem1_patch_loop:
	mov     eax, [FPREM_MAIN_DENOM+8+esp]  ; sign and exponent of y (denominator)
	and     eax, 07fffh		; clear sy
	add 	eax, 10			; evaluate ey + 10
	mov     ebx, [FPREM_MAIN_NUMER+8+esp]	; sign and exponent of x (numerator)
	and     ebx, 07fffh		; clear sx 	
	sub 	ebx, eax		; evaluate the exponent difference (ex - ey)
	js  	remainder1_hardware_ok	; safe if ey + 10 > ex
	fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
	and     ebx, 07fffh             ; clear sx
	mov		ecx, ebx
	sub		ebx, eax
	and		ebx, 07h
	or		ebx, 04h
	sub		ecx, ebx
	mov		ebx, eax
	and     ebx, 08000h				; keep sy
	or		ecx, ebx				; merge the sign of y
	mov		dword ptr [FPREM_MAIN_DENOM+8+esp], ecx
	fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the shifted denominator
	mov     dword ptr [FPREM_MAIN_DENOM+8+esp], eax	; restore the initial denominator
	fxch
	fprem				; this rem is safe
	fstp	tbyte ptr [FPREM_MAIN_NUMER+esp]	; update the numerator
	fstp    st(0)                   ; pop the stack
	jmp     rem1_patch_loop 
rem1_large:
	test	ebx, 02h		; is denominator already saved
	jnz 	already_saved1
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM]
	fstp	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]	; save denominator
already_saved1:
	; Save user's precision control and institute 80.  The fp ops in
	; rem1_large_loop must not round to user's precision (if it is less
	; than 80) because the hardware would not have done so.  We are
	; aping the hardware here, which is all extended.

	fnstcw	[esp+FPREM_MAIN_PREV_CW]	; save caller's control word
	mov 	eax, dword ptr[esp + FPREM_MAIN_PREV_CW]
	or  	eax, 033fh		; mask exceptions, pc=80
	mov 	[esp + FPREM_MAIN_PATCH_CW], eax
	fldcw	[esp + FPREM_MAIN_PATCH_CW]	

    mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
    and     eax, 07fffh             ; clear sy
    mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
    and     ebx, 07fffh             ; clear sx
	sub 	ebx, eax		; evaluate the exponent difference
	and 	ebx, 03fh
	or  	ebx, 020h
	add 	ebx, 1
	mov 	ecx, ebx
	mov     eax, [FPREM_MAIN_DENOM+8+esp] ; sign and exponent of y (denominator)
	mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
    and     ebx, 07fffh             ; clear sx
    and     eax, 08000h             ; keep sy
    or      ebx, eax                ; merge the sign of y
    mov     dword ptr[FPREM_MAIN_DENOM+8+esp], ebx	; make ey equal to ex (scaled denominator)
    fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the scaled denominator
	fabs
    fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	fabs
rem1_large_loop:
	fcom
	fnstsw  ax	
	and     eax, 00100h		
	jnz	rem1_no_sub
	fsub	st, st(1)
rem1_no_sub:
	fxch
	fmul	qword ptr half
	fxch
	sub 	ecx, 1			; decrement the loop counter
	jnz 	rem1_large_loop
	mov     ebx, [FPREM_MAIN_NUMER+8+esp] ; sign and exponent of x (numerator)
	fstp	tbyte ptr[esp + FPREM_MAIN_NUMER]	; save result
	fstp	st			; toss modified denom
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]
	fld 	tbyte ptr[big_number]	; force C2 to be set
	fprem1
	fstp	st
	fld 	tbyte ptr[esp + FPREM_MAIN_NUMER]	; restore saved result
	
	fldcw	[esp + FPREM_MAIN_PREV_CW]	; restore caller's control word
	and     ebx, 08000h             ; keep sx
	jz  	rem1_done
	fchs
	jmp 	rem1_done
remainder1_hardware_ok:
    fld     tbyte ptr [FPREM_MAIN_DENOM+esp]   ; load the denominator
    fld     tbyte ptr [FPREM_MAIN_NUMER+esp]   ; load the numerator
	fprem1                           ; and finally do a remainder
; prem1_main_routine end
rem1_done:
	test	edx, 03h
	jz  	rem1_exit
	fnstsw	[esp + FPREM_MAIN_FPREM_SW]	; save Q0 Q1 and Q2
	test	edx, 01h
	jz  	do_not_de_scale1
; De-scale the result. Go to pc=80 to prevent from fmul
; from user precision (fprem does not round the result).
	fnstcw	[esp + FPREM_MAIN_PREV_CW]	; save callers control word
	mov 	eax, [esp + FPREM_MAIN_PREV_CW]
	or  	eax, 0300h		; pc = 80
	mov 	[esp + FPREM_MAIN_PATCH_CW], eax
	fldcw	[esp + FPREM_MAIN_PATCH_CW]
	fmul	qword ptr one_shr_64
	fldcw	[esp + FPREM_MAIN_PREV_CW]	; restore callers CW
do_not_de_scale1:
	mov	eax, [esp + FPREM_MAIN_FPREM_SW]
	fxch
	fstp	st
	fld 	tbyte ptr[esp + FPREM_MAIN_DENOM_SAVE]
	fxch
	and 	eax, 04300h		; restore saved Q0, Q1, Q2
	sub 	esp, ENV_SIZE
	fnstenv	[esp]
	and 	[esp].STATUS_WORD, 0bcffh
	or  	[esp].STATUS_WORD, eax
	fldenv	[esp]
	add 	esp, ENV_SIZE
rem1_exit:
	pop	ecx
	pop	ebx
	pop	eax
	ret
_fprem1_common	ENDP

	PUBLIC	_adj_fprem1
_adj_fprem1	PROC	NEAR

	push	edx
    sub     esp, FPREM_STACK_SIZE
    fstp    tbyte ptr [FPREM_NUMER+esp]
    fstp    tbyte ptr [FPREM_DENOM+esp]
	mov 	edx, 0
; prem1_main_routine begin
    mov     eax,[FPREM_DENOM+6+esp]       ; exponent and high 16 bits of mantissa
    test    eax,07fff0000h          ; check for denormal
    jz      denormal1
	call	_fprem1_common
	add 	esp, FPREM_STACK_SIZE
	pop 	edx
	ret

denormal1:
    fld     tbyte ptr [FPREM_DENOM+esp]   ; load the denominator
    fld     tbyte ptr [FPREM_NUMER+esp]   ; load the numerator
    mov     eax, [FPREM_DENOM+esp]        ; test for whole mantissa == 0
    or      eax, [FPREM_DENOM+4+esp]      ; test for whole mantissa == 0
    jz      remainder1_hardware_ok_l ; denominator is zero
	fxch
	fstp	tbyte ptr[esp + FPREM_DENOM_SAVE]	; save org denominator
	fld 	tbyte ptr[esp + FPREM_DENOM]
	fxch
	or  	edx, 02h
;
; For this we need pc=80.  Also, mask exceptions so we don't take any
; denormal operand exceptions.  It is guaranteed that the descaling
; later on will take underflow, which is what the hardware would have done
; on a normal fprem.
;
    fnstcw  [FPREM_PREV_CW+esp]         ; save caller's control word
    mov     eax, [FPREM_PREV_CW+esp] 
    or      eax, 0033fh             	; mask exceptions, pc=80
    mov     [FPREM_PATCH_CW+esp], eax
    fldcw   [FPREM_PATCH_CW+esp]        ; mask exceptions & pc=80

; The denominator is a denormal.  For most numerators, scale both numerator
; and denominator to get rid of denormals.  Then execute the common code
; with the flag set to indicate that the result must be de-scaled.
; For large numerators this won't work because the scaling would cause
; overflow.  In this case we know the numerator is large, the denominator
; is small (denormal), so the exponent difference is also large.  This means
; the rem1_large code will be used and this code depends on the difference
; in exponents modulo 64.  Adding 64 to the denominators exponent
; doesn't change the modulo 64 difference.  So we can scale the denominator
; by 64, making it not denormal, and this won't effect the result.
;
; To start with, figure out if numerator is large

	mov 	eax, [esp + FPREM_NUMER + 8]	; load numerator exponent
	and 	eax, 7fffh		; isolate numerator exponent
	cmp 	eax, 7fbeh		; compare Nexp to Maxexp-64
	ja  	big_numer_rem1_de	; jif big numerator

; So the numerator is not large scale both numerator and denominator

	or  	edx, 1			; edx = 1, if denormal extended divisor
	fmul	qword ptr one_shl_64	; make numerator not denormal
	fstp	tbyte ptr[esp + FPREM_NUMER]
	fmul	qword ptr one_shl_64	; make denominator not denormal
	fstp	tbyte ptr[esp + FPREM_DENOM]
	jmp 	scaling_done1

; The numerator is large.  Scale only the denominator, which will not
; change the result which we know will be partial.  Set the scale flag
; to false.
big_numer_rem1_de:
; We must do this with pc=80 to avoid rounding to single/double.
; In this case we do not mask exceptions so that we will take
; denormal operand, as would the hardware.
	fnstcw  [FPREM_PREV_CW+esp]			; save caller's control word
	mov     eax, [FPREM_PREV_CW+esp] 
	or      eax, 00300h             	; pc=80
	mov     [FPREM_PATCH_CW+esp], eax
	fldcw   [FPREM_PATCH_CW+esp]		;  pc=80
	fstp	st							; Toss numerator
	fmul	qword ptr one_shl_64		; make denominator not denormal
	fstp	tbyte ptr[esp + FPREM_DENOM]
	
; Restore the control word which was fiddled to scale at 80-bit precision.
; Then call the common code.
scaling_done1:
	fldcw	[esp + FPREM_PREV_CW] 	; restore callers control word
	call	_fprem1_common
	add 	esp, FPREM_STACK_SIZE
	pop 	edx
	ret
	
remainder1_hardware_ok_l:
    fprem              		; and finally do a remainder 
    add     esp, FPREM_STACK_SIZE
	pop	edx
    ret
_adj_fprem1	ENDP

	PUBLIC	_safe_fprem
_safe_fprem	PROC	NEAR

    call _adj_fprem
    ret

_safe_fprem	ENDP

	PUBLIC	_safe_fprem1
_safe_fprem1	PROC	NEAR

    call _adj_fprem1
    ret

_safe_fprem1	ENDP



;;; _adj_fpatan - FPATAN FIX
;;
;;	Dummy entry point


	PUBLIC	_adj_fpatan
_adj_fpatan	PROC	NEAR

	fpatan
	ret

_adj_fpatan	ENDP


;;; _adj_fptan - FPTAN FIX
;;
;;	Dummy entry point


	PUBLIC	_adj_fptan
_adj_fptan	PROC	NEAR

	fptan
	ret

_adj_fptan	ENDP


	end