libc/i386fp/fdiv.x


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312

#define EF_SIZE		4

! bcc 386 floating point routines (version 2) -- Fdiv, Fdivd, Fdivf
! authors: Timothy Murphy (tim@maths.tcd.ie), Bruce Evans

#include "fplib.h"

#define FRAME_SIZE	(3 * GENREG_SIZE + PC_SIZE)

	.extern	Fpushf
	.extern	fpdivzero
	.extern	fpoverflow
	.extern	fpunderflow

! double Fdiv(double x, double y) returns x / y

! pop 2 doubles from stack, divide first by second, and push quotient on stack

! we denote upper and lower dwords of x and y (or their fractions)
! by (xu,xl), (yu,yl)

	.globl	Fdivf
	.align	ALIGNMENT
Fdivf:
	sub	esp,#D_SIZE	! make space for dummy double on stack
	push	ebp
	push	edi		! save some regs
	push	esi
	mov	eax,FRAME_SIZE-PC_SIZE+D_SIZE[esp]	! move return address ...
	mov	FRAME_SIZE-PC_SIZE[esp],eax	! ... to usual spot
	call	Fpushf
	pop	esi		! yl
	pop	edi		! yu
	mov	eax,FRAME_SIZE+D_SIZE+D_LOW[esp]	! xl
	mov	edx,FRAME_SIZE+D_SIZE+D_HIGH[esp]	! xu
	jmp	division

	.globl	Fdiv
	.align	ALIGNMENT
Fdiv:
	push	ebp
	push	edi		! save some regs
	push	esi
	mov	eax,FRAME_SIZE+D_LOW[esp]	! xl
	mov	edx,FRAME_SIZE+D_HIGH[esp]	! xu
	mov	esi,FRAME_SIZE+D_SIZE+D_LOW[esp]	! yl
	mov	edi,FRAME_SIZE+D_SIZE+D_HIGH[esp]	! yu
	jmp	division

	.align	ALIGNMENT
exp_y_0:
	mov	ebx,edi
	or	ebx,esi
	beq	zerodivide
	mov	ebx,#1
fix_y:
	test	edi,edi		! XXX - sloow
	js	y_unpacked
	shld	edi,esi,#1
	shl	esi,#1
	dec	bx
	jmp	fix_y

	.align	ALIGNMENT
exp_x_0:
	mov	ecx,edx
	or	ecx,eax
	beq	retz
	mov	ecx,#1	! change exponent from 0 to 1
fix_x:
	test	edx,#1 << (REG_BIT-1-2)		! XXX - sloow
	jnz	x_unpacked
	shld	edx,eax,#1
	shl	eax,#1
	dec	cx
	jmp	fix_x

! Fdivd pops double from stack, divides it by double at [ebx],
! and pushes quotient back on stack

	.globl	Fdivd
	.align	ALIGNMENT
Fdivd:
	sub	esp,#D_SIZE	! make space for dummy double on stack
	push	ebp
	push	edi		! save some regs
	push	esi
	mov	eax,FRAME_SIZE-PC_SIZE+D_SIZE[esp]	! move return address ...
	mov	FRAME_SIZE-PC_SIZE[esp],eax	! ... to usual spot
	mov	eax,FRAME_SIZE+D_SIZE+D_LOW[esp]	! xl
	mov	edx,FRAME_SIZE+D_SIZE+D_HIGH[esp]	! xu
	mov	esi,D_LOW[ebx]	! yl
	mov	edi,D_HIGH[ebx]	! yu

division:

! The full calculations are

!	(xu,xl,0) = yu * (zu,zl) + (0,r,0)  (normal 96/32 -> 64 bit division)
!	yl * zu = yu * q1 + r1  (32*32 -> 64 bit mul and 64/32 -> 32 bit div)

! so

!	(xu,xl,0,0) = (yu,yl) * (zu,zl-q1) + (0,0,r-r1,yl*(q1-zl))

! where the calculations zl-q1, r-r1 and yl*(q1-zl) are more complicated
! than the notation suggests. They may be negative and the one with the
! multiplication may not fit in 32 bits and in both cases the overflow
! has to be moved into higher bit positions.

! See Knuth for why (zu,zl-q1) is the correct 64-bit quotient to within
! 1 bit either way (assuming the normalization x < 2 * y).

! We only need to calculate the remainder (0,0,r-r1,yl*(q1-zl)) to resolve
! tie cases. It tells whether the approximate quotient is too high or too
! low.

#define NTEMPS	5

	sub	esp,#NTEMPS*GENREG_SIZE	! space to remember values for rounding of tie case

! Offsets from esp for these values (offsets using FRAME_SIZE are invalid
! while these temps are active)
r	=	0
q1	=	4
r1	=	8
yl	=	12
zl	=	16

! Step 1: unpack and normalize x to fraction in edx:eax (left shifted as
! far as possible less 2 so that x < y, and later z < y); unpack and normalize
! y to a fraction in edi:esi (left shifted as far as possible), put difference
! of signs (= sign of quotient) in ecx(D_SIGN_MASK) and difference of exponents 
! (= exponent of quotient before normalization) in cx.

	mov	ebp,edx		! xu
	xor	ebp,edi		! xu ^ yu
	and	ebp,#D_SIGN_MASK	! sign of result is difference of signs

! Unpack y first to trap 0 / 0

	mov	ebx,edi		! remember yu for exponent of y
	shld	edi,esi,#D_BIT-D_FRAC_BIT	! extract fraction of y ...
	shl	esi,#D_BIT-D_FRAC_BIT
	and	ebx,#D_EXP_MASK	! exponent of y
	jz	exp_y_0
	shr	ebx,#D_EXP_SHIFT	! in ebx (actually in bx, with high bits 0)
	or	edi,#D_NORM_MASK << (D_BIT-D_FRAC_BIT)	! normalize
y_unpacked:

! Unpack x

	mov	ecx,edx		! remember xu for exponent of x
	shld	edx,eax,#D_BIT-D_FRAC_BIT-2	! extract fraction of x ...
	shl	eax,#D_BIT-D_FRAC_BIT-2
	and	edx,#(D_NORM_MASK << (D_BIT-D_FRAC_BIT-2+1))-1
				! XXX - above may be shifted 1 extra unnecessarily
	and	ecx,#D_EXP_MASK	! exponent of x
	jz	exp_x_0
	shr	ecx,#D_EXP_SHIFT	! in ecx (actually in cx, with high bits 0)
	or	edx,#D_NORM_MASK << (D_BIT-D_FRAC_BIT-2) ! normalize
x_unpacked:

	sub	cx,bx		! not ecx,ebx because we want to use high bit for sign
	add	cx,#D_EXP_BIAS	! adjust exponent of quotient

	or	ecx,ebp		! include sign with exponent

! Step 2: quotient of fractions -> (edx,eax)

! 2a: (xu,xl,0) div yu = (zu,zl) -> (ebx,esi)

	div	eax,edi		! (xu,xl) div yu = zu in eax; remainder (rem) in edx
	mov	ebx,eax		! save zu in ebx
	sub	eax,eax		! clear eax: (edx,eax) = (rem,0)
	div	eax,edi		! (rem,0) div yu = zl in eax
	mov	r[esp],edx
	mov	zl[esp],eax
	xchg	eax,esi		! store zl in esi; save yl in eax
	mov	yl[esp],eax

! 2b: (yl * zu) div yu -> (0,eax)

	mul	eax,ebx		! yl * zu -> (edx,eax)
	div	eax,edi		! (yl * zu) div yu in eax
	mov	q1[esp],eax
	mov	r1[esp],edx

! 2c: (xu,xl) / (yu,yl) = (zu,zl) - (yl * zu) div yu -> (edx,eax)

	mov	edx,ebx		! zu
	xchg	eax,esi		! eax <- zl; esi <- (yl * zu) div yu
	sub	eax,esi
	sbb	edx,#0

! Step 3: normalise quotient

	test	edx,#1 << (REG_BIT-2)	! is fraction too small? (can only be by 1 bit)
	jnz	div4
	shld	edx,eax,#1	! yes; multiply fraction ...
	shl	eax,#1		! ... by 2 ...
	dec	cx		! ... and decrement exponent

! Step 4: shift and round

div4:
	mov	ebx,eax		! save for rounding
	shrd	eax,edx,#D_BIT-D_FRAC_BIT-1	! shift fraction of result ...
	shr	edx,#D_BIT-D_FRAC_BIT-1		! ... to proper position
	and	ebx,#(1 << (D_BIT-D_FRAC_BIT-1))-1	! look at bits shifted out
	cmp	ebx,#D_NORM_MASK >> (D_BIT-D_FRAC_BIT)	! compare with middle value
	jb	div5		! below middle, don't round up
	ja	roundup		! above middle, round up

! The low bits don't contain enough information to resolve the tie case,
! because the quotient itself is only an approximation.
! Calculate the exact remainder.
! This case is not very common, so don't worry much about speed.
! Unfortunately we had to save extra in all cases to prepare for it.

	push	edx
	push	eax

	sub	esi,esi		! the calculation requires 33 bits - carry to here
	mov	eax,2*GENREG_SIZE+q1[esp]
	sub	eax,2*GENREG_SIZE+zl[esp]
	pushfd
	mul	dword EF_SIZE+2*GENREG_SIZE+yl[esp]
	popfd
	jnc	foo
	sub	edx,2*GENREG_SIZE+yl[esp]
	sbb	esi,#0
foo:
	add	edx,2*GENREG_SIZE+r[esp]
	adc	esi,#0
	sub	edx,2*GENREG_SIZE+r1[esp]
	sbb	esi,#0
	mov	ebx,eax
	mov	edi,edx

	pop	eax
	pop	edx

! Can finally decide rounding of tie case

	js	div5		! remainder < 0 from looking at top 64 bits
	jnz	roundup		! remainder > 0 from looking at top 64 bits
	or	edi,ebx		! test bottom 64 bits
	jnz	roundup		! remainder > 0

	test	al,#1		! at last we know it is the tie case, check parity bit
	jz	div5		! already even, otherwise round up to make even

roundup:
	add	eax,#1		! add rounding bit
	adc	edx,#0
	test	edx,#D_NORM_MASK << 1	! has fraction overflowed (very unlikely)
	jz	div5
! Why were the shifts commented out?
	shrd	eax,edx,#1	! yes, divide fraction ...
	shr	edx,#1		! ... by 2 ...
	inc	cx		! ... and increment exponent

! Step 5: put it all together

div5:
	mov	ebx,ecx		! extract sign
	and	ebx,D_SIGN_MASK
	cmp	cx,#D_EXP_INFINITE	! is exponent too big?
	jge	overflow
	test	cx,cx
	jle	underflow
	shl	ecx,#D_EXP_SHIFT

	and	edx,#D_FRAC_MASK	! remove norm bit
	or	edx,ecx		! include exponent ...
	or	edx,ebx		! ... and sign

return:
	add	esp,#NTEMPS*GENREG_SIZE	! reclaim temp space
	mov	FRAME_SIZE+D_SIZE+D_LOW[esp],eax	! "push" lower dword of product ...
	mov	FRAME_SIZE+D_SIZE+D_HIGH[esp],edx	! ... and upper dword
	pop	esi		! restore registers
	pop	edi
	pop	ebp
	ret	#D_SIZE

retz:
	sub	edx,edx		! clear upper dword
	sub	eax,eax		! ... and lower dword
	jmp	return

overflow:
	mov	edx,ecx		! put sign in usual reg
	call	fpoverflow
	mov	eax,ecx		! XXX - wrong reg
	jmp	return

underflow:
	mov	esi,edx		! put upper part of fraction in usual reg
	mov	edx,ecx		! sign
	movsx	edi,cx		! put shift in usual reg
	neg	edi
	inc	edi
	call	fpunderflow
	jmp	return

zerodivide:
	mov	edx,ebp		! sign
	call	fpdivzero
	mov	eax,ecx		! XXX - wrong reg
	jmp	return