1 files changed, 421 insertions, 0 deletions
diff --git a/libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S b/libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S
new file mode 100644
index 00000000000..e54d31d87d1
--- /dev/null
+++ b/libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S
@@ -0,0 +1,421 @@
+/* Copyright (C) 2008-2013 Free Software Foundation, Inc.  
+   Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
+		on behalf of Synopsys Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/*
+   to calculate a := b/x as b*y, with y := 1/x:
+   - x is in the range [1..2)
+   - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
+     Precision is higher for polynoms used to evaluate input with larger
+     value.
+   - Do one newton-raphson iteration step to double the precision,
+     then multiply this with the divisor
+	-> more time to decide if dividend is subnormal
+     - the worst error propagation is on the side of the value range
+       with the least initial defect, thus giving us about 30 bits precision.
+      The truncation error for the either is less than 1 + x/2 ulp.
+      A 31 bit inverse can be simply calculated by using x with implicit 1
+      and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
+      with the bare fraction part of x, then add in y0^2 for the implicit
+      1 of x.
+    - If calculating a 31 bit inverse, the systematic error is less than
+      -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
+    - If we calculate our seed with a 32 bit fraction, we can archive a
+      tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
+      only need to take the step to calculate the 2nd stage rest and
+      rounding adjust 1/32th of the time.  However, if we use a 20 bit
+      fraction for the seed, the negative error can exceed -2 ulp/128, (2)
+      thus for a simple add / tst check, we need to do the 2nd stage
+      rest calculation/ rounding adjust 1/16th of the time.
+      (1): The inexactness of the 32 bit inverse contributes an error in the
+      range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
+      rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
+      x/2 + 1/x <= 1.5 .
+      (2): Unless proven otherwise.  I have not actually looked for an
+      example where -2 ulp/128 is exceeded, and my calculations indicate
+      that the excess, if existent, is less than -1/512 ulp.
+    ??? The algorithm is still based on the ARC700 optimized code.
+    Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
+    results.
+ */
+#include "../arc-ieee-754.h"
+#define mlo acc2
+#define mhi acc1
+#define mul64(b,c) mullw 0,b,c` machlw 0,b,c
+#define mulu64(b,c) mululw 0,b,c` machulw 0,b,c
+
+/* N.B. fp-bit.c does double rounding on denormal numbers.  */
+#if 0 /* DEBUG */
+	.global __divdf3
+	FUNC(__divdf3)
+	.balign 4
+__divdf3:
+	push_s blink
+	push_s r2
+	push_s r3
+	push_s r0
+	bl.d __divdf3_c
+	push_s r1
+	ld_s r2,[sp,12]
+	ld_s r3,[sp,8]
+	st_s r0,[sp,12]
+	st_s r1,[sp,8]
+	pop_s r1
+	bl.d __divdf3_asm
+	pop_s r0
+	pop_s r3
+	pop_s r2
+	pop_s blink
+	cmp r0,r2
+	cmp.eq r1,r3
+	jeq_s [blink]
+	and r12,DBL0H,DBL1H
+	bic.f 0,0x7ff80000,r12 ; both NaN -> OK
+	jeq_s [blink]
+	bl abort
+	ENDFUNC(__divdf3)
+#define __divdf3 __divdf3_asm
+#endif /* DEBUG */
+
+	FUNC(__divdf3)
+	.balign 4
+.L7ff00000:
+	.long 0x7ff00000
+.Ldivtab:
+	.long 0xfc0fffe1
+	.long 0xf46ffdfb
+	.long 0xed1ffa54
+	.long 0xe61ff515
+	.long 0xdf7fee75
+	.long 0xd91fe680
+	.long 0xd2ffdd52
+	.long 0xcd1fd30c
+	.long 0xc77fc7cd
+	.long 0xc21fbbb6
+	.long 0xbcefaec0
+	.long 0xb7efa100
+	.long 0xb32f92bf
+	.long 0xae8f83b7
+	.long 0xaa2f7467
+	.long 0xa5ef6479
+	.long 0xa1cf53fa
+	.long 0x9ddf433e
+	.long 0x9a0f3216
+	.long 0x965f2091
+	.long 0x92df0f11
+	.long 0x8f6efd05
+	.long 0x8c1eeacc
+	.long 0x88eed876
+	.long 0x85dec615
+	.long 0x82eeb3b9
+	.long 0x800ea10b
+	.long 0x7d3e8e0f
+	.long 0x7a8e7b3f
+	.long 0x77ee6836
+	.long 0x756e5576
+	.long 0x72fe4293
+	.long 0x709e2f93
+	.long 0x6e4e1c7f
+	.long 0x6c0e095e
+	.long 0x69edf6c5
+	.long 0x67cde3a5
+	.long 0x65cdd125
+	.long 0x63cdbe25
+	.long 0x61ddab3f
+	.long 0x600d991f
+	.long 0x5e3d868c
+	.long 0x5c6d7384
+	.long 0x5abd615f
+	.long 0x590d4ecd
+	.long 0x576d3c83
+	.long 0x55dd2a89
+	.long 0x545d18e9
+	.long 0x52dd06e9
+	.long 0x516cf54e
+	.long 0x4ffce356
+	.long 0x4e9cd1ce
+	.long 0x4d3cbfec
+	.long 0x4becae86
+	.long 0x4aac9da4
+	.long 0x496c8c73
+	.long 0x483c7bd3
+	.long 0x470c6ae8
+	.long 0x45dc59af
+	.long 0x44bc4915
+	.long 0x43ac3924
+	.long 0x428c27fb
+	.long 0x418c187a
+	.long 0x407c07bd
+
+__divdf3_support: /* This label makes debugger output saner.  */
+	.balign 4
+.Ldenorm_dbl1:
+	brge r6, \
+		0x43500000,.Linf_NaN ; large number / denorm -> Inf
+	bmsk.f r12,DBL1H,19
+	mov.eq r12,DBL1L
+	mov.eq DBL1L,0
+	sub.eq r7,r7,32
+	norm.f r11,r12 ; flag for x/0 -> Inf check
+	beq_s .Linf_NaN
+	mov.mi r11,0
+	add.pl r11,r11,1
+	add_s r12,r12,r12
+	asl r8,r12,r11
+	rsub r12,r11,31
+	lsr r12,DBL1L,r12
+	tst_s DBL1H,DBL1H
+	or r8,r8,r12
+	lsr r4,r8,26
+	lsr DBL1H,r8,12
+	ld.as r4,[r10,r4]
+	bxor.mi DBL1H,DBL1H,31
+	sub r11,r11,11
+	asl DBL1L,DBL1L,r11
+	sub r11,r11,1
+	mulu64 (r4,r8)
+	sub r7,r7,r11
+	b.d .Lpast_denorm_dbl1
+	asl r7,r7,20
+
+.Linf_NaN:
+	tst_s DBL0L,DBL0L ; 0/0 -> NaN
+	xor_s DBL1H,DBL1H,DBL0H
+	bclr.eq.f DBL0H,DBL0H,31
+	bmsk DBL0H,DBL1H,30
+	xor_s DBL0H,DBL0H,DBL1H
+	sub.eq DBL0H,DBL0H,1
+	mov_s DBL0L,0
+	j_s.d [blink]
+	or DBL0H,DBL0H,r9
+	.balign 4
+.Lret0_2:
+	xor_s DBL1H,DBL1H,DBL0H
+	mov_s DBL0L,0
+	bmsk DBL0H,DBL1H,30
+	j_s.d [blink]
+	xor_s DBL0H,DBL0H,DBL1H
+	.balign 4
+	.global __divdf3
+/* N.B. the spacing between divtab and the sub3 to get its address must
+   be a multiple of 8.  */
+__divdf3:
+	asl r8,DBL1H,12
+	lsr r4,r8,26
+	sub3 r10,pcl,51;(.-.Ldivtab) >> 3
+	ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
+	ld.as r4,[r10,r4]
+	lsr r12,DBL1L,20
+	and.f r7,DBL1H,r9
+	or r8,r8,r12
+	mulu64 (r4,r8)
+	beq.d .Ldenorm_dbl1
+.Lpast_denorm_dbl1:
+	and.f r6,DBL0H,r9
+	breq.d r7,r9,.Linf_nan_dbl1
+	asl r4,r4,12
+	sub r4,r4,mhi
+	mululw 0,r4,r4
+	machulw r5,r4,r4
+	bne.d .Lnormal_dbl0
+	lsr r8,r8,1
+
+	.balign 4
+.Ldenorm_dbl0:
+	bmsk.f r12,DBL0H,19
+	; wb stall
+	mov.eq r12,DBL0L
+	sub.eq r6,r6,32
+	norm.f r11,r12 ; flag for 0/x -> 0 check
+	brge r7, \
+		0x43500000, .Lret0_2 ; denorm/large number -> 0
+	beq_s .Lret0_2
+	mov.mi r11,0
+	add.pl r11,r11,1
+	asl r12,r12,r11
+	sub r6,r6,r11
+	add.f 0,r6,31
+	lsr r10,DBL0L,r6
+	mov.mi r10,0
+	add r6,r6,11+32
+	neg.f r11,r6
+	asl DBL0L,DBL0L,r11
+	mov.pl DBL0L,0
+	sub r6,r6,32-1
+	b.d .Lpast_denorm_dbl0
+	asl r6,r6,20
+
+	.balign 4
+.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
+	or.f 0,r6,DBL0L
+	cmp.ne r6,r9
+	not_s DBL0L,DBL1H
+	sub_s.ne DBL0L,DBL0L,DBL0L
+	tst_s DBL0H,DBL0H
+	add_s DBL0H,DBL1H,DBL0L
+	j_s.d [blink]
+	bxor.mi DBL0H,DBL0H,31
+
+	.balign 4
+.Lnormal_dbl0:
+	breq.d r6,r9,.Linf_nan_dbl0
+	asl r12,DBL0H,11
+	lsr r10,DBL0L,21
+.Lpast_denorm_dbl0:
+	bset r8,r8,31
+	mulu64 (r5,r8)
+	add_s r12,r12,r10
+	bset r5,r12,31
+	cmp r5,r8
+	cmp.eq DBL0L,DBL1L
+	lsr.cc r5,r5,1
+	sub r4,r4,mhi ; u1.31 inverse, about 30 bit
+	mululw 0,r5,r4
+	machulw r11,r5,r4 ; result fraction highpart
+	lsr r8,r8,2 ; u3.29
+	add r5,r6, /* wait for immediate */ \
+		0x3fe00000
+	mulu64 (r11,r8) ; u-28.31
+	asl_s DBL1L,DBL1L,9 ; u-29.23:9
+	sbc r6,r5,r7
+	mov r12,mlo ; u-28.31
+	mulu64 (r11,DBL1L) ; mhi: u-28.23:9
+	add.cs DBL0L,DBL0L,DBL0L
+	asl_s DBL0L,DBL0L,6 ; u-26.25:7
+	asl r10,r11,23
+	sub_l DBL0L,DBL0L,r12
+	lsr r7,r11,9
+	sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
+	mul64 (r5,r4) ; mhi: result fraction lowpart
+	xor.f 0,DBL0H,DBL1H
+	and DBL0H,r6,r9
+	add_s DBL0H,DBL0H,r7
+	bclr r12,r9,20 ; 0x7fe00000
+	brhs.d r6,r12,.Linf_denorm
+	bxor.mi DBL0H,DBL0H,31
+	add.f r12,mhi,0x11
+	asr r9,r12,5
+	sub.mi DBL0H,DBL0H,1
+	add.f DBL0L,r9,r10
+	tst r12,0x1c
+	jne.d [blink]
+	add.cs DBL0H,DBL0H,1
+        /* work out exact rounding if we fall through here.  */
+        /* We know that the exact result cannot be represented in double
+           precision.  Find the mid-point between the two nearest
+           representable values, multiply with the divisor, and check if
+           the result is larger than the dividend.  Since we want to know
+	   only the sign bit, it is sufficient to calculate only the
+	   highpart of the lower 64 bits.  */
+	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
+	sub.f DBL0L,DBL0L,1
+	asl r12,r9,2 ; u-22.30:2
+	sub.cs DBL0H,DBL0H,1
+	sub.f r12,r12,2
+	mov r10,mlo ; rest before considering r12 in r5 : -r10
+	mululw 0,r12,DBL1L
+	machulw r7,r12,DBL1L ; mhi: u-51.32
+	asl r5,r5,25 ; s-51.7:25
+	lsr r10,r10,7 ; u-51.30:2
+	mulu64 (r12,r8) ; mlo: u-51.31:1
+	sub r5,r5,r10
+	add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
+	bset r7,r7,0 ; make sure that the result is not zero, and that
+	sub r5,r5,r7 ; a highpart zero appears negative
+	sub.f r5,r5,mlo ; rest msw
+	add.pl.f DBL0L,DBL0L,1
+	j_s.d [blink]
+	add.eq DBL0H,DBL0H,1
+
+.Linf_nan_dbl0:
+	tst_s DBL1H,DBL1H
+	j_s.d [blink]
+	bxor.mi DBL0H,DBL0H,31
+	.balign 4
+.Linf_denorm:
+	lsr r12,r6,28
+	brlo.d r12,0xc,.Linf
+.Ldenorm:
+	asr r6,r6,20
+	neg r9,r6
+	mov_s DBL0H,0
+	brhs.d r9,54,.Lret0
+	bxor.mi DBL0H,DBL0H,31
+	add r12,mhi,1
+	and r12,r12,-4
+	rsub r7,r6,5
+	asr r10,r12,28
+	bmsk r4,r12,27
+	min r7,r7,31
+	asr DBL0L,r4,r7
+	add DBL1H,r11,r10
+	abs.f r10,r4
+	sub.mi r10,r10,1
+	add.f r7,r6,32-5
+	asl r4,r4,r7
+	mov.mi r4,r10
+	add.f r10,r6,23
+	rsub r7,r6,9
+	lsr r7,DBL1H,r7
+	asl r10,DBL1H,r10
+	or.pnz DBL0H,DBL0H,r7
+	or.mi r4,r4,r10
+	mov.mi r10,r7
+	add.f DBL0L,r10,DBL0L
+	add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
+	bxor.f 0,r4,31
+	add.pnz.f DBL0L,DBL0L,1
+	add.cs.f DBL0H,DBL0H,1
+	jne_s [blink]
+	/* Calculation so far was not conclusive; calculate further rest.  */
+	mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
+	asr.f r12,r12,3
+	asl r5,r5,25 ; s-51.7:25
+	mov r11,mlo ; rest before considering r12 in r5 : -r11
+	mulu64 (r12,r8) ; u-51.31:1
+	and r9,DBL0L,1 ; tie-breaker: round to even
+	lsr r11,r11,7 ; u-51.30:2
+	mov DBL1H,mlo ; u-51.31:1
+	mulu64 (r12,DBL1L) ; u-51.62:2
+	sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
+	add_s DBL1H,DBL1H,r11
+	sub DBL1H,DBL1H,r5 ; -rest msw
+	add_s DBL1H,DBL1H,mhi ; -rest msw
+	add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
+	tst_s DBL1H,DBL1H
+	cmp.eq mlo,r9
+	add.cs.f DBL0L,DBL0L,1
+	j_s.d [blink]
+	add.cs DBL0H,DBL0H,1
+
+.Lret0:
+	/* return +- 0 */
+	j_s.d [blink]
+	mov_s DBL0L,0
+.Linf:
+	mov_s DBL0H,r9
+	mov_s DBL0L,0
+	j_s.d [blink]
+	bxor.mi DBL0H,DBL0H,31
+	ENDFUNC(__divdf3)