summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorolegendo <olegendo@138bc75d-0d04-0410-961f-82ee72b054a4>2013-06-09 21:32:37 +0000
committerolegendo <olegendo@138bc75d-0d04-0410-961f-82ee72b054a4>2013-06-09 21:32:37 +0000
commitfe9c9e23e0abf057ead745de1412855c5addcdef (patch)
tree5cfdce74e6c97d2a997232dd0f3a63aab12de94e
parent1bb66e410b45fefedd01ed3c63956e1d4c5fdf48 (diff)
downloadgcc-fe9c9e23e0abf057ead745de1412855c5addcdef.tar.gz
PR target/6526
* config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation. PR target/6526 * gcc.target/sh/pr6526.c: New. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@199873 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/testsuite/ChangeLog5
-rw-r--r--gcc/testsuite/gcc.target/sh/pr6526.c64
-rw-r--r--libgcc/ChangeLog6
-rw-r--r--libgcc/config/sh/lib1funcs.S225
4 files changed, 230 insertions, 70 deletions
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 0ba3af62182..cae5502642f 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
+
+ PR target/6526
+ * gcc.target/sh/pr6526.c: New.
+
2013-06-09 Jakub Jelinek <jakub@redhat.com>
PR target/57568
diff --git a/gcc/testsuite/gcc.target/sh/pr6526.c b/gcc/testsuite/gcc.target/sh/pr6526.c
new file mode 100644
index 00000000000..a7dd6d87af6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/sh/pr6526.c
@@ -0,0 +1,64 @@
+/* Check that the XF registers are not clobbered by an integer division
+ that is done using double precision FPU division. */
+/* { dg-do run { target "sh*-*-*" } } */
+/* { dg-options "-O1 -mdiv=call-fp" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "*" } { "-m4*-single" "-m4*-single-only" } } */
+
+#include <assert.h>
+#include <stdlib.h>
+
+extern void __set_fpscr (int);
+
+void
+write_xf0 (float* f)
+{
+ __asm__ __volatile__ ("frchg; fmov.s @%0,fr0; frchg" : : "r" (f) : "memory");
+}
+
+void
+read_xf0 (float* f)
+{
+ __asm__ __volatile__ ("frchg; fmov.s fr0,@%0; frchg" : : "r" (f) : "memory");
+}
+
+int __attribute__ ((noinline))
+test_00 (int a, int b)
+{
+ return a / b;
+}
+
+unsigned int __attribute__ ((noinline))
+test_01 (unsigned a, unsigned b)
+{
+ return a / b;
+}
+
+int __attribute__ ((noinline))
+test_02 (int x)
+{
+ return x & 0;
+}
+
+int
+main (void)
+{
+ float test_value;
+ int r = 0;
+
+ /* Set FPSCR.FR to 1. */
+ __set_fpscr (0x200000);
+
+ test_value = 123;
+ write_xf0 (&test_value);
+ r += test_00 (40, 4);
+ read_xf0 (&test_value);
+ assert (test_value == 123);
+
+ test_value = 321;
+ write_xf0 (&test_value);
+ r += test_01 (50, 5);
+ read_xf0 (&test_value);
+ assert (test_value == 321);
+
+ return test_02 (r);
+}
diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog
index 085432be4b4..832b4256e69 100644
--- a/libgcc/ChangeLog
+++ b/libgcc/ChangeLog
@@ -1,3 +1,9 @@
+2013-06-09 Oleg Endo <olegendo@gcc.gnu.org>
+
+ PR target/6526
+ * config/sh/lib1funcs.S (sdivsi3_i4, udivsi3_i4): Do not change bits
+ other than FPSCR.PR and FPSCR.SZ. Add SH4A implementation.
+
2013-06-08 Walter Lee <walt@tilera.com>
* config/tilepro/atomic.h: Don't include stdint.h or features.h.
diff --git a/libgcc/config/sh/lib1funcs.S b/libgcc/config/sh/lib1funcs.S
index 5f0bbff264f..51addf360cf 100644
--- a/libgcc/config/sh/lib1funcs.S
+++ b/libgcc/config/sh/lib1funcs.S
@@ -1003,11 +1003,17 @@ hiset: sts macl,r0 ! r0 = bb*dd
ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
+
+/*------------------------------------------------------------------------------
+ 32 bit signed integer division that uses FPU double precision division. */
+
#ifdef L_sdivsi3_i4
.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
+
#if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber dr0, dr2
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul, clobber dr0, dr2. */
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
@@ -1021,8 +1027,13 @@ GLOBAL(sdivsi3_i4):
ftrc dr0,fpul
ENDFUNC(GLOBAL(sdivsi3_i4))
+
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
-!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
+/* This variant is used when FPSCR.PR = 0 (single precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul, clobber r2, dr0, dr2.
+ For this to work, we must temporarily switch the FPU to double precision,
+ but we had better not touch FPSCR.FR. See PR 6526. */
#if ! __SH5__ || __SH5__ == 32
#if __SH5__
@@ -1031,24 +1042,43 @@ GLOBAL(sdivsi3_i4):
.global GLOBAL(sdivsi3_i4)
HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
- sts.l fpscr,@-r15
- mov #8,r2
- swap.w r2,r2
- lds r2,fpscr
- lds r4,fpul
- float fpul,dr0
- lds r5,fpul
- float fpul,dr2
- fdiv dr2,dr0
- ftrc dr0,fpul
+
+#ifndef __SH4A__
+ mov.l r3,@-r15
+ sts fpscr,r2
+ mov #8,r3
+ swap.w r3,r3 // r3 = 1 << 19 (FPSCR.PR bit)
+ or r2,r3
+ lds r3,fpscr // Set FPSCR.PR = 1.
+ lds r4,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ lds r2,fpscr
rts
- lds.l @r15+,fpscr
+ mov.l @r15+,r3
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit. */
+ fpchg
+ lds r4,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ rts
+ fpchg
+
+#endif /* __SH4A__ */
ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ || __SH2A__ */
-#endif
+#endif /* L_sdivsi3_i4 */
+//------------------------------------------------------------------------------
#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
sh2e/sh3e code. */
@@ -1367,54 +1397,60 @@ div0: rts
mov #0,r0
ENDFUNC(GLOBAL(sdivsi3))
-#endif /* ! __SHMEDIA__ */
-#endif
-#ifdef L_udivsi3_i4
+#endif /* ! __SHMEDIA__ */
+#endif /* L_sdivsi3 */
+
+/*------------------------------------------------------------------------------
+ 32 bit unsigned integer division that uses FPU double precision division. */
+#ifdef L_udivsi3_i4
.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Renesas SH
+
#if defined (__SH4__) || defined (__SH2A__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
-!! and t bit
+/* This variant is used when FPSCR.PR = 1 (double precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul,
+ clobber r0, r1, r4, r5, dr0, dr2, dr4, and t bit */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
- mov #1,r1
- cmp/hi r1,r5
- bf trivial
- rotr r1
- xor r1,r4
- lds r4,fpul
- mova L1,r0
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1
+ xor r1,r4
+ lds r4,fpul
+ mova L1,r0
#ifdef FMOVD_WORKS
- fmov.d @r0+,dr4
+ fmov.d @r0+,dr4
#else
- fmov.s @r0+,DR40
- fmov.s @r0,DR41
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
- float fpul,dr0
- xor r1,r5
- lds r5,fpul
- float fpul,dr2
- fadd dr4,dr0
- fadd dr4,dr2
- fdiv dr2,dr0
+ float fpul,dr0
+ xor r1,r5
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
rts
- ftrc dr0,fpul
+ ftrc dr0,fpul
trivial:
rts
- lds r4,fpul
+ lds r4,fpul
.align 2
#ifdef FMOVD_WORKS
- .align 3 ! make double below 8 byte aligned.
+ .align 3 // Make the double below 8 byte aligned.
#endif
L1:
.double 2147483648
ENDFUNC(GLOBAL(udivsi3_i4))
+
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) && ! defined (__SH2A_NOFPU__)
#if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
@@ -1436,57 +1472,106 @@ GLOBAL(udivsi3_i4):
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
+
#elif defined (__SH2A_SINGLE__) || defined (__SH2A_SINGLE_ONLY__) || defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+/* This variant is used when FPSCR.PR = 0 (single precision) is the default
+ setting.
+ Args in r4 and r5, result in fpul,
+ clobber r0, r1, r4, r5, dr0, dr2, dr4.
+ For this to work, we must temporarily switch the FPU to double precision,
+ but we had better not touch FPSCR.FR. See PR 6526. */
.global GLOBAL(udivsi3_i4)
HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
- mov #1,r1
- cmp/hi r1,r5
- bf trivial
- sts.l fpscr,@-r15
- mova L1,r0
- lds.l @r0+,fpscr
- rotr r1
- xor r1,r4
- lds r4,fpul
+
+#ifndef __SH4A__
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1 // r1 = 1 << 31
+ sts.l fpscr,@-r15
+ xor r1,r4
+ mov.l @(0,r15),r0
+ xor r1,r5
+ mov.l L2,r1
+ lds r4,fpul
+ or r0,r1
+ mova L1,r0
+ lds r1,fpscr
#ifdef FMOVD_WORKS
- fmov.d @r0+,dr4
+ fmov.d @r0+,dr4
#else
- fmov.s @r0+,DR40
- fmov.s @r0,DR41
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
- float fpul,dr0
- xor r1,r5
- lds r5,fpul
- float fpul,dr2
- fadd dr4,dr0
- fadd dr4,dr2
- fdiv dr2,dr0
- ftrc dr0,fpul
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
rts
- lds.l @r15+,fpscr
+ lds.l @r15+,fpscr
#ifdef FMOVD_WORKS
- .align 3 ! make double below 8 byte aligned.
+ .align 3 // Make the double below 8 byte aligned.
#endif
trivial:
rts
- lds r4,fpul
+ lds r4,fpul
.align 2
-L1:
-#ifndef FMOVD_WORKS
- .long 0x80000
+L2:
+#ifdef FMOVD_WORKS
+ .long 0x180000 // FPSCR.PR = 1, FPSCR.SZ = 1
#else
- .long 0x180000
+ .long 0x80000 // FPSCR.PR = 1
#endif
+L1:
+ .double 2147483648
+
+#else
+/* On SH4A we can use the fpchg instruction to flip the FPSCR.PR bit.
+ Although fmovd usually works on SH4A, using it would require either two
+ additional fschg instructions or an FPSCR push + pop. It's not worth the
+ effort for loading only one double constant. */
+ mov #1,r1
+ cmp/hi r1,r5
+ bf/s trivial
+ rotr r1 // r1 = 1 << 31
+ fpchg
+ mova L1,r0
+ xor r1,r4
+ fmov.s @r0+,DR40
+ lds r4,fpul
+ fmov.s @r0,DR41
+ xor r1,r5
+ float fpul,dr0
+ lds r5,fpul
+ float fpul,dr2
+ fadd dr4,dr0
+ fadd dr4,dr2
+ fdiv dr2,dr0
+ ftrc dr0,fpul
+ rts
+ fpchg
+
+trivial:
+ rts
+ lds r4,fpul
+
+ .align 2
+L1:
.double 2147483648
+#endif /* __SH4A__ */
+
+
ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
-#endif
+#endif /* L_udivsi3_i4 */
#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with