Diffstat (limited to 'gcc/config')
-rw-r--r-- | gcc/config/sh/crt1.asm            |  134
-rw-r--r-- | gcc/config/sh/divcost-analysis    |    7
-rw-r--r-- | gcc/config/sh/divtab-sh4-300.c    |   81
-rw-r--r-- | gcc/config/sh/embed-elf.h         |    1
-rw-r--r-- | gcc/config/sh/lib1funcs-4-300.asm |  938
-rw-r--r-- | gcc/config/sh/sh-protos.h         |    6
-rw-r--r-- | gcc/config/sh/sh.c                |  447
-rw-r--r-- | gcc/config/sh/sh.h                |   20
-rw-r--r-- | gcc/config/sh/sh.md               |  212
-rw-r--r-- | gcc/config/sh/sh.opt              |   57
-rw-r--r-- | gcc/config/sh/sh1.md              |    6
-rw-r--r-- | gcc/config/sh/sh4-300.md          |  288
-rw-r--r-- | gcc/config/sh/sh4.md              |   28
-rw-r--r-- | gcc/config/sh/sh4a.md             |   14
-rw-r--r-- | gcc/config/sh/superh.h            |   10
-rw-r--r-- | gcc/config/sh/t-sh                |   17
16 files changed, 2037 insertions, 229 deletions
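Much of the sh.c growth below comes from the new cbranchdi4 expander, which splits a DImode conditional branch into at most three SImode branches (msw_taken, msw_skip, lsw_taken), each jumping straight to its final destination so that only one of them is ever taken. As a rough model only - plain C for illustration, not the RTL the expander actually emits - a signed 64-bit less-than decomposes like this on a 32-bit target:

    #include <stdint.h>

    /* Illustrative sketch of the branch structure behind
       expand_cbranchdi4: a signed DImode "a < b" needs a signed
       compare on the high words and, on a high-word tie, an
       unsigned compare on the low words.  */
    static int
    di_less_than (int64_t a, int64_t b)
    {
      int32_t ah = (int32_t) (a >> 32), bh = (int32_t) (b >> 32);
      uint32_t al = (uint32_t) a, bl = (uint32_t) b;

      if (ah < bh)      /* msw_taken: branch straight to the target.  */
        return 1;
      if (ah > bh)      /* msw_skip: the swapped condition skips past
                           the low-word test.  */
        return 0;
      return al < bl;   /* lsw_taken: unsigned low-word compare (LTU).  */
    }

In the expander itself the skip branch goes to a local label emitted after the low-word test, and split_branch_probability is apportioned across the three branches as discussed in the ??? comment in expand_cbranchdi4 below.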
diff --git a/gcc/config/sh/crt1.asm b/gcc/config/sh/crt1.asm index c110fa07427..7aa684434d7 100644 --- a/gcc/config/sh/crt1.asm +++ b/gcc/config/sh/crt1.asm @@ -1,4 +1,5 @@ -/* Copyright (C) 2000, 2001, 2003, 2004, 2005 Free Software Foundation, Inc. +/* Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. This file was pretty much copied from newlib. This file is part of GCC. @@ -894,25 +895,12 @@ ___main: nop #ifdef VBR_SETUP ! Exception handlers - .balign 256 + .section .text.vbr, "ax" vbr_start: - mov.l 2f, r0 ! load the old vbr setting (if any) - mov.l @r0, r0 - cmp/eq #0, r0 - bf 1f - ! no previous vbr - jump to own generic handler - bra handler - nop -1: ! there was a previous handler - chain them - jmp @r0 - nop - .balign 4 -2: - .long old_vbr - .balign 256 + .org 0x100 vbr_100: - #ifdef PROFILE +#ifdef PROFILE ! Note on register usage. ! we use r0..r3 as scratch in this code. If we are here due to a trapa for profiling ! then this is OK as we are just before executing any function code. @@ -1017,50 +1005,7 @@ handler_100: 2: .long old_vbr - .balign 256 -vbr_200: - mov.l 2f, r0 ! load the old vbr setting (if any) - mov.l @r0, r0 - cmp/eq #0, r0 - bf 1f - ! no previous vbr - jump to own generic handler - bra handler - nop -1: ! there was a previous handler - chain them - add #0x7f, r0 ! 0x7f - add #0x7f, r0 ! 0xfe - add #0x7f, r0 ! 0x17d - add #0x7f, r0 ! 0x1fc - add #0x4, r0 ! add 0x200 without corrupting another register - jmp @r0 - nop - .balign 4 -2: - .long old_vbr - - .balign 256 -vbr_300: - mov.l 2f, r0 ! load the old vbr setting (if any) - mov.l @r0, r0 - cmp/eq #0, r0 - bf 1f - ! no previous vbr - jump to own generic handler - bra handler - nop -1: ! there was a previous handler - chain them - rotcr r0 - rotcr r0 - add #0x7f, r0 ! 0x1fc - add #0x41, r0 ! 0x300 - rotcl r0 - rotcl r0 ! Add 0x300 without corrupting another register - jmp @r0 - nop - .balign 4 -2: - .long old_vbr - - .balign 256 + .org 0x400 vbr_400: ! Should be at vbr+0x400 mov.l 2f, r0 ! load the old vbr setting (if any) mov.l @r0, r0 @@ -1103,28 +1048,7 @@ handler: jmp @r2 nop - .balign 256 -vbr_500: - mov.l 2f, r0 ! load the old vbr setting (if any) - mov.l @r0, r0 - cmp/eq #0, r0 - ! no previous vbr - jump to own generic handler - bt handler - ! there was a previous handler - chain them - rotcr r0 - rotcr r0 - add #0x7f, r0 ! 0x1fc - add #0x7f, r0 ! 0x3f8 - add #0x42, r0 ! 0x500 - rotcl r0 - rotcl r0 ! Add 0x500 without corrupting another register - jmp @r0 - nop - .balign 4 -2: - .long old_vbr - - .balign 256 + .org 0x600 vbr_600: #ifdef PROFILE ! Should be at vbr+0x600 @@ -1140,11 +1064,48 @@ vbr_600: mov.l r6,@-r15 mov.l r7,@-r15 sts.l pr,@-r15 + sts.l mach,@-r15 + sts.l macl,@-r15 +#if defined(__SH_FPU_ANY__) + ! Save fpul and fpscr, save fr0-fr7 in 64 bit mode + ! and set the pervading precision for the timer_handler + mov #0,r0 + sts.l fpul,@-r15 + sts.l fpscr,@-r15 + lds r0,fpscr ! Clear fpscr + fmov fr0,@-r15 + fmov fr1,@-r15 + fmov fr2,@-r15 + fmov fr3,@-r15 + mov.l pervading_precision_k,r0 + fmov fr4,@-r15 + fmov fr5,@-r15 + mov.l @r0,r0 + fmov fr6,@-r15 + fmov fr7,@-r15 + lds r0,fpscr +#endif /* __SH_FPU_ANY__ */ ! Pass interrupted pc to timer_handler as first parameter (r4). stc spc, r4 mov.l timer_handler_k, r0 jsr @r0 nop +#if defined(__SH_FPU_ANY__) + mov #0,r0 + lds r0,fpscr ! 
Clear the fpscr + fmov @r15+,fr7 + fmov @r15+,fr6 + fmov @r15+,fr5 + fmov @r15+,fr4 + fmov @r15+,fr3 + fmov @r15+,fr2 + fmov @r15+,fr1 + fmov @r15+,fr0 + lds.l @r15+,fpscr + lds.l @r15+,fpul +#endif /* __SH_FPU_ANY__ */ + lds.l @r15+,macl + lds.l @r15+,mach lds.l @r15+,pr mov.l @r15+,r7 mov.l @r15+,r6 @@ -1157,6 +1118,13 @@ vbr_600: stc sgr, r15 ! Restore r15, destroyed by this sequence. rte nop +#if defined(__SH_FPU_ANY__) + .balign 4 +pervading_precision_k: +#define CONCAT1(A,B) A##B +#define CONCAT(A,B) CONCAT1(A,B) + .long CONCAT(__USER_LABEL_PREFIX__,__fpscr_values)+4 +#endif #else mov.l 2f, r0 ! Load the old vbr setting (if any). mov.l @r0, r0 diff --git a/gcc/config/sh/divcost-analysis b/gcc/config/sh/divcost-analysis index 541e31324b3..0296269bb52 100644 --- a/gcc/config/sh/divcost-analysis +++ b/gcc/config/sh/divcost-analysis @@ -38,12 +38,17 @@ div_r8_neg -> div_r8_neg_end: 18 div_le128_neg -> div_by_1_neg: 4 div_le128_neg -> rts 18 - absolute divisor range: + sh4-200 absolute divisor range: 1 [2..128] [129..64K) [64K..|divident|/256] >=64K,>|divident/256| udiv 18 22 38 32 30 sdiv pos: 20 24 41 35 32 sdiv neg: 15 25 42 36 33 + sh4-300 absolute divisor range: + 8 bit 16 bit 24 bit > 24 bit +udiv 15 35 28 25 +sdiv 14 36 34 31 + fp-based: diff --git a/gcc/config/sh/divtab-sh4-300.c b/gcc/config/sh/divtab-sh4-300.c new file mode 100644 index 00000000000..448b0b8af8e --- /dev/null +++ b/gcc/config/sh/divtab-sh4-300.c @@ -0,0 +1,81 @@ +/* Copyright (C) 2004, 2006 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file into combinations with other programs, +and to distribute those combinations without any restriction coming +from the use of this file. (The General Public License restrictions +do apply in other respects; for example, they cover modification of +the file, and distribution when not linked into a combine +executable.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 51 Franklin Street, Fifth Floor, +Boston, MA 02110-1301, USA. */ + +/* Calculate division table for ST40-300 integer division + Contributed by Joern Rennecke + joern.rennecke@st.com */ + +#include <stdio.h> +#include <math.h> + +int +main () +{ + int i, j; + double q, r, err, max_err = 0, max_s_err = 0; + + puts("/* This table has been generated by divtab-sh4.c. */"); + puts ("\t.balign 4"); + for (i = -128; i < 128; i++) + { + int n = 0; + if (i == 0) + { + /* output some dummy number for 1/0. */ + puts ("LOCAL(div_table_clz):\n\t.byte\t0"); + continue; + } + for (j = i < 0 ? -i : i; j < 128; j += j) + n++; + printf ("\t.byte\t%d\n", n - 7); + } + puts("\ +/* 1/-128 .. 1/127, normalized. There is an implicit leading 1 in bit 32,\n\ + or in bit 33 for powers of two. 
*/\n\ + .balign 4"); + for (i = -128; i < 128; i++) + { + if (i == 0) + { + puts ("LOCAL(div_table_inv):\n\t.long\t0x0"); + continue; + } + j = i < 0 ? -i : i; + while (j < 64) + j += j; + q = 4.*(1<<30)*128/j; + r = ceil (q); + printf ("\t.long\t0x%X\n", (unsigned) r); + err = r - q; + if (err > max_err) + max_err = err; + err = err * j / 128; + if (err > max_s_err) + max_s_err = err; + } + printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err); + exit (0); +} diff --git a/gcc/config/sh/embed-elf.h b/gcc/config/sh/embed-elf.h index 4497cf34636..0d817cacf85 100644 --- a/gcc/config/sh/embed-elf.h +++ b/gcc/config/sh/embed-elf.h @@ -32,6 +32,7 @@ Boston, MA 02110-1301, USA. */ #define LIBGCC_SPEC "%{!shared: \ %{m4-100*:-lic_invalidate_array_4-100} \ %{m4-200*:-lic_invalidate_array_4-200} \ + %{m4-300*|-m4-340:-lic_invalidate_array_4a %{!Os: -lgcc-4-300}} \ %{m4a*:-lic_invalidate_array_4a}} \ %{Os: -lgcc-Os-4-200} \ -lgcc \ diff --git a/gcc/config/sh/lib1funcs-4-300.asm b/gcc/config/sh/lib1funcs-4-300.asm new file mode 100644 index 00000000000..b07912425af --- /dev/null +++ b/gcc/config/sh/lib1funcs-4-300.asm @@ -0,0 +1,938 @@ +/* Copyright (C) 2004, 2006 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file into combinations with other programs, +and to distribute those combinations without any restriction coming +from the use of this file. (The General Public License restrictions +do apply in other respects; for example, they cover modification of +the file, and distribution when not linked into a combine +executable.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 51 Franklin Street, Fifth Floor, +Boston, MA 02110-1301, USA. */ + +/* libgcc routines for the STMicroelectronics ST40-300 CPU. + Contributed by J"orn Rennecke joern.rennecke@st.com. */ + +#include "lib1funcs.h" + +#ifdef L_div_table +#if defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__) +/* This code used shld, thus is not suitable for SH1 / SH2. */ + +/* Signed / unsigned division without use of FPU, optimized for SH4-300. + Uses a lookup table for divisors in the range -128 .. +127, and + div1 with case distinction for larger divisors in three more ranges. + The code is lumped together with the table to allow the use of mova. */ +#ifdef __LITTLE_ENDIAN__ +#define L_LSB 0 +#define L_LSWMSB 1 +#define L_MSWLSB 2 +#else +#define L_LSB 3 +#define L_LSWMSB 2 +#define L_MSWLSB 1 +#endif + + .global GLOBAL(udivsi3_i4i) + .global GLOBAL(sdivsi3_i4i) + FUNC(GLOBAL(udivsi3_i4i)) + FUNC(GLOBAL(sdivsi3_i4i)) + + .balign 4 +LOCAL(div_ge8m): ! 10 cycles up to here + rotcr r1 ! 
signed shift must use original sign from r4 + div0s r5,r4 + mov #24,r7 + shld r7,r6 + shad r0,r1 + rotcl r6 + div1 r5,r1 + swap.w r5,r0 ! detect -0x80000000 : 0x800000 + rotcl r6 + swap.w r4,r7 + div1 r5,r1 + swap.b r7,r7 + rotcl r6 + or r7,r0 + div1 r5,r1 + swap.w r0,r7 + rotcl r6 + or r7,r0 + div1 r5,r1 + add #-0x80,r0 + rotcl r6 + extu.w r0,r0 + div1 r5,r1 + neg r0,r0 + rotcl r6 + swap.w r0,r0 + div1 r5,r1 + mov.l @r15+,r7 + and r6,r0 + rotcl r6 + div1 r5,r1 + shll2 r0 + rotcl r6 + exts.b r0,r0 + div1 r5,r1 + swap.w r0,r0 + exts.w r0,r1 + exts.b r6,r0 + mov.l @r15+,r6 + rotcl r0 + rts + sub r1,r0 + ! 31 cycles up to here + + .balign 4 +LOCAL(udiv_ge64k): ! 3 cycles up to here + mov r4,r0 + shlr8 r0 + div0u + cmp/hi r0,r5 + bt LOCAL(udiv_r8) + mov.l r5,@-r15 + shll8 r5 + ! 7 cycles up to here + .rept 8 + div1 r5,r0 + .endr + extu.b r4,r1 ! 15 cycles up to here + extu.b r0,r6 + xor r1,r0 + xor r6,r0 + swap.b r6,r6 + .rept 8 + div1 r5,r0 + .endr ! 25 cycles up to here + extu.b r0,r0 + mov.l @r15+,r5 + or r6,r0 + mov.l @r15+,r6 + rts + rotcl r0 ! 28 cycles up to here + + .balign 4 +LOCAL(udiv_r8): ! 6 cycles up to here + mov.l r4,@-r15 + shll16 r4 + shll8 r4 + ! + shll r4 + mov r0,r1 + div1 r5,r1 + mov r4,r0 + rotcl r0 + mov.l @r15+,r4 + div1 r5,r1 + ! 12 cycles up to here + .rept 6 + rotcl r0; div1 r5,r1 + .endr + mov.l @r15+,r6 ! 24 cycles up to here + rts + rotcl r0 + + .balign 4 +LOCAL(div_ge32k): ! 6 cycles up to here + mov.l r7,@-r15 + swap.w r5,r6 + exts.b r6,r7 + exts.w r6,r6 + cmp/eq r6,r7 + extu.b r1,r6 + bf/s LOCAL(div_ge8m) + cmp/hi r1,r4 ! copy sign bit of r4 into T + rotcr r1 ! signed shift must use original sign from r4 + div0s r5,r4 + shad r0,r1 + shll8 r5 + div1 r5,r1 + mov r5,r7 ! detect r4 == 0x80000000 && r5 == 0x8000(00) + div1 r5,r1 + shlr8 r7 + div1 r5,r1 + swap.w r4,r0 + div1 r5,r1 + swap.b r0,r0 + div1 r5,r1 + or r0,r7 + div1 r5,r1 + add #-80,r7 + div1 r5,r1 + swap.w r7,r0 + div1 r5,r1 + or r0,r7 + extu.b r1,r0 + xor r6,r1 + xor r0,r1 + exts.b r0,r0 + div1 r5,r1 + extu.w r7,r7 + div1 r5,r1 + neg r7,r7 ! upper 16 bit of r7 == 0 if r4 == 0x80000000 && r5 == 0x8000 + div1 r5,r1 + and r0,r7 + div1 r5,r1 + swap.w r7,r7 ! 26 cycles up to here. + div1 r5,r1 + shll8 r0 + div1 r5,r1 + exts.w r7,r7 + div1 r5,r1 + add r0,r0 + div1 r5,r1 + sub r7,r0 + extu.b r1,r1 + mov.l @r15+,r7 + rotcl r1 + mov.l @r15+,r6 + add r1,r0 + mov #-8,r1 + rts + shad r1,r5 ! 34 cycles up to here + + .balign 4 +GLOBAL(udivsi3_i4i): + mov.l r6,@-r15 + extu.w r5,r6 + cmp/eq r5,r6 + mov #0x7f,r0 + bf LOCAL(udiv_ge64k) + cmp/hi r0,r5 + bf LOCAL(udiv_le128) + mov r4,r1 + shlr8 r1 + div0u + shlr r1 + shll16 r6 + div1 r6,r1 + extu.b r4,r0 ! 7 cycles up to here + .rept 8 + div1 r6,r1 + .endr ! 15 cycles up to here + xor r1,r0 ! xor dividend with result lsb + .rept 6 + div1 r6,r1 + .endr + mov.l r7,@-r15 ! 21 cycles up to here + div1 r6,r1 + extu.b r0,r7 + div1 r6,r1 + shll8 r7 + extu.w r1,r0 + xor r7,r1 ! replace lsb of result with lsb of dividend + div1 r6,r1 + mov #0,r7 + div1 r6,r1 + ! + div1 r6,r1 + bra LOCAL(div_end) + div1 r6,r1 ! 28 cycles up to here + + /* This is link-compatible with a GLOBAL(sdivsi3) call, + but we effectively clobber only r1, macl and mach */ + /* Because negative quotients are calculated as one's complements, + -0x80000000 divided by the smallest positive number of a number + range (0x80, 0x8000, 0x800000) causes saturation in the one's + complement representation, and we have to suppress the + one's -> two's complement adjustment. 
Since positive numbers + don't get such an adjustment, it's OK to also compute one's -> two's + complement adjustment suppression for a dividend of 0. */ + .balign 4 +GLOBAL(sdivsi3_i4i): + mov.l r6,@-r15 + exts.b r5,r6 + cmp/eq r5,r6 + mov #-1,r1 + bt/s LOCAL(div_le128) + cmp/pz r4 + addc r4,r1 + exts.w r5,r6 + cmp/eq r5,r6 + mov #-7,r0 + bf/s LOCAL(div_ge32k) + cmp/hi r1,r4 ! copy sign bit of r4 into T + rotcr r1 + shll16 r6 ! 7 cycles up to here + shad r0,r1 + div0s r5,r4 + div1 r6,r1 + mov.l r7,@-r15 + div1 r6,r1 + mov r4,r0 ! re-compute adjusted dividend + div1 r6,r1 + mov #-31,r7 + div1 r6,r1 + shad r7,r0 + div1 r6,r1 + add r4,r0 ! adjusted dividend + div1 r6,r1 + mov.l r8,@-r15 + div1 r6,r1 + swap.w r4,r8 ! detect special case r4 = 0x80000000, r5 = 0x80 + div1 r6,r1 + swap.b r8,r8 + xor r1,r0 ! xor dividend with result lsb + div1 r6,r1 + div1 r6,r1 + or r5,r8 + div1 r6,r1 + add #-0x80,r8 ! r8 is 0 iff there is a match + div1 r6,r1 + swap.w r8,r7 ! or upper 16 bits... + div1 r6,r1 + or r7,r8 !...into lower 16 bits + div1 r6,r1 + extu.w r8,r8 + div1 r6,r1 + extu.b r0,r7 + div1 r6,r1 + shll8 r7 + exts.w r1,r0 + xor r7,r1 ! replace lsb of result with lsb of dividend + div1 r6,r1 + neg r8,r8 ! upper 16 bits of r8 are now 0xffff iff we want end adjm. + div1 r6,r1 + and r0,r8 + div1 r6,r1 + swap.w r8,r7 + div1 r6,r1 + mov.l @r15+,r8 ! 58 insns, 29 cycles up to here +LOCAL(div_end): + div1 r6,r1 + shll8 r0 + div1 r6,r1 + exts.w r7,r7 + div1 r6,r1 + add r0,r0 + div1 r6,r1 + sub r7,r0 + extu.b r1,r1 + mov.l @r15+,r7 + rotcl r1 + mov.l @r15+,r6 + rts + add r1,r0 + + .balign 4 +LOCAL(udiv_le128): ! 4 cycles up to here (or 7 for mispredict) + mova LOCAL(div_table_inv),r0 + shll2 r6 + mov.l @(r0,r6),r1 + mova LOCAL(div_table_clz),r0 + lds r4,mach + ! + ! + ! + tst r1,r1 + ! + bt 0f + dmulu.l r1,r4 +0: mov.b @(r0,r5),r1 + clrt + ! + ! + sts mach,r0 + addc r4,r0 + rotcr r0 + mov.l @r15+,r6 + rts + shld r1,r0 + + .balign 4 +LOCAL(div_le128): ! 3 cycles up to here (or 6 for mispredict) + mova LOCAL(div_table_inv),r0 + shll2 r6 + mov.l @(r0,r6),r1 + mova LOCAL(div_table_clz),r0 + neg r4,r6 + bf 0f + mov r4,r6 +0: lds r6,mach + tst r1,r1 + bt 0f + dmulu.l r1,r6 +0: div0s r4,r5 + mov.b @(r0,r5),r1 + bt/s LOCAL(le128_neg) + clrt + ! + sts mach,r0 + addc r6,r0 + rotcr r0 + mov.l @r15+,r6 + rts + shld r1,r0 + +/* Could trap divide by zero for the cost of one cycle more mispredict penalty: +... + dmulu.l r1,r6 +0: div0s r4,r5 + bt/s LOCAL(le128_neg) + tst r5,r5 + bt LOCAL(div_by_zero) + mov.b @(r0,r5),r1 + sts mach,r0 + addc r6,r0 +... +LOCAL(div_by_zero): + trapa # + .balign 4 +LOCAL(le128_neg): + bt LOCAL(div_by_zero) + mov.b @(r0,r5),r1 + sts mach,r0 + addc r6,r0 +... */ + + .balign 4 +LOCAL(le128_neg): + sts mach,r0 + addc r6,r0 + rotcr r0 + mov.l @r15+,r6 + shad r1,r0 + rts + neg r0,r0 + ENDFUNC(GLOBAL(udivsi3_i4i)) + ENDFUNC(GLOBAL(sdivsi3_i4i)) + +/* This table has been generated by divtab-sh4.c. 
*/ + .balign 4 + .byte -7 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -1 + .byte -1 + .byte 0 +LOCAL(div_table_clz): + .byte 0 + .byte 0 + .byte -1 + .byte -1 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 +/* 1/-128 .. 1/127, normalized. There is an implicit leading 1 in bit 32, + or in bit 33 for powers of two. 
*/ + .balign 4 + .long 0x0 + .long 0x2040811 + .long 0x4104105 + .long 0x624DD30 + .long 0x8421085 + .long 0xA6810A7 + .long 0xC9714FC + .long 0xECF56BF + .long 0x11111112 + .long 0x135C8114 + .long 0x15B1E5F8 + .long 0x18118119 + .long 0x1A7B9612 + .long 0x1CF06ADB + .long 0x1F7047DD + .long 0x21FB7813 + .long 0x24924925 + .long 0x27350B89 + .long 0x29E4129F + .long 0x2C9FB4D9 + .long 0x2F684BDB + .long 0x323E34A3 + .long 0x3521CFB3 + .long 0x38138139 + .long 0x3B13B13C + .long 0x3E22CBCF + .long 0x41414142 + .long 0x446F8657 + .long 0x47AE147B + .long 0x4AFD6A06 + .long 0x4E5E0A73 + .long 0x51D07EAF + .long 0x55555556 + .long 0x58ED2309 + .long 0x5C9882BA + .long 0x60581606 + .long 0x642C8591 + .long 0x68168169 + .long 0x6C16C16D + .long 0x702E05C1 + .long 0x745D1746 + .long 0x78A4C818 + .long 0x7D05F418 + .long 0x81818182 + .long 0x86186187 + .long 0x8ACB90F7 + .long 0x8F9C18FA + .long 0x948B0FCE + .long 0x9999999A + .long 0x9EC8E952 + .long 0xA41A41A5 + .long 0xA98EF607 + .long 0xAF286BCB + .long 0xB4E81B4F + .long 0xBACF914D + .long 0xC0E07039 + .long 0xC71C71C8 + .long 0xCD856891 + .long 0xD41D41D5 + .long 0xDAE6076C + .long 0xE1E1E1E2 + .long 0xE9131AC0 + .long 0xF07C1F08 + .long 0xF81F81F9 + .long 0x0 + .long 0x4104105 + .long 0x8421085 + .long 0xC9714FC + .long 0x11111112 + .long 0x15B1E5F8 + .long 0x1A7B9612 + .long 0x1F7047DD + .long 0x24924925 + .long 0x29E4129F + .long 0x2F684BDB + .long 0x3521CFB3 + .long 0x3B13B13C + .long 0x41414142 + .long 0x47AE147B + .long 0x4E5E0A73 + .long 0x55555556 + .long 0x5C9882BA + .long 0x642C8591 + .long 0x6C16C16D + .long 0x745D1746 + .long 0x7D05F418 + .long 0x86186187 + .long 0x8F9C18FA + .long 0x9999999A + .long 0xA41A41A5 + .long 0xAF286BCB + .long 0xBACF914D + .long 0xC71C71C8 + .long 0xD41D41D5 + .long 0xE1E1E1E2 + .long 0xF07C1F08 + .long 0x0 + .long 0x8421085 + .long 0x11111112 + .long 0x1A7B9612 + .long 0x24924925 + .long 0x2F684BDB + .long 0x3B13B13C + .long 0x47AE147B + .long 0x55555556 + .long 0x642C8591 + .long 0x745D1746 + .long 0x86186187 + .long 0x9999999A + .long 0xAF286BCB + .long 0xC71C71C8 + .long 0xE1E1E1E2 + .long 0x0 + .long 0x11111112 + .long 0x24924925 + .long 0x3B13B13C + .long 0x55555556 + .long 0x745D1746 + .long 0x9999999A + .long 0xC71C71C8 + .long 0x0 + .long 0x24924925 + .long 0x55555556 + .long 0x9999999A + .long 0x0 + .long 0x55555556 + .long 0x0 + .long 0x0 +LOCAL(div_table_inv): + .long 0x0 + .long 0x0 + .long 0x0 + .long 0x55555556 + .long 0x0 + .long 0x9999999A + .long 0x55555556 + .long 0x24924925 + .long 0x0 + .long 0xC71C71C8 + .long 0x9999999A + .long 0x745D1746 + .long 0x55555556 + .long 0x3B13B13C + .long 0x24924925 + .long 0x11111112 + .long 0x0 + .long 0xE1E1E1E2 + .long 0xC71C71C8 + .long 0xAF286BCB + .long 0x9999999A + .long 0x86186187 + .long 0x745D1746 + .long 0x642C8591 + .long 0x55555556 + .long 0x47AE147B + .long 0x3B13B13C + .long 0x2F684BDB + .long 0x24924925 + .long 0x1A7B9612 + .long 0x11111112 + .long 0x8421085 + .long 0x0 + .long 0xF07C1F08 + .long 0xE1E1E1E2 + .long 0xD41D41D5 + .long 0xC71C71C8 + .long 0xBACF914D + .long 0xAF286BCB + .long 0xA41A41A5 + .long 0x9999999A + .long 0x8F9C18FA + .long 0x86186187 + .long 0x7D05F418 + .long 0x745D1746 + .long 0x6C16C16D + .long 0x642C8591 + .long 0x5C9882BA + .long 0x55555556 + .long 0x4E5E0A73 + .long 0x47AE147B + .long 0x41414142 + .long 0x3B13B13C + .long 0x3521CFB3 + .long 0x2F684BDB + .long 0x29E4129F + .long 0x24924925 + .long 0x1F7047DD + .long 0x1A7B9612 + .long 0x15B1E5F8 + .long 0x11111112 + .long 0xC9714FC + .long 0x8421085 + 
.long 0x4104105 + .long 0x0 + .long 0xF81F81F9 + .long 0xF07C1F08 + .long 0xE9131AC0 + .long 0xE1E1E1E2 + .long 0xDAE6076C + .long 0xD41D41D5 + .long 0xCD856891 + .long 0xC71C71C8 + .long 0xC0E07039 + .long 0xBACF914D + .long 0xB4E81B4F + .long 0xAF286BCB + .long 0xA98EF607 + .long 0xA41A41A5 + .long 0x9EC8E952 + .long 0x9999999A + .long 0x948B0FCE + .long 0x8F9C18FA + .long 0x8ACB90F7 + .long 0x86186187 + .long 0x81818182 + .long 0x7D05F418 + .long 0x78A4C818 + .long 0x745D1746 + .long 0x702E05C1 + .long 0x6C16C16D + .long 0x68168169 + .long 0x642C8591 + .long 0x60581606 + .long 0x5C9882BA + .long 0x58ED2309 + .long 0x55555556 + .long 0x51D07EAF + .long 0x4E5E0A73 + .long 0x4AFD6A06 + .long 0x47AE147B + .long 0x446F8657 + .long 0x41414142 + .long 0x3E22CBCF + .long 0x3B13B13C + .long 0x38138139 + .long 0x3521CFB3 + .long 0x323E34A3 + .long 0x2F684BDB + .long 0x2C9FB4D9 + .long 0x29E4129F + .long 0x27350B89 + .long 0x24924925 + .long 0x21FB7813 + .long 0x1F7047DD + .long 0x1CF06ADB + .long 0x1A7B9612 + .long 0x18118119 + .long 0x15B1E5F8 + .long 0x135C8114 + .long 0x11111112 + .long 0xECF56BF + .long 0xC9714FC + .long 0xA6810A7 + .long 0x8421085 + .long 0x624DD30 + .long 0x4104105 + .long 0x2040811 + /* maximum error: 0.987342 scaled: 0.921875*/ + +#endif /* SH3 / SH4 */ + +#endif /* L_div_table */ diff --git a/gcc/config/sh/sh-protos.h b/gcc/config/sh/sh-protos.h index a0661545b56..e142b1cee68 100644 --- a/gcc/config/sh/sh-protos.h +++ b/gcc/config/sh/sh-protos.h @@ -1,6 +1,6 @@ /* Definitions of target machine for GNU compiler for Renesas / SuperH SH. Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2003, - 2004, 2005 + 2004, 2005, 2006 Free Software Foundation, Inc. Contributed by Steve Chamberlain (sac@cygnus.com). Improved by Jim Wilson (wilson@cygnus.com). 
@@ -69,6 +69,10 @@ extern void print_operand (FILE *, rtx, int); extern void output_pic_addr_const (FILE *, rtx); extern int expand_block_move (rtx *); extern int prepare_move_operands (rtx[], enum machine_mode mode); +extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode, + enum rtx_code comparison); +extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int); +extern bool expand_cbranchdi4 (rtx *operands, enum rtx_code comparison); extern void from_compare (rtx *, int); extern int shift_insns_rtx (rtx); extern void gen_ashift (int, int, rtx); diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c index 30b87480412..9f733b852f1 100644 --- a/gcc/config/sh/sh.c +++ b/gcc/config/sh/sh.c @@ -526,10 +526,15 @@ sh_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, case OPT_m4: case OPT_m4_100: case OPT_m4_200: + case OPT_m4_300: target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4; return true; case OPT_m4_nofpu: + case OPT_m4_100_nofpu: + case OPT_m4_200_nofpu: + case OPT_m4_300_nofpu: + case OPT_m4_340: case OPT_m4_400: case OPT_m4_500: target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_NOFPU; @@ -538,12 +543,14 @@ sh_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, case OPT_m4_single: case OPT_m4_100_single: case OPT_m4_200_single: + case OPT_m4_300_single: target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_SINGLE; return true; case OPT_m4_single_only: case OPT_m4_100_single_only: case OPT_m4_200_single_only: + case OPT_m4_300_single_only: target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_SINGLE_ONLY; return true; @@ -1341,6 +1348,288 @@ prepare_move_operands (rtx operands[], enum machine_mode mode) return 0; } +enum rtx_code +prepare_cbranch_operands (rtx *operands, enum machine_mode mode, + enum rtx_code comparison) +{ + rtx op1; + rtx scratch = NULL_RTX; + + if (comparison == CODE_FOR_nothing) + comparison = GET_CODE (operands[0]); + else + scratch = operands[4]; + if (GET_CODE (operands[1]) == CONST_INT + && GET_CODE (operands[2]) != CONST_INT) + { + rtx tmp = operands[1]; + + operands[1] = operands[2]; + operands[2] = tmp; + comparison = swap_condition (comparison); + } + if (GET_CODE (operands[2]) == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (operands[2]); + if ((val == -1 || val == -0x81) + && (comparison == GT || comparison == LE)) + { + comparison = (comparison == GT) ? GE : LT; + operands[2] = gen_int_mode (val + 1, mode); + } + else if ((val == 1 || val == 0x80) + && (comparison == GE || comparison == LT)) + { + comparison = (comparison == GE) ? GT : LE; + operands[2] = gen_int_mode (val - 1, mode); + } + else if (val == 1 && (comparison == GEU || comparison == LTU)) + { + comparison = (comparison == GEU) ? NE : EQ; + operands[2] = CONST0_RTX (mode); + } + else if (val == 0x80 && (comparison == GEU || comparison == LTU)) + { + comparison = (comparison == GEU) ? GTU : LEU; + operands[2] = gen_int_mode (val - 1, mode); + } + else if (val == 0 && (comparison == GTU || comparison == LEU)) + comparison = (comparison == GTU) ? NE : EQ; + else if (mode == SImode + && ((val == 0x7fffffff + && (comparison == GTU || comparison == LEU)) + || ((unsigned HOST_WIDE_INT) val + == (unsigned HOST_WIDE_INT) 0x7fffffff + 1 + && (comparison == GEU || comparison == LTU)))) + { + comparison = (comparison == GTU || comparison == GEU) ? 
LT : GE; + operands[2] = CONST0_RTX (mode); + } + } + op1 = operands[1]; + if (!no_new_pseudos) + operands[1] = force_reg (mode, op1); + /* When we are handling DImode comparisons, we want to keep constants so + that we can optimize the component comparisons; however, memory loads + are better issued as a whole so that they can be scheduled well. + SImode equality comparisons allow I08 constants, but only when they + compare r0. Hence, if operands[1] has to be loaded from somewhere else + into a register, that register might as well be r0, and we allow the + constant. If it is already in a register, this is likely to be + allocatated to a different hard register, thus we load the constant into + a register unless it is zero. */ + if (!REG_P (operands[2]) + && (GET_CODE (operands[2]) != CONST_INT + || (mode == SImode && operands[2] != CONST0_RTX (SImode) + && ((comparison != EQ && comparison != NE) + || (REG_P (op1) && REGNO (op1) != R0_REG) + || !CONST_OK_FOR_I08 (INTVAL (operands[2])))))) + { + if (scratch && GET_MODE (scratch) == mode) + { + emit_move_insn (scratch, operands[2]); + operands[2] = scratch; + } + else if (!no_new_pseudos) + operands[2] = force_reg (mode, operands[2]); + } + return comparison; +} + +void +expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int probability) +{ + rtx (*branch_expander) (rtx) = gen_branch_true; + rtx jump; + + comparison = prepare_cbranch_operands (operands, SImode, comparison); + switch (comparison) + { + case NE: case LT: case LE: case LTU: case LEU: + comparison = reverse_condition (comparison); + branch_expander = gen_branch_false; + default: ; + } + emit_insn (gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, T_REG), + gen_rtx_fmt_ee (comparison, SImode, + operands[1], operands[2]))); + jump = emit_jump_insn (branch_expander (operands[3])); + if (probability >= 0) + REG_NOTES (jump) + = gen_rtx_EXPR_LIST (REG_BR_PROB, GEN_INT (probability), + REG_NOTES (jump)); + +} + +/* ??? How should we distribute probabilities when more than one branch + is generated. So far we only have soem ad-hoc observations: + - If the operands are random, they are likely to differ in both parts. + - If comparing items in a hash chain, the operands are random or equal; + operation should be EQ or NE. + - If items are searched in an ordered tree from the root, we can expect + the highpart to be unequal about half of the time; operation should be + an unequality comparison, operands non-constant, and overall probability + about 50%. Likewise for quicksort. + - Range checks will be often made against constants. Even if we assume for + simplicity an even distribution of the non-constant operand over a + sub-range here, the same probability could be generated with differently + wide sub-ranges - as long as the ratio of the part of the subrange that + is before the threshold to the part that comes after the threshold stays + the same. Thus, we can't really tell anything here; + assuming random distribution is at least simple. 
+ */ + +bool +expand_cbranchdi4 (rtx *operands, enum rtx_code comparison) +{ + enum rtx_code msw_taken, msw_skip, lsw_taken; + rtx skip_label; + rtx op1h, op1l, op2h, op2l; + int num_branches; + int prob, rev_prob; + int msw_taken_prob = -1, msw_skip_prob = -1, lsw_taken_prob = -1; + + comparison = prepare_cbranch_operands (operands, DImode, comparison); + op1h = gen_highpart_mode (SImode, DImode, operands[1]); + op2h = gen_highpart_mode (SImode, DImode, operands[2]); + op1l = gen_lowpart (SImode, operands[1]); + op2l = gen_lowpart (SImode, operands[2]); + msw_taken = msw_skip = lsw_taken = CODE_FOR_nothing; + prob = split_branch_probability; + rev_prob = REG_BR_PROB_BASE - prob; + switch (comparison) + { + /* ??? Should we use the cmpeqdi_t pattern for equality comparisons? + That costs 1 cycle more when the first branch can be predicted taken, + but saves us mispredicts because only one branch needs prediction. + It also enables generating the cmpeqdi_t-1 pattern. */ + case EQ: + if (TARGET_CMPEQDI_T) + { + emit_insn (gen_cmpeqdi_t (operands[1], operands[2])); + emit_jump_insn (gen_branch_true (operands[3])); + return true; + } + msw_skip = NE; + lsw_taken = EQ; + if (prob >= 0) + { + /* If we had more precision, we'd use rev_prob - (rev_prob >> 32) . + */ + msw_skip_prob = rev_prob; + if (REG_BR_PROB_BASE <= 65535) + lsw_taken_prob = prob ? REG_BR_PROB_BASE : 0; + else + { + gcc_assert (HOST_BITS_PER_WIDEST_INT >= 64); + lsw_taken_prob + = (prob + ? (REG_BR_PROB_BASE + - ((HOST_WIDEST_INT) REG_BR_PROB_BASE * rev_prob + / ((HOST_WIDEST_INT) prob << 32))) + : 0); + } + } + break; + case NE: + if (TARGET_CMPEQDI_T) + { + emit_insn (gen_cmpeqdi_t (operands[1], operands[2])); + emit_jump_insn (gen_branch_false (operands[3])); + return true; + } + msw_taken = NE; + lsw_taken_prob = prob; + lsw_taken = NE; + lsw_taken_prob = 0; + break; + case GTU: case GT: + msw_taken = comparison; + if (GET_CODE (op2l) == CONST_INT && INTVAL (op2l) == -1) + break; + if (comparison != GTU || op2h != CONST0_RTX (SImode)) + msw_skip = swap_condition (msw_taken); + lsw_taken = GTU; + break; + case GEU: case GE: + if (op2l == CONST0_RTX (SImode)) + msw_taken = comparison; + else + { + msw_taken = comparison == GE ? GT : GTU; + msw_skip = swap_condition (msw_taken); + lsw_taken = GEU; + } + break; + case LTU: case LT: + msw_taken = comparison; + if (op2l == CONST0_RTX (SImode)) + break; + msw_skip = swap_condition (msw_taken); + lsw_taken = LTU; + break; + case LEU: case LE: + if (GET_CODE (op2l) == CONST_INT && INTVAL (op2l) == -1) + msw_taken = comparison; + else + { + lsw_taken = LEU; + if (comparison == LE) + msw_taken = LT; + else if (op2h != CONST0_RTX (SImode)) + msw_taken = LTU; + else + break; + msw_skip = swap_condition (msw_taken); + } + break; + default: return false; + } + num_branches = ((msw_taken != CODE_FOR_nothing) + + (msw_skip != CODE_FOR_nothing) + + (lsw_taken != CODE_FOR_nothing)); + if (comparison != EQ && comparison != NE && num_branches > 1) + { + if (!CONSTANT_P (operands[2]) + && prob >= (int) (REG_BR_PROB_BASE * 3 / 8U) + && prob <= (int) (REG_BR_PROB_BASE * 5 / 8U)) + { + msw_taken_prob = prob / 2U; + msw_skip_prob + = REG_BR_PROB_BASE * rev_prob / (REG_BR_PROB_BASE + rev_prob); + lsw_taken_prob = prob; + } + else + { + msw_taken_prob = prob; + msw_skip_prob = REG_BR_PROB_BASE; + /* ??? If we have a constant op2h, should we use that when + calculating lsw_taken_prob? 
*/ + lsw_taken_prob = prob; + } + } + operands[1] = op1h; + operands[2] = op2h; + operands[4] = NULL_RTX; + if (msw_taken != CODE_FOR_nothing) + expand_cbranchsi4 (operands, msw_taken, msw_taken_prob); + if (msw_skip != CODE_FOR_nothing) + { + rtx taken_label = operands[3]; + + operands[3] = skip_label = gen_label_rtx (); + expand_cbranchsi4 (operands, msw_skip, msw_skip_prob); + operands[3] = taken_label; + } + operands[1] = op1l; + operands[2] = op2l; + if (lsw_taken != CODE_FOR_nothing) + expand_cbranchsi4 (operands, lsw_taken, lsw_taken_prob); + if (msw_skip != CODE_FOR_nothing) + emit_label (skip_label); + return true; +} + /* Prepare the operands for an scc instruction; make sure that the compare has been done. */ rtx @@ -1723,6 +2012,12 @@ output_branch (int logic, rtx insn, rtx *operands) } } +/* Output a code sequence for INSN using TEMPLATE with OPERANDS; but before, + fill in operands 9 as a label to the successor insn. + We try to use jump threading where possible. + IF CODE matches the comparison in the IF_THEN_ELSE of a following jump, + we assume the jump is taken. I.e. EQ means follow jmp and bf, NE means + follow jmp and bt, if the address is in range. */ const char * output_branchy_insn (enum rtx_code code, const char *template, rtx insn, rtx *operands) @@ -2117,6 +2412,15 @@ sh_rtx_costs (rtx x, int code, int outer_code, int *total) else if ((outer_code == AND || outer_code == IOR || outer_code == XOR) && CONST_OK_FOR_K08 (INTVAL (x))) *total = 1; + /* prepare_cmp_insn will force costly constants int registers before + the cbrach[sd]i4 pattterns can see them, so preserve potentially + interesting ones not covered by I08 above. */ + else if (outer_code == COMPARE + && ((unsigned HOST_WIDE_INT) INTVAL (x) + == (unsigned HOST_WIDE_INT) 0x7fffffff + 1 + || INTVAL (x) == 0x7fffffff + || INTVAL (x) == 0x80 || INTVAL (x) == -0x81)) + *total = 1; else *total = 8; return true; @@ -2135,6 +2439,11 @@ sh_rtx_costs (rtx x, int code, int outer_code, int *total) case CONST_DOUBLE: if (TARGET_SHMEDIA) *total = COSTS_N_INSNS (4); + /* prepare_cmp_insn will force costly constants int registers before + the cbrachdi4 patttern can see them, so preserve potentially + interesting ones. */ + else if (outer_code == COMPARE && GET_MODE (x) == DImode) + *total = 1; else *total = 10; return true; @@ -8571,23 +8880,32 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost) } else if (REG_NOTE_KIND (link) == 0) { - enum attr_type dep_type, type; + enum attr_type type; + rtx dep_set; if (recog_memoized (insn) < 0 || recog_memoized (dep_insn) < 0) return cost; - dep_type = get_attr_type (dep_insn); - if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD) - cost--; - if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI) - && (type = get_attr_type (insn)) != TYPE_CALL - && type != TYPE_SFUNC) - cost--; + dep_set = single_set (dep_insn); + /* The latency that we specify in the scheduling description refers + to the actual output, not to an auto-increment register; for that, + the latency is one. */ + if (dep_set && MEM_P (SET_SRC (dep_set)) && cost > 1) + { + rtx set = single_set (insn); + + if (set + && !reg_mentioned_p (SET_DEST (dep_set), SET_SRC (set)) + && (!MEM_P (SET_DEST (set)) + || !reg_mentioned_p (SET_DEST (dep_set), + XEXP (SET_DEST (set), 0)))) + cost = 1; + } /* The only input for a call that is timing-critical is the function's address. 
*/ - if (GET_CODE(insn) == CALL_INSN) + if (GET_CODE (insn) == CALL_INSN) { rtx call = PATTERN (insn); @@ -8599,12 +8917,16 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost) /* sibcalli_thunk uses a symbol_ref in an unspec. */ && (GET_CODE (XEXP (XEXP (call, 0), 0)) == UNSPEC || ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn))) - cost = 0; + cost -= TARGET_SH4_300 ? 3 : 6; } /* Likewise, the most timing critical input for an sfuncs call is the function address. However, sfuncs typically start using their arguments pretty quickly. - Assume a four cycle delay before they are needed. */ + Assume a four cycle delay for SH4 before they are needed. + Cached ST40-300 calls are quicker, so assume only a one + cycle delay there. + ??? Maybe we should encode the delays till input registers + are needed by sfuncs into the sfunc call insn. */ /* All sfunc calls are parallels with at least four components. Exploit this to avoid unnecessary calls to sfunc_uses_reg. */ else if (GET_CODE (PATTERN (insn)) == PARALLEL @@ -8612,50 +8934,83 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost) && (reg = sfunc_uses_reg (insn))) { if (! reg_set_p (reg, dep_insn)) - cost -= 4; - } - /* When the preceding instruction loads the shift amount of - the following SHAD/SHLD, the latency of the load is increased - by 1 cycle. */ - else if (TARGET_SH4 - && get_attr_type (insn) == TYPE_DYN_SHIFT - && get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES - && reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)), - XEXP (SET_SRC (single_set (insn)), - 1))) - cost++; - /* When an LS group instruction with a latency of less than - 3 cycles is followed by a double-precision floating-point - instruction, FIPR, or FTRV, the latency of the first - instruction is increased to 3 cycles. */ - else if (cost < 3 - && get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP - && get_attr_dfp_comp (insn) == DFP_COMP_YES) - cost = 3; - /* The lsw register of a double-precision computation is ready one - cycle earlier. */ - else if (reload_completed - && get_attr_dfp_comp (dep_insn) == DFP_COMP_YES - && (use_pat = single_set (insn)) - && ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))), - SET_SRC (use_pat))) - cost -= 1; - - if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES - && get_attr_late_fp_use (insn) == LATE_FP_USE_YES) - cost -= 1; + cost -= TARGET_SH4_300 ? 1 : 4; + } + if (TARGET_HARD_SH4 && !TARGET_SH4_300) + { + enum attr_type dep_type = get_attr_type (dep_insn); + + if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD) + cost--; + else if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI) + && (type = get_attr_type (insn)) != TYPE_CALL + && type != TYPE_SFUNC) + cost--; + /* When the preceding instruction loads the shift amount of + the following SHAD/SHLD, the latency of the load is increased + by 1 cycle. */ + if (get_attr_type (insn) == TYPE_DYN_SHIFT + && get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES + && reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)), + XEXP (SET_SRC (single_set (insn)), + 1))) + cost++; + /* When an LS group instruction with a latency of less than + 3 cycles is followed by a double-precision floating-point + instruction, FIPR, or FTRV, the latency of the first + instruction is increased to 3 cycles. 
*/ + else if (cost < 3 + && get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP + && get_attr_dfp_comp (insn) == DFP_COMP_YES) + cost = 3; + /* The lsw register of a double-precision computation is ready one + cycle earlier. */ + else if (reload_completed + && get_attr_dfp_comp (dep_insn) == DFP_COMP_YES + && (use_pat = single_set (insn)) + && ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))), + SET_SRC (use_pat))) + cost -= 1; + + if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES + && get_attr_late_fp_use (insn) == LATE_FP_USE_YES) + cost -= 1; + } + else if (TARGET_SH4_300) + { + /* Stores need their input register two cycles later. */ + if (dep_set && cost >= 1 + && ((type = get_attr_type (insn)) == TYPE_STORE + || type == TYPE_PSTORE + || type == TYPE_FSTORE || type == TYPE_MAC_MEM)) + { + rtx set = single_set (insn); + + if (!reg_mentioned_p (SET_SRC (set), XEXP (SET_DEST (set), 0)) + && rtx_equal_p (SET_SRC (set), SET_DEST (dep_set))) + { + cost -= 2; + /* But don't reduce the cost below 1 if the address depends + on a side effect of dep_insn. */ + if (cost < 1 + && modified_in_p (XEXP (SET_DEST (set), 0), dep_insn)) + cost = 1; + } + } + } } /* An anti-dependence penalty of two applies if the first insn is a double precision fadd / fsub / fmul. */ - else if (REG_NOTE_KIND (link) == REG_DEP_ANTI + else if (!TARGET_SH4_300 + && REG_NOTE_KIND (link) == REG_DEP_ANTI && recog_memoized (dep_insn) >= 0 - && get_attr_type (dep_insn) == TYPE_DFP_ARITH + && (get_attr_type (dep_insn) == TYPE_DFP_ARITH + || get_attr_type (dep_insn) == TYPE_DFP_MUL) /* A lot of alleged anti-flow dependences are fake, so check this one is real. */ && flow_dependent_p (dep_insn, insn)) cost = 2; - return cost; } diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index fc4e1f282a4..1b659c75135 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -274,6 +274,7 @@ do { \ #endif #if SUPPORT_SH2 #define SUPPORT_SH3 1 +#define SUPPORT_SH2A_NOFPU 1 #endif #if SUPPORT_SH3 #define SUPPORT_SH4_NOFPU 1 @@ -281,16 +282,17 @@ do { \ #if SUPPORT_SH4_NOFPU #define SUPPORT_SH4A_NOFPU 1 #define SUPPORT_SH4AL 1 -#define SUPPORT_SH2A_NOFPU 1 #endif #if SUPPORT_SH2E #define SUPPORT_SH3E 1 +#define SUPPORT_SH2A_SINGLE_ONLY 1 #endif #if SUPPORT_SH3E #define SUPPORT_SH4_SINGLE_ONLY 1 +#endif +#if SUPPORT_SH4_SINGLE_ONLY #define SUPPORT_SH4A_SINGLE_ONLY 1 -#define SUPPORT_SH2A_SINGLE_ONLY 1 #endif #if SUPPORT_SH4 @@ -469,6 +471,11 @@ do { \ target_flags |= MASK_SMALLCODE; \ sh_div_str = SH_DIV_STR_FOR_SIZE ; \ } \ + else \ + { \ + TARGET_CBRANCHDI4 = 1; \ + TARGET_EXPAND_CBRANCHDI4 = 1; \ + } \ /* We can't meaningfully test TARGET_SHMEDIA here, because -m options \ haven't been parsed yet, hence we'd read only the default. \ sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so \ @@ -608,6 +615,7 @@ do { \ else \ sh_div_strategy = SH_DIV_INV; \ } \ + TARGET_CBRANCHDI4 = 0; \ } \ /* -fprofile-arcs needs a working libgcov . In unified tree \ configurations with newlib, this requires to configure with \ @@ -668,6 +676,9 @@ do { \ sh_divsi3_libfunc = "__sdivsi3_1"; \ else \ sh_divsi3_libfunc = "__sdivsi3"; \ + if (sh_branch_cost == -1) \ + sh_branch_cost \ + = TARGET_SH5 ? 1 : ! TARGET_SH2 || TARGET_HARD_SH4 ? 2 : 1; \ if (TARGET_FMOVD) \ reg_class_from_letter['e' - 'a'] = NO_REGS; \ \ @@ -844,7 +855,7 @@ do { \ ((GET_MODE_CLASS (TYPE_MODE (TYPE)) == MODE_COMPLEX_INT \ || GET_MODE_CLASS (TYPE_MODE (TYPE)) == MODE_COMPLEX_FLOAT) \ ? 
(unsigned) MIN (BIGGEST_ALIGNMENT, GET_MODE_BITSIZE (TYPE_MODE (TYPE))) \ - : (unsigned) ALIGN) + : (unsigned) DATA_ALIGNMENT(TYPE, ALIGN)) /* Make arrays of chars word-aligned for the same reasons. */ #define DATA_ALIGNMENT(TYPE, ALIGN) \ @@ -2288,6 +2299,7 @@ struct sh_args { #define CONSTANT_ADDRESS_P(X) (GET_CODE (X) == LABEL_REF) /* Nonzero if the constant value X is a legitimate general operand. */ +/* can_store_by_pieces constructs VOIDmode CONST_DOUBLEs. */ #define LEGITIMATE_CONSTANT_P(X) \ (TARGET_SHMEDIA \ @@ -2298,7 +2310,7 @@ struct sh_args { || TARGET_SHMEDIA64) \ : (GET_CODE (X) != CONST_DOUBLE \ || GET_MODE (X) == DFmode || GET_MODE (X) == SFmode \ - || (TARGET_SH2E && (fp_zero_operand (X) || fp_one_operand (X))))) + || GET_MODE (X) == DImode || GET_MODE (X) == VOIDmode)) /* The macros REG_OK_FOR..._P assume that the arg is a REG rtx and check its validity for a certain class. diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index d091dfe0eff..a37c58308e3 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -204,7 +204,9 @@ ;; load_si Likewise, SImode variant for general register. ;; fload Likewise, but load to fp register. ;; store to memory +;; fstore floating point register to memory ;; move general purpose register to register +;; movi8 8 bit immediate to general purpose register ;; mt_group other sh4 mt instructions ;; fmove register to register, floating point ;; smpy word precision integer multiply @@ -221,11 +223,15 @@ ;; sfunc special function call with known used registers ;; call function call ;; fp floating point +;; fpscr_toggle toggle a bit in the fpscr ;; fdiv floating point divide (or square root) ;; gp_fpul move from general purpose register to fpul ;; fpul_gp move from fpul to general purpose register ;; mac_gp move from mac[lh] to general purpose register -;; dfp_arith, dfp_cmp,dfp_conv +;; gp_mac move from general purpose register to mac[lh] +;; mac_mem move from mac[lh] to memory +;; mem_mac move from memory to mac[lh] +;; dfp_arith,dfp_mul, fp_cmp,dfp_cmp,dfp_conv ;; ftrc_s fix_truncsfsi2_i4 ;; dfdiv double precision floating point divide (or square root) ;; cwb ic_invalidate_line_i @@ -263,7 +269,7 @@ ;; nil no-op move, will be deleted. 
(define_attr "type" - "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other" + "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,fstore,move,movi8,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fpscr_toggle,fdiv,ftrc_s,dfp_arith,dfp_mul,fp_cmp,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,gp_mac,mac_mem,mem_mac,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other" (const_string "other")) ;; We define a new attribute namely "insn_class".We use @@ -279,12 +285,12 @@ (define_attr "insn_class" "mt_group,ex_group,ls_group,br_group,fe_group,co_group,none" (cond [(eq_attr "type" "move,mt_group") (const_string "mt_group") - (eq_attr "type" "arith,dyn_shift") (const_string "ex_group") - (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,gp_fpul,fpul_gp") (const_string "ls_group") + (eq_attr "type" "movi8,arith,dyn_shift") (const_string "ex_group") + (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,fstore,gp_fpul,fpul_gp") (const_string "ls_group") (eq_attr "type" "cbranch,jump") (const_string "br_group") - (eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv") + (eq_attr "type" "fp,fp_cmp,fdiv,ftrc_s,dfp_arith,dfp_mul,dfp_conv,dfdiv") (const_string "fe_group") - (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb") (const_string "co_group")] + (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb,gp_mac,mac_mem,mem_mac") (const_string "co_group")] (const_string "none"))) ;; nil are zero instructions, and arith3 / arith3b are multiple instructions, ;; so these do not belong in an insn group, although they are modeled @@ -494,14 +500,14 @@ ;; SH4 Double-precision computation with double-precision result - ;; the two halves are ready at different times. (define_attr "dfp_comp" "yes,no" - (cond [(eq_attr "type" "dfp_arith,dfp_conv,dfdiv") (const_string "yes")] + (cond [(eq_attr "type" "dfp_arith,dfp_mul,dfp_conv,dfdiv") (const_string "yes")] (const_string "no"))) ;; Insns for which the latency of a preceding fp insn is decreased by one. (define_attr "late_fp_use" "yes,no" (const_string "no")) ;; And feeding insns for which this relevant. 
(define_attr "any_fp_comp" "yes,no" - (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv") + (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_mul,dfp_conv,dfdiv") (const_string "yes")] (const_string "no"))) @@ -609,15 +615,37 @@ [(set_attr "type" "mt_group")]) ;; ------------------------------------------------------------------------- +;; SImode compare and branch +;; ------------------------------------------------------------------------- + +(define_expand "cbranchsi4" + [(set (pc) + (if_then_else (match_operator 0 "comparison_operator" + [(match_operand:SI 1 "arith_operand" "") + (match_operand:SI 2 "arith_operand" "")]) + (label_ref (match_operand 3 "" "")) + (pc))) + (clobber (reg:SI T_REG))] + "TARGET_CBRANCHDI4" + "expand_cbranchsi4 (operands, CODE_FOR_nothing, -1); DONE;") + +;; ------------------------------------------------------------------------- ;; SImode unsigned integer comparisons ;; ------------------------------------------------------------------------- -(define_insn "cmpgeusi_t" +(define_insn_and_split "cmpgeusi_t" [(set (reg:SI T_REG) (geu:SI (match_operand:SI 0 "arith_reg_operand" "r") - (match_operand:SI 1 "arith_reg_operand" "r")))] + (match_operand:SI 1 "arith_reg_or_0_operand" "rN")))] "TARGET_SH1" "cmp/hs %1,%0" + "&& operands[0] == CONST0_RTX (SImode)" + [(pc)] + " +{ + emit_insn (gen_sett ()); + DONE; +}" [(set_attr "type" "mt_group")]) (define_insn "cmpgtusi_t" @@ -647,12 +675,64 @@ }") ;; ------------------------------------------------------------------------- -;; DImode signed integer comparisons +;; DImode compare and branch ;; ------------------------------------------------------------------------- -;; ??? Could get better scheduling by splitting the initial test from the -;; rest of the insn after reload. However, the gain would hardly justify -;; the sh.md size increase necessary to do that. + +;; arith3 patterns don't work well with the sh4-300 branch prediction mechanism. +;; Therefore, we aim to have a set of three branches that go straight to the +;; destination, i.e. only one of them is taken at any one time. +;; This mechanism should also be slightly better for the sh4-200. 
+
+(define_expand "cbranchdi4"
+  [(set (pc)
+	(if_then_else (match_operator 0 "comparison_operator"
+			[(match_operand:DI 1 "arith_operand" "")
+			 (match_operand:DI 2 "arith_operand" "")])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))
+   (clobber (match_dup 4))
+   (clobber (reg:SI T_REG))]
+  "TARGET_CBRANCHDI4"
+  "
+{
+  enum rtx_code comparison;
+
+  if (TARGET_EXPAND_CBRANCHDI4)
+    {
+      if (expand_cbranchdi4 (operands, CODE_FOR_nothing))
+	DONE;
+    }
+  comparison = prepare_cbranch_operands (operands, DImode, CODE_FOR_nothing);
+  if (comparison != GET_CODE (operands[0]))
+    operands[0]
+      = gen_rtx_fmt_ee (comparison, VOIDmode, operands[1], operands[2]);
+  operands[4] = gen_rtx_SCRATCH (SImode);
+}")
+
+(define_insn_and_split "cbranchdi4_i"
+  [(set (pc)
+	(if_then_else (match_operator 0 "comparison_operator"
+			[(match_operand:DI 1 "arith_operand" "r,r")
+			 (match_operand:DI 2 "arith_operand" "rN,i")])
+		      (label_ref (match_operand 3 "" ""))
+		      (pc)))
+   (clobber (match_scratch:SI 4 "=X,&r"))
+   (clobber (reg:SI T_REG))]
+  "TARGET_CBRANCHDI4"
+  "#"
+  "&& reload_completed"
+  [(pc)]
+  "
+{
+  if (!expand_cbranchdi4 (operands, GET_CODE (operands[0])))
+    FAIL;
+  DONE;
+}")
+
+;; -------------------------------------------------------------------------
+;; DImode signed integer comparisons
+;; -------------------------------------------------------------------------
 
 (define_insn ""
   [(set (reg:SI T_REG)
@@ -4736,7 +4816,7 @@ label:
   [(set (mem:SF (pre_dec:SI (reg:SI SP_REG))) (reg:SF FPUL_REG))]
   "TARGET_SH2E && ! TARGET_SH5"
   "sts.l fpul,@-r15"
-  [(set_attr "type" "store")
+  [(set_attr "type" "fstore")
   (set_attr "late_fp_use" "yes")
   (set_attr "hit_stack" "yes")])
 
@@ -4818,9 +4898,9 @@ label:
 ;; (made from (set (subreg:SI (reg:QI ###) 0) ) into T.
 (define_insn "movsi_i"
   [(set (match_operand:SI 0 "general_movdst_operand"
-	    "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,r")
+	    "=r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,r")
 	(match_operand:SI 1 "general_movsrc_operand"
-	 "Q,rI08,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
+	 "Q,r,I08,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
   "TARGET_SH1
    && ! TARGET_SH2E
    && ! TARGET_SH2A
@@ -4829,6 +4909,7 @@ label:
   "@
 	mov.l %1,%0
 	mov %1,%0
+	mov %1,%0
 	cmp/pl %1
 	mov.l %1,%0
 	sts %1,%0
@@ -4842,8 +4923,8 @@ label:
 	lds.l %1,%0
 	lds.l %1,%0
 	fake %1,%0"
-  [(set_attr "type" "pcload_si,move,mt_group,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
-   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
+  [(set_attr "type" "pcload_si,move,movi8,mt_group,load_si,mac_gp,prget,arith,store,mac_mem,pstore,gp_mac,prset,mem_mac,pload,pcload_si")
+   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
 
 ;; t/r must come after r/r, lest reload will try to reload stuff like
 ;; (subreg:SI (reg:SF FR14_REG) 0) into T (compiling stdlib/strtod.c -m3e -O2)
@@ -4853,15 +4934,16 @@ label:
 ;; TARGET_FMOVD is in effect, and mode switching is done before reload.
 (define_insn "movsi_ie"
   [(set (match_operand:SI 0 "general_movdst_operand"
-	    "=r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
+	    "=r,r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
 	(match_operand:SI 1 "general_movsrc_operand"
-	 "Q,rI08,I20,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
+	 "Q,r,I08,I20,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
   "(TARGET_SH2E || TARGET_SH2A)
    && (register_operand (operands[0], SImode)
       || register_operand (operands[1], SImode))"
   "@
 	mov.l %1,%0
 	mov %1,%0
+	mov %1,%0
 	movi20 %1,%0
 	cmp/pl %1
 	mov.l %1,%0
@@ -4884,26 +4966,27 @@ label:
 	flds %1,fpul
 	fmov %1,%0
 	! move optimized away"
-  [(set_attr "type" "pcload_si,move,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
-   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
-   (set_attr "length" "*,*,4,*,4,*,*,*,4,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
+  [(set_attr "type" "pcload_si,move,movi8,move,*,load_si,mac_gp,prget,arith,store,mac_mem,pstore,gp_mac,prset,mem_mac,pload,load,fstore,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
+   (set_attr "length" "*,*,*,4,*,4,*,*,*,4,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
 
 (define_insn "movsi_i_lowpart"
-  [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,m,r"))
-	(match_operand:SI 1 "general_movsrc_operand" "Q,rI08,mr,x,l,t,r,i"))]
+  [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,r,m,r"))
+	(match_operand:SI 1 "general_movsrc_operand" "Q,r,I08,mr,x,l,t,r,i"))]
   "TARGET_SH1
    && (register_operand (operands[0], SImode)
       || register_operand (operands[1], SImode))"
   "@
 	mov.l %1,%0
 	mov %1,%0
+	mov %1,%0
 	mov.l %1,%0
 	sts %1,%0
 	sts %1,%0
 	movt %0
 	mov.l %1,%0
 	fake %1,%0"
-  [(set_attr "type" "pcload,move,load,move,prget,move,store,pcload")])
+  [(set_attr "type" "pcload,move,movi8,load,mac_gp,prget,arith,store,pcload")])
 
 (define_insn_and_split "load_ra"
   [(set (match_operand:SI 0 "general_movdst_operand" "")
@@ -5155,19 +5238,20 @@ label:
    (set_attr "needs_delay_slot" "yes")])
 
 (define_insn "movqi_i"
-  [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,m,r,r,l")
-	(match_operand:QI 1 "general_movsrc_operand" "ri,m,r,t,l,r"))]
+  [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,r,m,r,r,l")
+	(match_operand:QI 1 "general_movsrc_operand" "r,i,m,r,t,l,r"))]
   "TARGET_SH1
    && (arith_reg_operand (operands[0], QImode)
       || arith_reg_operand (operands[1], QImode))"
   "@
 	mov %1,%0
+	mov %1,%0
 	mov.b %1,%0
 	mov.b %1,%0
 	movt %0
 	sts %1,%0
 	lds %1,%0"
-  [(set_attr "type" "move,load,store,move,move,move")])
+  [(set_attr "type" "move,movi8,load,store,arith,prget,prset")])
 
 (define_insn "*movqi_media"
   [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,r,m")
@@ -5769,7 +5853,7 @@ label:
 		      (if_then_else
 			(ne (symbol_ref "TARGET_SHCOMPACT") (const_int 0))
 			(const_int 10) (const_int 8))])
-   (set_attr "type" "fmove,move,pcfload,fload,store,pcload,load,store,load,fload")
+   (set_attr "type" "fmove,move,pcfload,fload,fstore,pcload,load,store,load,fload")
    (set_attr "late_fp_use" "*,*,*,*,yes,*,*,*,*,*")
    (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
 				      (const_string "double")
@@ -6486,7 +6570,7 @@ label:
 	sts.l %1,%0
 	lds.l %1,%0
 	! move optimized away"
-  [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,store,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,store,load,nil")
+  [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,fstore,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,fstore,load,nil")
   (set_attr "late_fp_use" "*,*,*,*,*,*,yes,*,*,*,*,*,*,*,yes,*,yes,*,*")
  (set_attr "length" "*,*,*,*,4,4,4,*,*,*,2,2,2,4,2,2,2,2,0")
  (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
@@ -9929,7 +10013,7 @@ mov.l\\t1f,r0\\n\\
 	sts fpscr,%0
 	sts.l fpscr,%0"
   [(set_attr "length" "0,2,2,4,2,2,2,2,2")
-   (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp,store")])
+   (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp,fstore")])
 
 (define_peephole2
   [(set (reg:PSI FPSCR_REG)
@@ -9980,7 +10064,7 @@ mov.l\\t1f,r0\\n\\
 	(xor:PSI (reg:PSI FPSCR_REG) (const_int 1048576)))]
   "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
   "fschg"
-  [(set_attr "type" "fp") (set_attr "fp_set" "unknown")])
+  [(set_attr "type" "fpscr_toggle") (set_attr "fp_set" "unknown")])
 
 ;; There's no way we can use it today, since optimize mode switching
 ;; doesn't enable us to know from which mode we're switching to the
@@ -9992,7 +10076,7 @@ mov.l\\t1f,r0\\n\\
 	(xor:PSI (reg:PSI FPSCR_REG) (const_int 524288)))]
   "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE"
   "fpchg"
-  [(set_attr "type" "fp")])
+  [(set_attr "type" "fpscr_toggle")])
 
 (define_expand "addsf3"
   [(set (match_operand:SF 0 "arith_reg_operand" "")
@@ -10124,25 +10208,12 @@ mov.l\\t1f,r0\\n\\
   [(set_attr "type" "fp")
    (set_attr "fp_mode" "single")])
 
-;; Unfortunately, the combiner is unable to cope with the USE of the FPSCR
-;; register in feeding fp instructions.  Thus, we cannot generate fmac for
-;; mixed-precision SH4 targets.  To allow it to be still generated for the
-;; SH3E, we use a separate insn for SH3E mulsf3.
-
 (define_expand "mulsf3"
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
 	(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
 		 (match_operand:SF 2 "fp_arith_reg_operand" "")))]
   "TARGET_SH2E || TARGET_SHMEDIA_FPU"
-  "
-{
-  if (TARGET_SH4 || TARGET_SH2A_SINGLE)
-    expand_sf_binop (&gen_mulsf3_i4, operands);
-  else if (TARGET_SH2E)
-    emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
-  if (! TARGET_SHMEDIA)
-    DONE;
-}")
+  "")
 
 (define_insn "*mulsf3_media"
   [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
@@ -10152,6 +10223,27 @@ mov.l\\t1f,r0\\n\\
   "fmul.s %1, %2, %0"
   [(set_attr "type" "fparith_media")])
 
+;; Unfortunately, the combiner is unable to cope with the USE of the FPSCR
+;; register in feeding fp instructions.  Thus, in order to generate fmac,
+;; we start out with a mulsf pattern that does not depend on fpscr.
+;; This is split after combine to introduce the dependency, in order to
+;; get mode switching and scheduling right.
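For a concrete view of what this enables (an illustrative example of mine, not part of the patch): with -mfused-madd, source like the function below is the kind of code that combine can now fuse into a single fmac on mixed-precision SH4 targets, because the mul pattern carries no FPSCR use when combine runs; the split in mulsf3_ie below then reintroduces the FPSCR dependency.

/* Illustration only: a candidate for fmac generation on SH2E/SH4
   with -mfused-madd.  The multiply and add can be combined because
   mulsf3_ie does not mention FPSCR at combine time.  */
float
fused_madd (float a, float b, float c)
{
  return a * b + c;	/* expected to map onto a single fmac insn */
}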
+(define_insn_and_split "mulsf3_ie" + [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f") + (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0") + (match_operand:SF 2 "fp_arith_reg_operand" "f")))] + "TARGET_SH2E" + "fmul %2,%0" + "TARGET_SH4 || TARGET_SH2A_SINGLE" + [(const_int 0)] + " +{ + emit_insn (gen_mulsf3_i4 (operands[0], operands[1], operands[2], + get_fpscr_rtx ())); + DONE; +}" + [(set_attr "type" "fp")]) + (define_insn "mulsf3_i4" [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f") (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0") @@ -10162,20 +10254,12 @@ mov.l\\t1f,r0\\n\\ [(set_attr "type" "fp") (set_attr "fp_mode" "single")]) -(define_insn "mulsf3_ie" - [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f") - (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0") - (match_operand:SF 2 "fp_arith_reg_operand" "f")))] - "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)" - "fmul %2,%0" - [(set_attr "type" "fp")]) - (define_insn "mac_media" [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f") (plus:SF (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%f") (match_operand:SF 2 "fp_arith_reg_operand" "f")) (match_operand:SF 3 "fp_arith_reg_operand" "0")))] - "TARGET_SHMEDIA_FPU" + "TARGET_SHMEDIA_FPU && TARGET_FMAC" "fmac.s %1, %2, %0" [(set_attr "type" "fparith_media")]) @@ -10185,7 +10269,7 @@ mov.l\\t1f,r0\\n\\ (match_operand:SF 2 "fp_arith_reg_operand" "f")) (match_operand:SF 3 "arith_reg_operand" "0"))) (use (match_operand:PSI 4 "fpscr_operand" "c"))] - "TARGET_SH2E && ! TARGET_SH4" + "TARGET_SH2E && TARGET_FMAC" "fmac fr0,%2,%0" [(set_attr "type" "fp") (set_attr "fp_mode" "single")]) @@ -10336,7 +10420,7 @@ mov.l\\t1f,r0\\n\\ (match_operand:SF 1 "fp_arith_reg_operand" "f")))] "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)" "fcmp/gt %1,%0" - [(set_attr "type" "fp") + [(set_attr "type" "fp_cmp") (set_attr "fp_mode" "single")]) (define_insn "cmpeqsf_t" @@ -10345,7 +10429,7 @@ mov.l\\t1f,r0\\n\\ (match_operand:SF 1 "fp_arith_reg_operand" "f")))] "TARGET_SH2E && ! 
(TARGET_SH4 || TARGET_SH2A_SINGLE)" "fcmp/eq %1,%0" - [(set_attr "type" "fp") + [(set_attr "type" "fp_cmp") (set_attr "fp_mode" "single")]) (define_insn "ieee_ccmpeqsf_t" @@ -10365,7 +10449,7 @@ mov.l\\t1f,r0\\n\\ (use (match_operand:PSI 2 "fpscr_operand" "c"))] "(TARGET_SH4 || TARGET_SH2A_SINGLE)" "fcmp/gt %1,%0" - [(set_attr "type" "fp") + [(set_attr "type" "fp_cmp") (set_attr "fp_mode" "single")]) (define_insn "cmpeqsf_t_i4" @@ -10375,7 +10459,7 @@ mov.l\\t1f,r0\\n\\ (use (match_operand:PSI 2 "fpscr_operand" "c"))] "(TARGET_SH4 || TARGET_SH2A_SINGLE)" "fcmp/eq %1,%0" - [(set_attr "type" "fp") + [(set_attr "type" "fp_cmp") (set_attr "fp_mode" "single")]) (define_insn "*ieee_ccmpeqsf_t_4" @@ -10724,7 +10808,7 @@ mov.l\\t1f,r0\\n\\ (use (match_operand:PSI 3 "fpscr_operand" "c"))] "(TARGET_SH4 || TARGET_SH2A_DOUBLE)" "fmul %2,%0" - [(set_attr "type" "dfp_arith") + [(set_attr "type" "dfp_mul") (set_attr "fp_mode" "double")]) (define_expand "divdf3" diff --git a/gcc/config/sh/sh.opt b/gcc/config/sh/sh.opt index 7f9a87e95d9..161fdd8dcaf 100644 --- a/gcc/config/sh/sh.opt +++ b/gcc/config/sh/sh.opt @@ -57,11 +57,11 @@ Target RejectNegative Condition(SUPPORT_SH2A_NOFPU) Generate SH2a FPU-less code m2a-single -Target RejectNegative Condition (SUPPORT_SH2A_SINGLE) +Target RejectNegative Condition(SUPPORT_SH2A_SINGLE) Generate default single-precision SH2a code m2a-single-only -Target RejectNegative Condition (SUPPORT_SH2A_SINGLE_ONLY) +Target RejectNegative Condition(SUPPORT_SH2A_SINGLE_ONLY) Generate only single-precision SH2a code m2e @@ -88,10 +88,33 @@ m4-200 Target RejectNegative Condition(SUPPORT_SH4) Generate SH4-200 code +;; TARGET_SH4_300 indicates if we have the ST40-300 instruction set and +;; pipeline - irrespective of ABI. +m4-300 +Target RejectNegative Condition(SUPPORT_SH4) Var(TARGET_SH4_300) +Generate SH4-300 code + m4-nofpu Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Generate SH4 FPU-less code +m4-100-nofpu +Target RejectNegative Condition(SUPPORT_SH4_NOFPU) +Generate SH4-100 FPU-less code + +m4-200-nofpu +Target RejectNegative Condition(SUPPORT_SH4_NOFPU) +Generate SH4-200 FPU-less code + +m4-300-nofpu +Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Var(TARGET_SH4_300) VarExists +Generate SH4-300 FPU-less code + +m4-340 +Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Var(TARGET_SH4_300) VarExists +Generate code for SH4 340 series (MMU/FPU-less) +;; passes -isa=sh4-nommu-nofpu to the assembler. 
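A note on the Var(TARGET_SH4_300)/VarExists annotations in the entries above: only the first entry declares the variable, and the later entries reuse it, so the whole -m4-300*/-m4-340 family drives a single backend flag. Roughly, and only as an editorial sketch of what the option machinery generates (this is not the generated file itself):

/* Rough sketch of the effect of Var(TARGET_SH4_300): the option
   machinery emits one shared int flag, declared once.  VarExists on
   the other entries means "reuse this flag instead of declaring it".  */
int TARGET_SH4_300;	/* set to 1 by -m4-300 and, via VarExists, by
			   -m4-300-nofpu, -m4-340 and the
			   -m4-300-single* options below */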
+ m4-400 Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Generate code for SH4 400 series (MMU/FPU-less) @@ -114,6 +137,10 @@ m4-200-single Target RejectNegative Condition(SUPPORT_SH4_SINGLE) Generate default single-precision SH4-200 code +m4-300-single +Target RejectNegative Condition(SUPPORT_SH4_SINGLE) Var(TARGET_SH4_300) VarExists +Generate default single-precision SH4-300 code + m4-single-only Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY) Generate only single-precision SH4 code @@ -126,6 +153,10 @@ m4-200-single-only Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY) Generate only single-precision SH4-200 code +m4-300-single-only +Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY) Var(TARGET_SH4_300) VarExists +Generate only single-precision SH4-300 code + m4a Target RejectNegative Mask(SH4A) Condition(SUPPORT_SH4A) Generate SH4a code @@ -182,6 +213,22 @@ mbigtable Target Report RejectNegative Mask(BIGTABLE) Generate 32-bit offsets in switch tables +mbranch-cost= +Target RejectNegative Joined UInteger Var(sh_branch_cost) Init(-1) +Cost to assume for a branch insn + +mcbranchdi +Target Var(TARGET_CBRANCHDI4) +Enable cbranchdi4 pattern + +mexpand-cbranchdi +Target Var(TARGET_EXPAND_CBRANCHDI4) +Expand cbranchdi4 pattern early into separate comparisons and branches. + +mcmpeqdi +Target Var(TARGET_CMPEQDI_T) +Emit cmpeqdi_t pattern even when -mcbranchdi and -mexpand-cbranchdi are in effect. + mcut2-workaround Target RejectNegative Var(TARGET_SH5_CUT2_WORKAROUND) Enable SH5 cut2 workaround @@ -192,7 +239,7 @@ Align doubles at 64-bit boundaries mdiv= Target RejectNegative Joined Var(sh_div_str) Init("") -Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp call-div1 call-fp call-table +Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table mdivsi3_libfunc= Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("") @@ -201,6 +248,10 @@ Specify name for 32 bit signed division function mfmovd Target RejectNegative Mask(FMOVD) Undocumented +mfused-madd +Target Var(TARGET_FMAC) +Enable the use of the fused floating point multiply-accumulate operation + mgettrcost= Target RejectNegative Joined UInteger Var(sh_gettrcost) Init(-1) Cost to assume for gettr insn diff --git a/gcc/config/sh/sh1.md b/gcc/config/sh/sh1.md index 9dfdd86508f..1198fe737b9 100644 --- a/gcc/config/sh/sh1.md +++ b/gcc/config/sh/sh1.md @@ -1,5 +1,5 @@ ;; DFA scheduling description for Renesas / SuperH SH. -;; Copyright (C) 2004 Free Software Foundation, Inc. +;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. ;; This file is part of GCC. @@ -45,7 +45,7 @@ (define_insn_reservation "sh1_load_store" 2 (and (eq_attr "pipe_model" "sh1") - (eq_attr "type" "load,pcload,pload,store,pstore")) + (eq_attr "type" "load,pcload,pload,mem_mac,store,fstore,pstore,mac_mem")) "sh1memory*2") (define_insn_reservation "sh1_arith3" 3 @@ -76,7 +76,7 @@ (define_insn_reservation "sh1_fp" 2 (and (eq_attr "pipe_model" "sh1") - (eq_attr "type" "fp,fmove")) + (eq_attr "type" "fp,fpscr_toggle,fp_cmp,fmove")) "sh1fp") (define_insn_reservation "sh1_fdiv" 13 diff --git a/gcc/config/sh/sh4-300.md b/gcc/config/sh/sh4-300.md new file mode 100644 index 00000000000..228782a67fc --- /dev/null +++ b/gcc/config/sh/sh4-300.md @@ -0,0 +1,288 @@ +;; DFA scheduling description for ST40-300. +;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. + +;; This file is part of GCC. 
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING.  If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+;; Load and store instructions save a cycle if they are aligned on a
+;; four byte boundary.  Using a function unit for stores encourages
+;; gcc to separate load and store instructions by one instruction,
+;; which makes it more likely that the linker will be able to word
+;; align them when relaxing.
+
+;; The following description models the ST40-300 pipeline using the DFA based
+;; scheduler.
+
+;; Two automata are defined to reduce the number of states that a single
+;; large automaton would have (factoring).
+
+(define_automaton "sh4_300_inst_pipeline,sh4_300_fpu_pipe")
+
+;; This unit is basically the decode unit of the processor.
+;; Since the ST40-300 is a dual-issue machine, it is as if there are two
+;; such units, so that any insn can be processed by either one of the
+;; decoding units.
+
+(define_cpu_unit "sh4_300_pipe_01,sh4_300_pipe_02" "sh4_300_inst_pipeline")
+
+;; The floating point units.
+
+(define_cpu_unit "sh4_300_fpt,sh4_300_fpu,sh4_300_fds" "sh4_300_fpu_pipe")
+
+;; Integer multiplier unit
+
+(define_cpu_unit "sh4_300_mul" "sh4_300_inst_pipeline")
+
+;; LS unit
+
+(define_cpu_unit "sh4_300_ls" "sh4_300_inst_pipeline")
+
+;; The address calculator used for branch instructions.
+;; This will be reserved after "issue" of branch instructions
+;; and this is to make sure that no two branch instructions
+;; can be issued in parallel.
+
+(define_cpu_unit "sh4_300_br" "sh4_300_inst_pipeline")
+
+;; ----------------------------------------------------
+;; This reservation is to simplify the dual issue description.
+
+(define_reservation "sh4_300_issue" "sh4_300_pipe_01|sh4_300_pipe_02")
+
+(define_reservation "all" "sh4_300_pipe_01+sh4_300_pipe_02")
+
+;;(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")
+
+;; MOV RM,RN / MOV #imm8,RN / STS PR,RN
+(define_insn_reservation "sh4_300_mov" 0
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "move,movi8,prget"))
+  "sh4_300_issue")
+
+;; Fixed STS from MACL / MACH
+(define_insn_reservation "sh4_300_mac_gp" 0
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "mac_gp"))
+  "sh4_300_issue+sh4_300_mul")
+
+;; Fixed LDS to MACL / MACH
+(define_insn_reservation "sh4_300_gp_mac" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "gp_mac"))
+  "sh4_300_issue+sh4_300_mul")
+
+;; Instructions without specific resource requirements with latency 1.
+
+(define_insn_reservation "sh4_300_simple_arith" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "mt_group,arith,dyn_shift,prset"))
+  "sh4_300_issue")
+
+;; Load and store instructions have no alignment peculiarities for the
+;; ST40-300, but they use the load-store unit, which they share with the
+;; fmove type insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg).
+;; Loads have a latency of three.
+
+;; Load Store instructions.
+(define_insn_reservation "sh4_300_load" 3
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "load,pcload,load_si,pcload_si,pload"))
+  "sh4_300_issue+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_mac_load" 3
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "mem_mac"))
+  "sh4_300_issue+sh4_300_ls+sh4_300_mul")
+
+(define_insn_reservation "sh4_300_fload" 4
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fload,pcfload"))
+  "sh4_300_issue+sh4_300_ls+sh4_300_fpt")
+
+;; sh_adjust_cost describes the reduced latency of the feeding insns of a store.
+;; The latency of an auto-increment register is 1; the latency of the memory
+;; output is not actually considered here anyway.
+(define_insn_reservation "sh4_300_store" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "store,pstore"))
+  "sh4_300_issue+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_fstore" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fstore"))
+  "sh4_300_issue+sh4_300_ls+sh4_300_fpt")
+
+;; Fixed STS.L from MACL / MACH
+(define_insn_reservation "sh4_300_mac_store" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "mac_mem"))
+  "sh4_300_issue+sh4_300_mul+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_gp_fpul" 2
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "gp_fpul"))
+  "sh4_300_issue+sh4_300_fpt")
+
+(define_insn_reservation "sh4_300_fpul_gp" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fpul_gp"))
+  "sh4_300_issue+sh4_300_fpt")
+
+;; Branch (BF,BF/S,BT,BT/S,BRA)
+;; Branch Far (JMP,RTS,BRAF)
+;; Group: BR
+;; When displacement is 0 for BF / BT, we have effectively conditional
+;; execution of one instruction, without pipeline disruption.
+;; Otherwise, the latency depends on prediction success.
+;; We can't really do much with the latency, even if we could express it,
+;; but the pairing restrictions are useful to take into account.
+;; ??? If the branch is likely, and not paired with a preceding insn,
+;; or likely and likely not predicted, we might want to fill the delay slot.
+;; However, there appears to be no machinery to make the compiler
+;; recognize these scenarios.
+
+(define_insn_reservation "sh4_300_branch" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "cbranch,jump,return,jump_ind"))
+  "sh4_300_issue+sh4_300_br")
+
+;; RTE
+(define_insn_reservation "sh4_300_return_from_exp" 9
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "rte"))
+  "sh4_300_pipe_01+sh4_300_pipe_02*9")
+
+;; OCBP, OCBWB
+;; Group: CO
+;; Latency: 1-5
+;; Issue Rate: 1
+
+;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
+;; This description is likely inexact, but this pattern should not actually
+;; appear when compiling for sh4-300; we should use icbi instead.
+;; If a -mtune option is added later, we should use the icache array
+;; dispatch method instead.
+(define_insn_reservation "sh4_300_ocbwb" 3
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "cwb"))
+  "all*3")
+
+;; JSR,BSR,BSRF
+;; Calls have a mandatory delay slot, which we'd like to fill with an insn
+;; that can be paired with the call itself.
+;; Scheduling runs before reorg, so we approximate this by saying that we
+;; want the call to be paired with a preceding insn.
+;; In most cases, the insn that loads the address of the call should have
+;; a non-zero latency (mov rn,rm doesn't make sense since we could use rn
+;; for the address then).  Thus, a preceding insn that can be paired with
+;; a call should be eligible for the delay slot.
+;;
+;; Calls introduce a longish delay that is likely to flush the pipelines
+;; of the caller's instructions.  Ordinary functions tend to end with a
+;; load to restore a register (in the delay slot of rts), while sfuncs
+;; tend to end with an EX or MT insn.  But that is not actually relevant,
+;; since there are no instructions that contend for memory access early.
+;; We could, of course, provide exact scheduling information for specific
+;; sfuncs, if that should prove useful.
+
+(define_insn_reservation "sh4_300_call" 16
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "call,sfunc"))
+  "sh4_300_issue+sh4_300_br,all*15")
+
+;; FMOV.S / FMOV.D
+(define_insn_reservation "sh4_300_fmov" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fmove"))
+  "sh4_300_issue+sh4_300_fpt")
+
+;; LDS to FPSCR
+(define_insn_reservation "sh4_300_fpscr_load" 8
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "gp_fpscr"))
+  "sh4_300_issue+sh4_300_fpu+sh4_300_fpt")
+
+;; LDS.L to FPSCR
+(define_insn_reservation "sh4_300_fpscr_load_mem" 8
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "mem_fpscr"))
+  "sh4_300_issue+sh4_300_fpu+sh4_300_fpt+sh4_300_ls")
+
+
+;; Fixed point multiplication (DMULS.L, DMULU.L, MUL.L, MULS.W, MULU.W)
+(define_insn_reservation "multi" 2
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "smpy,dmpy"))
+  "sh4_300_issue+sh4_300_mul")
+
+;; FPCHG, FRCHG, FSCHG
+(define_insn_reservation "fpscr_toggle" 1
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fpscr_toggle"))
+  "sh4_300_issue+sh4_300_fpu+sh4_300_fpt")
+
+;; FCMP/EQ, FCMP/GT
+(define_insn_reservation "fp_cmp" 3
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fp_cmp,dfp_cmp"))
+  "sh4_300_issue+sh4_300_fpu")
+
+;; Single precision floating point (FADD,FLOAT,FMAC,FMUL,FSUB,FTRC)
+;; Double-precision floating-point (FADD,FCNVDS,FCNVSD,FLOAT,FSUB,FTRC)
+(define_insn_reservation "fp_arith" 6
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fp,ftrc_s,dfp_arith,dfp_conv"))
+  "sh4_300_issue+sh4_300_fpu")
+
+;; Single-precision FDIV / FSQRT
+(define_insn_reservation "fp_div" 19
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "fdiv"))
+  "sh4_300_issue+sh4_300_fpu+sh4_300_fds,sh4_300_fds*15")
+
+;; Double-precision floating-point FMUL
+(define_insn_reservation "dfp_mul" 9
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "dfp_mul"))
+  "sh4_300_issue+sh4_300_fpu,sh4_300_fpu*3")
+
+;; Double-precision FDIV / FSQRT
+(define_insn_reservation "dp_div" 35
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "dfdiv"))
+  "sh4_300_issue+sh4_300_fpu+sh4_300_fds,sh4_300_fds*31")
+
+
+;; ??? We don't really want these for sh4-300.
+;; This pattern itself is likely to finish in 3 cycles, but it is also
+;; likely to disrupt branch prediction for the following condbranch when
+;; that branch is taken.
+(define_insn_reservation "sh4_300_arith3" 5
+  (and (eq_attr "pipe_model" "sh4_300")
+       (eq_attr "type" "arith3"))
+  "sh4_300_issue,all*4")
+
+;; arith3b insns without branch redirection make use of the 0-offset 0-latency
+;; branch feature, and thus schedule the same whether the branch is taken
+;; or not.  If the branch is redirected, the taken branch might take longer,
+;; but then, we don't have to take the next branch.
+;; ??? Should we suppress branch redirection for sh4-300 to improve branch
+;; target hit rates?
+(define_insn_reservation "arith3b" 2 + (and (eq_attr "pipe_model" "sh4") + (eq_attr "type" "arith3")) + "issue,all") diff --git a/gcc/config/sh/sh4.md b/gcc/config/sh/sh4.md index 0937db8e6a3..b390ab99d05 100644 --- a/gcc/config/sh/sh4.md +++ b/gcc/config/sh/sh4.md @@ -1,5 +1,5 @@ ;; DFA scheduling description for SH4. -;; Copyright (C) 2004 Free Software Foundation, Inc. +;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. ;; This file is part of GCC. @@ -209,9 +209,14 @@ (define_insn_reservation "sh4_store" 1 (and (eq_attr "pipe_model" "sh4") - (eq_attr "type" "store")) + (eq_attr "type" "store,fstore")) "issue+load_store,nothing,memory") +(define_insn_reservation "mac_mem" 1 + (and (eq_attr "pipe_model" "sh4") + (eq_attr "type" "mac_mem")) + "d_lock,nothing,memory") + ;; Load Store instructions. ;; Group: LS ;; Latency: 1 @@ -372,35 +377,42 @@ ;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W) ;; Group: CO ;; Latency: 4 / 4 -;; Issue Rate: 1 +;; Issue Rate: 2 (define_insn_reservation "multi" 4 (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2") -;; Fixed STS from MACL / MACH +;; Fixed STS from, and LDS to MACL / MACH ;; Group: CO ;; Latency: 3 ;; Issue Rate: 1 (define_insn_reservation "sh4_mac_gp" 3 (and (eq_attr "pipe_model" "sh4") - (eq_attr "type" "mac_gp")) + (eq_attr "type" "mac_gp,gp_mac,mem_mac")) "d_lock") ;; Single precision floating point computation FCMP/EQ, -;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG +;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRCHG, FSCHG ;; Group: FE ;; Latency: 3/4 ;; Issue Rate: 1 (define_insn_reservation "fp_arith" 3 (and (eq_attr "pipe_model" "sh4") - (eq_attr "type" "fp")) + (eq_attr "type" "fp,fp_cmp")) "issue,F01,F2") +;; We don't model the resource usage of this exactly because that would +;; introduce a bogus latency. +(define_insn_reservation "sh4_fpscr_toggle" 1 + (and (eq_attr "pipe_model" "sh4") + (eq_attr "type" "fpscr_toggle")) + "issue") + (define_insn_reservation "fp_arith_ftrc" 3 (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "ftrc_s")) @@ -437,7 +449,7 @@ (define_insn_reservation "fp_double_arith" 8 (and (eq_attr "pipe_model" "sh4") - (eq_attr "type" "dfp_arith")) + (eq_attr "type" "dfp_arith,dfp_mul")) "issue,F01,F1+F2,fpu*4,F2") ;; Double-precision FCMP (FCMP/EQ,FCMP/GT) diff --git a/gcc/config/sh/sh4a.md b/gcc/config/sh/sh4a.md index 163a4e10d85..602c6545ae9 100644 --- a/gcc/config/sh/sh4a.md +++ b/gcc/config/sh/sh4a.md @@ -1,5 +1,5 @@ ;; Scheduling description for Renesas SH4a -;; Copyright (C) 2003, 2004 Free Software Foundation, Inc. +;; Copyright (C) 2003, 2004, 2006 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -98,9 +98,11 @@ ;; MOV ;; Group: MT ;; Latency: 0 +;; ??? not sure if movi8 belongs here, but that's where it was +;; effectively before. 
(define_insn_reservation "sh4a_mov" 0 (and (eq_attr "cpu" "sh4a") - (eq_attr "type" "move")) + (eq_attr "type" "move,movi8,gp_mac")) "ID_or") ;; Load @@ -108,7 +110,7 @@ ;; Latency: 3 (define_insn_reservation "sh4a_load" 3 (and (eq_attr "cpu" "sh4a") - (eq_attr "type" "load,pcload")) + (eq_attr "type" "load,pcload,mem_mac")) "sh4a_ls+sh4a_memory") (define_insn_reservation "sh4a_load_si" 3 @@ -121,7 +123,7 @@ ;; Latency: 0 (define_insn_reservation "sh4a_store" 0 (and (eq_attr "cpu" "sh4a") - (eq_attr "type" "store")) + (eq_attr "type" "store,fstore,mac_mem")) "sh4a_ls+sh4a_memory") ;; CWB TYPE @@ -177,7 +179,7 @@ ;; Latency: 3 (define_insn_reservation "sh4a_fp_arith" 3 (and (eq_attr "cpu" "sh4a") - (eq_attr "type" "fp")) + (eq_attr "type" "fp,fp_cmp,fpscr_toggle")) "ID_or,sh4a_fex") (define_insn_reservation "sh4a_fp_arith_ftrc" 3 @@ -207,7 +209,7 @@ ;; Latency: 5 (define_insn_reservation "sh4a_fp_double_arith" 5 (and (eq_attr "cpu" "sh4a") - (eq_attr "type" "dfp_arith")) + (eq_attr "type" "dfp_arith,dfp_mul")) "ID_or,sh4a_fex*3") ;; Double precision FDIV/SQRT diff --git a/gcc/config/sh/superh.h b/gcc/config/sh/superh.h index 49bb6206d43..65154926e33 100644 --- a/gcc/config/sh/superh.h +++ b/gcc/config/sh/superh.h @@ -75,17 +75,17 @@ Boston, MA 02110-1301, USA. */ on newlib and provide the runtime support */ #undef SUBTARGET_CPP_SPEC #define SUBTARGET_CPP_SPEC \ -"-D__EMBEDDED_CROSS__ %{m4-100*:-D__SH4_100__} %{m4-200*:-D__SH4_200__} %{m4-400:-D__SH4_400__} %{m4-500:-D__SH4_500__} \ +"-D__EMBEDDED_CROSS__ %{m4-100*:-D__SH4_100__} %{m4-200*:-D__SH4_200__} %{m4-300*:-D__SH4_300__} %{m4-340:-D__SH4_340__} %{m4-400:-D__SH4_400__} %{m4-500:-D__SH4_500__} \ %(cppruntime)" /* Override the SUBTARGET_ASM_SPEC to add the runtime support */ #undef SUBTARGET_ASM_SPEC -#define SUBTARGET_ASM_SPEC "%{m4-100*|m4-200*:-isa=sh4} %{m4-400:-isa=sh4-nommu-nofpu} %{m4-500:-isa=sh4-nofpu} %(asruntime)" +#define SUBTARGET_ASM_SPEC "%{m4-100*|m4-200*:-isa=sh4} %{m4-400|m4-340:-isa=sh4-nommu-nofpu} %{m4-500:-isa=sh4-nofpu} %(asruntime)" /* Override the SUBTARGET_ASM_RELAX_SPEC so it doesn't interfere with the runtime support by adding -isa=sh4 in the wrong place. */ #undef SUBTARGET_ASM_RELAX_SPEC -#define SUBTARGET_ASM_RELAX_SPEC "%{!m4-100*:%{!m4-200*:%{!m4-400:%{!m4-500:-isa=sh4}}}}" +#define SUBTARGET_ASM_RELAX_SPEC "%{!m4-100*:%{!m4-200*:%{!m4-300*:%{!m4-340:%{!m4-400:%{!m4-500:-isa=sh4}}}}}}" /* Create the CC1_SPEC to add the runtime support */ #undef CC1_SPEC @@ -102,7 +102,7 @@ Boston, MA 02110-1301, USA. */ /* Override STARTFILE_SPEC to add profiling and MMU support. */ #undef STARTFILE_SPEC #define STARTFILE_SPEC \ - "%{!shared: %{!m4-400*: %{pg:gcrt1-mmu.o%s}%{!pg:crt1-mmu.o%s}}} \ - %{!shared: %{m4-400*: %{pg:gcrt1.o%s}%{!pg:crt1.o%s}}} \ + "%{!shared: %{!m4-400*:%{!m4-340*: %{pg:gcrt1-mmu.o%s}%{!pg:crt1-mmu.o%s}}}} \ + %{!shared: %{m4-340*|m4-400*: %{pg:gcrt1.o%s}%{!pg:crt1.o%s}}} \ crti.o%s \ %{!shared:crtbegin.o%s} %{shared:crtbeginS.o%s}" diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh index 3ebc09d6e3c..56b6ba1c55a 100644 --- a/gcc/config/sh/t-sh +++ b/gcc/config/sh/t-sh @@ -38,11 +38,12 @@ MULTILIB_DIRNAMES= # is why sh2a and sh2a-single need their own multilibs. 
MULTILIB_MATCHES = $(shell \
 	multilibs="$(MULTILIB_OPTIONS)" ; \
-	for abi in m1,m2,m3,m4-nofpu,m4-400,m4-500,m4al,m4a-nofpu m1,m2,m2a-nofpu \
-	  m2e,m3e,m4-single-only,m4-100-single-only,m4-200-single-only,m4a-single-only \
+	for abi in m1,m2,m3,m4-nofpu,m4-100-nofpu,m4-200-nofpu,m4-400,m4-500,m4-340,m4-300-nofpu,m4al,m4a-nofpu \
+	  m1,m2,m2a-nofpu \
+	  m2e,m3e,m4-single-only,m4-100-single-only,m4-200-single-only,m4-300-single-only,m4a-single-only \
 	  m2e,m2a-single-only \
-	  m4-single,m4-100-single,m4-200-single,m4a-single \
-	  m4,m4-100,m4-200,m4a \
+	  m4-single,m4-100-single,m4-200-single,m4-300-single,m4a-single \
+	  m4,m4-100,m4-200,m4-300,m4a \
 	  m5-32media,m5-compact,m5-32media \
 	  m5-32media-nofpu,m5-compact-nofpu,m5-32media-nofpu; do \
 	  subst= ; \
@@ -76,7 +77,7 @@ gt-sh.h : s-gtype ; @true
 IC_EXTRA_PARTS= libic_invalidate_array_4-100.a libic_invalidate_array_4-200.a \
   libic_invalidate_array_4a.a
-OPT_EXTRA_PARTS= libgcc-Os-4-200.a
+OPT_EXTRA_PARTS= libgcc-Os-4-200.a libgcc-4-300.a
 EXTRA_MULTILIB_PARTS= $(IC_EXTRA_PARTS) $(OPT_EXTRA_PARTS)
 
 $(T)ic_invalidate_array_4-100.o: $(srcdir)/config/sh/lib1funcs.asm $(GCC_PASSES)
@@ -104,6 +105,12 @@ OBJS_Os_4_200=$(T)sdivsi3_i4i-Os-4-200.o $(T)udivsi3_i4i-Os-4-200.o $(T)unwind-d
 $(T)libgcc-Os-4-200.a: $(OBJS_Os_4_200) $(GCC_PASSES)
 	$(AR_CREATE_FOR_TARGET) $@ $(OBJS_Os_4_200)
 
+$(T)div_table-4-300.o: $(srcdir)/config/sh/lib1funcs-4-300.asm $(GCC_PASSES)
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -c -o $@ -DL_div_table -x assembler-with-cpp $<
+
+$(T)libgcc-4-300.a: $(T)div_table-4-300.o $(GCC_PASSES)
+	$(AR_CREATE_FOR_TARGET) $@ $(T)div_table-4-300.o
+
 # Local Variables:
 # mode: Makefile
 # End: