diff options
author | mkuvyrkov <mkuvyrkov@138bc75d-0d04-0410-961f-82ee72b054a4> | 2010-12-02 15:48:36 +0000 |
---|---|---|
committer | mkuvyrkov <mkuvyrkov@138bc75d-0d04-0410-961f-82ee72b054a4> | 2010-12-02 15:48:36 +0000 |
commit | ef9d55ed8cd17df5cde33d360f288b8e64202570 (patch) | |
tree | 68a0c47a5abe78db7a2861ab49a4d06ed4ef9c93 | |
parent | 9a51684228b88f3f16f62d81c19f7e1a08f0ff1d (diff) | |
download | gcc-ef9d55ed8cd17df5cde33d360f288b8e64202570.tar.gz |
DFA model for Core 2 and Core i7 scheduling.
* config/i386/core2.md: New DFA model for Core 2 and Core i7.
* config/i386/i386.c (ix86_option_override_internal): Update entries
for Core 2 and Core i7.
(ix86_issue_rate): Set issue rate to 3 [be realistic].
* config/i386/i386.md: Include "core2.md".
(define_attr cpu): Add "corei7".
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@167375 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r-- | gcc/ChangeLog | 11 | ||||
-rw-r--r-- | gcc/config/i386/core2.md | 691 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 10 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 5 |
4 files changed, 711 insertions, 6 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6d851c1350d..f73b43b983f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2010-12-02 Bernd Schmidt <bernds@codesourcery.com> + + DFA model for Core 2 and Core i7 scheduling. + + * config/i386/core2.md: New DFA model for Core 2 and Core i7. + * config/i386/i386.c (ix86_option_override_internal): Update entries + for Core 2 and Core i7. + (ix86_issue_rate): Set issue rate to 3 [be realistic]. + * config/i386/i386.md: Include "core2.md". + (define_attr cpu): Add "corei7". + 2010-12-02 Maxim Kuvyrkov <maxim@codesourcery.com> Define tuning for Core 2 and Core i7. diff --git a/gcc/config/i386/core2.md b/gcc/config/i386/core2.md new file mode 100644 index 00000000000..d154cdc077f --- /dev/null +++ b/gcc/config/i386/core2.md @@ -0,0 +1,691 @@ +;; Scheduling for Core 2 and derived processors. +;; Copyright (C) 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. */ + +;; The scheduling description in this file is based on the one in ppro.md, +;; with additional information obtained from +;; +;; "How to optimize for the Pentium family of microprocessors", +;; by Agner Fog, PhD. +;; +;; The major difference from the P6 pipeline is one extra decoder, and +;; one extra execute unit. Due to micro-op fusion, many insns no longer +;; need to be decoded in decoder 0, but can be handled by all of them. + +;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to +;; model issue latencies of idiv, fdiv and ssediv type insns. +(define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store") + +;; The CPU domain, used for Core i7 bypass latencies +(define_attr "i7_domain" "int,float,simd" + (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") + (const_string "float") + (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt, + ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg") + (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF") + (const_string "float") + (eq_attr "mode" "SI") + (const_string "int")] + (const_string "simd")) + (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") + (const_string "simd")] + (const_string "int"))) + +;; As for the Pentium Pro, +;; - an instruction with 1 uop can be decoded by any of the three +;; decoders in one cycle. +;; - an instruction with 1 to 4 uops can be decoded only by decoder 0 +;; but still in only one cycle. +;; - a complex (microcode) instruction can also only be decoded by +;; decoder 0, and this takes an unspecified number of cycles. +;; +;; The goal is to schedule such that we have a few-one-one uops sequence +;; in each cycle, to decode as many instructions per cycle as possible. +(define_cpu_unit "c2_decoder0" "core2_decoder") +(define_cpu_unit "c2_decoder1" "core2_decoder") +(define_cpu_unit "c2_decoder2" "core2_decoder") +(define_cpu_unit "c2_decoder3" "core2_decoder") + +;; We first wish to find an instruction for c2_decoder0, so exclude +;; c2_decoder1 and c2_decoder2 from being reserved until c2_decoder 0 is +;; reserved. +(presence_set "c2_decoder1" "c2_decoder0") +(presence_set "c2_decoder2" "c2_decoder0") +(presence_set "c2_decoder3" "c2_decoder0") + +;; Most instructions can be decoded on any of the three decoders. +(define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)") + +;; The out-of-order core has six pipelines. These are similar to the +;; Pentium Pro's five pipelines. Port 2 is responsible for memory loads, +;; port 3 for store address calculations, port 4 for memory stores, and +;; ports 0, 1 and 5 for everything else. + +(define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core") +(define_cpu_unit "c2_p2" "core2_load") +(define_cpu_unit "c2_p3,c2_p4" "core2_store") +(define_cpu_unit "c2_idiv" "core2_idiv") +(define_cpu_unit "c2_fdiv" "core2_fdiv") +(define_cpu_unit "c2_ssediv" "core2_ssediv") + +;; Only the irregular instructions have to be modeled here. A load +;; increases the latency by 2 or 3, or by nothing if the manual gives +;; a latency already. Store latencies are not accounted for. +;; +;; The simple instructions follow a very regular pattern of 1 uop per +;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store +;; on port 4 and port 3. These instructions are modelled at the bottom +;; of this file. +;; +;; For microcoded instructions we don't know how many uops are produced. +;; These instructions are the "complex" ones in the Intel manuals. All +;; we _do_ know is that they typically produce four or more uops, so +;; they can only be decoded on c2_decoder0. Modelling their latencies +;; doesn't make sense because we don't know how these instructions are +;; executed in the core. So we just model that they can only be decoded +;; on decoder 0, and say that it takes a little while before the result +;; is available. +(define_insn_reservation "c2_complex_insn" 6 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "other,multi,str")) + "c2_decoder0") + +(define_insn_reservation "c2_call" 1 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "call,callv")) + "c2_decoder0") + +;; imov with memory operands does not use the integer units. +;; imovx always decodes to one uop, and also doesn't use the integer +;; units if it has memory operands. +(define_insn_reservation "c2_imov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "imov,imovx"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_imov_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "imov,imovx"))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_imov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "imov"))) + "c2_decodern,c2_p4+c2_p3") + +(define_insn_reservation "c2_icmov" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "icmov"))) + "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2") + +(define_insn_reservation "c2_icmov_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "icmov"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2") + +(define_insn_reservation "c2_push_reg" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "push"))) + "c2_decodern,c2_p4+c2_p3") + +(define_insn_reservation "c2_push_mem" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "push"))) + "c2_decoder0,c2_p2,c2_p4+c2_p3") + +;; lea executes on port 0 with latency one and throughput 1. +(define_insn_reservation "c2_lea" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "lea"))) + "c2_decodern,c2_p0") + +;; Shift and rotate decode as two uops which can go to port 0 or 5. +;; The load and store units need to be reserved when memory operands +;; are involved. +(define_insn_reservation "c2_shift_rotate" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "c2_decodern,(c2_p0|c2_p5)") + +(define_insn_reservation "c2_shift_rotate_mem" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ishift,ishift1,rotate,rotate1"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3") + +;; See comments in ppro.md for the corresponding reservation. +(define_insn_reservation "c2_branch" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ibr"))) + "c2_decodern,c2_p5") + +;; ??? Indirect branches probably have worse latency than this. +(define_insn_reservation "c2_indirect_branch" 6 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (eq_attr "type" "ibr"))) + "c2_decoder0,c2_p2+c2_p5") + +(define_insn_reservation "c2_leave" 4 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "leave")) + "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)") + +;; mul and imul with two/three operands only execute on port 1 for HImode +;; and SImode, port 0 for DImode. +(define_insn_reservation "c2_imul_hisi" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_imul_hisi_mem" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "HI,SI") + (eq_attr "type" "imul")))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_imul_di" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_imul_di_mem" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DI") + (eq_attr "type" "imul")))) + "c2_decoder0,c2_p2+c2_p0") + +;; div and idiv are very similar, so we model them the same. +;; QI, HI, and SI have issue latency 12, 21, and 37, respectively. +;; These issue latencies are modelled via the c2_div automaton. +(define_insn_reservation "c2_idiv_QI" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") + +(define_insn_reservation "c2_idiv_QI_load" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "QI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9") + +(define_insn_reservation "c2_idiv_HI" 23 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17") + +(define_insn_reservation "c2_idiv_HI_load" 23 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "HI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18") + +(define_insn_reservation "c2_idiv_SI" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33") + +(define_insn_reservation "c2_idiv_SI_load" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SI") + (eq_attr "type" "idiv")))) + "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34") + +;; x87 floating point operations. + +(define_insn_reservation "c2_fxch" 0 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fxch")) + "c2_decodern") + +(define_insn_reservation "c2_fop" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "fop"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_fop_load" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p2+c2_p1,c2_p1") + +(define_insn_reservation "c2_fop_store" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3") + +(define_insn_reservation "c2_fop_both" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "fop"))) + "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3") + +(define_insn_reservation "c2_fsgn" 1 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fsgn")) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_fistp" 5 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fistp")) + "c2_decoder0,c2_p0*2,c2_p4+c2_p3") + +(define_insn_reservation "c2_fcmov" 2 + (and (eq_attr "cpu" "core2,corei7") + (eq_attr "type" "fcmov")) + "c2_decoder0,c2_p0*2") + +(define_insn_reservation "c2_fcmp" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fcmp"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_fcmp_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fcmp"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_fmov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmov"))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_fmov_load" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_fmov_XF_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "c2_decoder0,(c2_p2+c2_p0)*2") + +(define_insn_reservation "c2_fmov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "!XF") + (eq_attr "type" "fmov")))) + "c2_decodern,c2_p3+c2_p4") + +(define_insn_reservation "c2_fmov_XF_store" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fmov")))) + "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)") + +;; fmul executes on port 0 with latency 5. It has issue latency 2, +;; but we don't model this. +(define_insn_reservation "c2_fmul" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "fmul"))) + "c2_decoder0,c2_p0*2") + +(define_insn_reservation "c2_fmul_load" 6 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "fmul"))) + "c2_decoder0,c2_p2+c2_p0,c2_p0") + +;; fdiv latencies depend on the mode of the operands. XFmode gives +;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18. +;; Division by a power of 2 takes only 9 cycles, but we cannot model +;; that. Throughput is equal to latency - 1, which we model using the +;; c2_div automaton. +(define_insn_reservation "c2_fdiv_SF" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16") + +(define_insn_reservation "c2_fdiv_SF_load" 19 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16") + +(define_insn_reservation "c2_fdiv_DF" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30") + +(define_insn_reservation "c2_fdiv_DF_load" 33 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30") + +(define_insn_reservation "c2_fdiv_XF" 38 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36") + +(define_insn_reservation "c2_fdiv_XF_load" 39 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "XF") + (eq_attr "type" "fdiv,fpspc")))) + "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36") + +;; MMX instructions. + +(define_insn_reservation "c2_mmx_add" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxadd,sseiadd"))) + "c2_decodern,c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_add_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxadd,sseiadd"))) + "c2_decodern,c2_p2+c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_shft" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxshft"))) + "c2_decodern,c2_p0|c2_p5") + +(define_insn_reservation "c2_mmx_shft_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "mmxshft"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "!0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "!0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft1" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_sse_shft1_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "type" "sseishft") + (eq_attr "length_immediate" "0")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_mul" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_mmx_mul_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "mmxmul,sseimul"))) + "c2_decoder0,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_mmxcvt" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "DI") + (eq_attr "type" "mmxcvt"))) + "c2_decodern,c2_p1") + +;; FIXME: These are Pentium III only, but we cannot tell here if +;; we're generating code for PentiumPro/Pentium II or Pentium III +;; (define_insn_reservation "c2_sse_mmxshft" 2 +;; (and (eq_attr "cpu" "core2,corei7") +;; (and (eq_attr "mode" "TI") +;; (eq_attr "type" "mmxshft"))) +;; "c2_decodern,c2_p0") + +;; The sfence instruction. +(define_insn_reservation "c2_sse_sfence" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "unknown") + (eq_attr "type" "sse"))) + "c2_decoder0,c2_p4+c2_p3") + +;; FIXME: This reservation is all wrong when we're scheduling sqrtss. +(define_insn_reservation "c2_sse_SFDF" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "SF,DF") + (eq_attr "type" "sse"))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_V4SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "mode" "V4SF") + (eq_attr "type" "sse"))) + "c2_decoder0,c2_p1*2") + +(define_insn_reservation "c2_sse_addcmp" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "sseadd,ssecmp,ssecomi"))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_addcmp_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "sseadd,ssecmp,ssecomi"))) + "c2_decodern,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_mul_SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_mul_SF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p2+c2_p0") + +(define_insn_reservation "c2_sse_mul_DF" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p0") + +(define_insn_reservation "c2_sse_mul_DF_load" 5 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssemul")))) + "c2_decodern,c2_p2+c2_p0") + +(define_insn_reservation "c2_sse_div_SF" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssediv")))) + "c2_decodern,c2_p0,c2_ssediv*17") + +(define_insn_reservation "c2_sse_div_SF_load" 18 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF,V4SF") + (eq_attr "type" "ssediv")))) + "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17") + +(define_insn_reservation "c2_sse_div_DF" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssediv")))) + "c2_decodern,c2_p0,c2_ssediv*31") + +(define_insn_reservation "c2_sse_div_DF_load" 32 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF,V2DF") + (eq_attr "type" "ssediv")))) + "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31") + +;; FIXME: these have limited throughput +(define_insn_reservation "c2_sse_icvt_SF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_icvt_SF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "SF") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p2+c2_p1") + +(define_insn_reservation "c2_sse_icvt_DF" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "sseicvt")))) + "c2_decoder0,c2_p0+c2_p1") + +(define_insn_reservation "c2_sse_icvt_DF_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "DF") + (eq_attr "type" "sseicvt")))) + "c2_decoder0,(c2_p2+c2_p1)") + +(define_insn_reservation "c2_sse_icvt_SI" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt")))) + "c2_decodern,c2_p1") + +(define_insn_reservation "c2_sse_icvt_SI_load" 3 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "SI") + (eq_attr "type" "sseicvt")))) + "c2_decodern,(c2_p2+c2_p1)") + +(define_insn_reservation "c2_sse_mov" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none") + (eq_attr "type" "ssemov"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_sse_mov_load" 2 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "ssemov"))) + "c2_decodern,c2_p2") + +(define_insn_reservation "c2_sse_mov_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "ssemov"))) + "c2_decodern,c2_p4+c2_p3") + +;; All other instructions are modelled as simple instructions. +;; We have already modelled all i387 floating point instructions, so all +;; other instructions execute on either port 0, 1 or 5. This includes +;; the ALU units, and the MMX units. +;; +;; reg-reg instructions produce 1 uop so they can be decoded on any of +;; the three decoders. Loads benefit from micro-op fusion and can be +;; treated in the same way. +(define_insn_reservation "c2_insn" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "none,unknown") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) + "c2_decodern,(c2_p0|c2_p1|c2_p5)") + +(define_insn_reservation "c2_insn_load" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "load") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) + "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)") + +;; register-memory instructions have three uops, so they have to be +;; decoded on c2_decoder0. +(define_insn_reservation "c2_insn_store" 1 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "store") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp"))) + "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") + +;; read-modify-store instructions produce 4 uops so they have to be +;; decoded on c2_decoder0 as well. +(define_insn_reservation "c2_insn_both" 4 + (and (eq_attr "cpu" "core2,corei7") + (and (eq_attr "memory" "both") + (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp"))) + "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3") diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 93c105dd658..f5fc2b607a6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -3231,10 +3231,10 @@ ix86_option_override_internal (bool main_args_p) {"nocona", PROCESSOR_NOCONA, CPU_NONE, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_NO_SAHF}, - {"core2", PROCESSOR_CORE2_64, CPU_GENERIC64, + {"core2", PROCESSOR_CORE2_64, CPU_CORE2, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16}, - {"corei7", PROCESSOR_COREI7_64, CPU_GENERIC64, + {"corei7", PROCESSOR_COREI7_64, CPU_COREI7, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16}, {"atom", PROCESSOR_ATOM, CPU_ATOM, @@ -3613,12 +3613,10 @@ ix86_option_override_internal (bool main_args_p) case PROCESSOR_CORE2_64: ix86_tune = PROCESSOR_CORE2_32; - ix86_schedule = CPU_PENTIUMPRO; break; case PROCESSOR_COREI7_64: ix86_tune = PROCESSOR_COREI7_32; - ix86_schedule = CPU_PENTIUMPRO; break; default: @@ -22173,6 +22171,10 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUMPRO: case PROCESSOR_PENTIUM4: + case PROCESSOR_CORE2_32: + case PROCESSOR_CORE2_64: + case PROCESSOR_COREI7_32: + case PROCESSOR_COREI7_64: case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 2002dd68509..88e6245105b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -356,8 +356,8 @@ ;; Processor type. -(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,atom, - generic64,amdfam10,bdver1" +(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,corei7, + atom,generic64,amdfam10,bdver1" (const (symbol_ref "ix86_schedule"))) ;; A basic instruction type. Refinements due to arguments to be @@ -937,6 +937,7 @@ (include "bdver1.md") (include "geode.md") (include "atom.md") +(include "core2.md") ;; Operand and operator predicates and constraints |