From af2728a4b41b45ab6d24a4142070a8cf620c1ebc Mon Sep 17 00:00:00 2001 From: Jeffrey A Law Date: Thu, 9 May 2002 23:41:39 +0000 Subject: athlon.md, [...]: New files. * athlon.md, k6.md, pentium.md, ppro.md: New files. * i386.md: Move scheduling information into new files. From-SVN: r53350 --- gcc/ChangeLog | 3 + gcc/config/i386/athlon.md | 206 +++++++++++++ gcc/config/i386/i386.md | 733 +-------------------------------------------- gcc/config/i386/k6.md | 136 +++++++++ gcc/config/i386/pentium.md | 306 +++++++++++++++++++ gcc/config/i386/ppro.md | 150 ++++++++++ 6 files changed, 805 insertions(+), 729 deletions(-) create mode 100644 gcc/config/i386/athlon.md create mode 100644 gcc/config/i386/k6.md create mode 100644 gcc/config/i386/pentium.md create mode 100644 gcc/config/i386/ppro.md diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 307bdaefaf6..19c06885752 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -62,6 +62,9 @@ doc: Thu May 9 11:50:09 2002 Jeffrey A Law (law@cygnus.com) + * athlon.md, k6.md, pentium.md, ppro.md: New files. + * i386.md: Move scheduling information into new files. + * i386.md (type attribute): Add "rotate" for rotate insns. (rotate insns): Set type to "rotate". (various attributes and function units): Treat rotate like shift. diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md new file mode 100644 index 00000000000..d6a52f2cbdd --- /dev/null +++ b/gcc/config/i386/athlon.md @@ -0,0 +1,206 @@ +;; AMD Athlon Scheduling +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. 
+;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. +(define_attr "athlon_decode" "direct,vector" + (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld") + (const_string "vector") + (and (eq_attr "type" "push") + (match_operand 1 "memory_operand" "")) + (const_string "vector") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load,store") + (eq_attr "mode" "XF"))) + (const_string "vector")] + (const_string "direct"))) + +;; The Athlon does contain three pipelined FP units, three integer units and +;; three address generation units. +;; +;; The predecode logic is determining boundaries of instructions in the 64 +;; byte cache line. So the cache line straddling problem of K6 might be issue +;; here as well, but it is not noted in the documentation. +;; +;; Three DirectPath instructions decoders and only one VectorPath decoder +;; is available. They can decode three DirectPath instructions or one VectorPath +;; instruction per cycle. +;; Decoded macro instructions are then passed to 72 entry instruction control +;; unit, that passes +;; it to the specialized integer (18 entry) and fp (36 entry) schedulers. +;; +;; The load/store queue unit is not attached to the schedulers but +;; communicates with all the execution units separately instead. 
+ +(define_function_unit "athlon_vectordec" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_decode" "vector")) + 1 1) + +(define_function_unit "athlon_directdec" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_decode" "direct")) + 1 1) + +(define_function_unit "athlon_vectordec" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_decode" "direct")) + 1 1 [(eq_attr "athlon_decode" "vector")]) + +(define_function_unit "athlon_ieu" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push")) + 1 1) + +(define_function_unit "athlon_ieu" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "str")) + 15 15) + +(define_function_unit "athlon_ieu" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "imul")) + 5 0) + +(define_function_unit "athlon_ieu" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "idiv")) + 42 0) + +(define_function_unit "athlon_muldiv" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "imul")) + 5 0) + +(define_function_unit "athlon_muldiv" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "idiv")) + 42 42) + +(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any" + (cond [(eq_attr "type" "fop,fcmp,fistp") + (const_string "add") + (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov") + (const_string "mul") + (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both")) + (const_string "store") + (and (eq_attr "type" "fmov") (eq_attr "memory" "load")) + (const_string "any") + (and (eq_attr "type" "fmov") + (ior (match_operand:SI 1 "register_operand" "") + (match_operand 1 "immediate_operand" ""))) + (const_string "store") + (eq_attr "type" "fmov") + (const_string "muladd")] + (const_string "none"))) + +;; We use latencies 1 for definitions. This is OK to model collisions +;; in execution units. The real latencies are modeled in the "fp" pipeline. 
+ +;; fsin, fcos: 96-192 +;; fsincos: 107-211 +;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode. +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fpspc")) + 100 1) + +;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode. +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fdiv")) + 24 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fop,fmul,fistp")) + 4 1) + +;; XFmode loads are slow. +;; XFmode store is slow too (8 cycles), but we don't need to model it, because +;; there are no dependent instructions. + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + 10 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fmov,fsgn")) + 2 1) + +;; fcmp and ftst instructions +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (eq_attr "athlon_decode" "direct"))) + 3 1) + +;; fcmpi instructions. 
+(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (eq_attr "athlon_decode" "vector"))) + 3 1) + +(define_function_unit "athlon_fp" 3 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fcmov")) + 7 1) + +(define_function_unit "athlon_fp_mul" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "mul")) + 1 1) + +(define_function_unit "athlon_fp_add" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "add")) + 1 1) + +(define_function_unit "athlon_fp_muladd" 2 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "muladd,mul,add")) + 1 1) + +(define_function_unit "athlon_fp_store" 1 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "athlon_fpunits" "store")) + 1 1) + +;; We don't need to model the Address Generation Unit, since we don't model +;; the re-order buffer yet and thus we never schedule more than three operations +;; at time. Later we may want to experiment with MD_SCHED macros modeling the +;; decoders independently on the functional units. + +;(define_function_unit "athlon_agu" 3 0 +; (and (eq_attr "cpu" "athlon") +; (and (eq_attr "memory" "!none") +; (eq_attr "athlon_fpunits" "none"))) +; 1 1) + +;; Model load unit to avoid too long sequences of loads. We don't need to +;; model store queue, since it is hardly going to be bottleneck. + +(define_function_unit "athlon_load" 2 0 + (and (eq_attr "cpu" "athlon") + (eq_attr "memory" "load,both")) + 1 1) + diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index e99bd0ad6c2..fea9d981c06 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -322,735 +322,10 @@ [(set_attr "length" "128") (set_attr "type" "multi")]) -;; Pentium Scheduling -;; -;; The Pentium is an in-order core with two integer pipelines. - -;; True for insns that behave like prefixed insns on the Pentium. 
-(define_attr "pent_prefix" "false,true" - (if_then_else (ior (eq_attr "prefix_0f" "1") - (ior (eq_attr "prefix_data16" "1") - (eq_attr "prefix_rep" "1"))) - (const_string "true") - (const_string "false"))) - -;; Categorize how an instruction slots. - -;; The non-MMX Pentium slots an instruction with prefixes on U pipe only, -;; while MMX Pentium can slot it on either U or V. Model non-MMX Pentium -;; rules, because it results in noticeably better code on non-MMX Pentium -;; and doesn't hurt much on MMX. (Prefixed instructions are not very -;; common, so the scheduler usualy has a non-prefixed insn to pair). - -(define_attr "pent_pair" "uv,pu,pv,np" - (cond [(eq_attr "imm_disp" "true") - (const_string "np") - (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec") - (and (eq_attr "type" "pop,push") - (eq_attr "memory" "!both"))) - (if_then_else (eq_attr "pent_prefix" "true") - (const_string "pu") - (const_string "uv")) - (eq_attr "type" "ibr") - (const_string "pv") - (and (eq_attr "type" "ishift") - (match_operand 2 "const_int_operand" "")) - (const_string "pu") - (and (eq_attr "type" "rotate") - (match_operand 2 "const_int_1_operand" "")) - (const_string "pu") - (and (eq_attr "type" "call") - (match_operand 0 "constant_call_address_operand" "")) - (const_string "pv") - (and (eq_attr "type" "callv") - (match_operand 1 "constant_call_address_operand" "")) - (const_string "pv") - ] - (const_string "np"))) - -(define_automaton "pentium,pentium_fpu") - -;; Pentium do have U and V pipes. Instruction to both pipes -;; are alwyas issued together, much like on VLIW. -;; -;; predecode -;; / \ -;; decodeu decodev -;; / | | -;; fpu executeu executev -;; | | | -;; fpu retire retire -;; | -;; fpu -;; We add dummy "port" pipes allocated only first cycle of -;; instruction to specify this behaviour. 
- -(define_cpu_unit "pentium-portu,pentium-portv" "pentium") -(define_cpu_unit "pentium-u,pentium-v" "pentium") -(absence_set "pentium-portu" "pentium-u,pentium-v") -(presence_set "pentium-portv" "pentium-portu") - -;; Floating point instructions can overlap with new issue of integer -;; instructions. We model only first cycle of FP pipeline, as it is -;; fully pipelined. -(define_cpu_unit "pentium-fp" "pentium_fpu") - -;; There is non-pipelined multiplier unit used for complex operations. -(define_cpu_unit "pentium-fmul" "pentium_fpu") - -;; Pentium preserves memory ordering, so when load-execute-store -;; instruction is executed together with other instruction loading -;; data, the execution of the other instruction is delayed to very -;; last cycle of first instruction, when data are bypassed. -;; We model this by allocating "memory" unit when store is pending -;; and using conflicting load units together. - -(define_cpu_unit "pentium-memory" "pentium") -(define_cpu_unit "pentium-load0" "pentium") -(define_cpu_unit "pentium-load1" "pentium") -(absence_set "pentium-load0,pentium-load1" "pentium-memory") - -(define_reservation "pentium-load" "(pentium-load0 | pentium-load1)") -(define_reservation "pentium-np" "(pentium-u + pentium-v)") -(define_reservation "pentium-uv" "(pentium-u | pentium-v)") -(define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)") -(define_reservation "pentium-firstu" "(pentium-u + pentium-portu)") -(define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)") -(define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)") -(define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)") -(define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)") -(define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv) - | (pentium-firstv,pentium-v, - (pentium-load+pentium-firstv))") -(define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu - + pentium-memory)") 
-(define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstu - + pentium-memory)") -(define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv - + pentium-memory) - | (pentium-firstv,pentium-v, - (pentium-load+pentium-firstv))") - -;; Few common long latency instructions -(define_insn_reservation "pent_mul" 11 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "imul")) - "pentium-np*11") - -(define_insn_reservation "pent_str" 12 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "str")) - "pentium-np*12") - -;; Integer division and some other long latency instruction block all -;; units, including the FP pipe. There is no value in modeling the -;; latency of these instructions and not modeling the latency -;; decreases the size of the DFA. -(define_insn_reservation "pent_block" 1 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "idiv")) - "pentium-np+pentium-fp") - -(define_insn_reservation "pent_cld" 2 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "cld")) - "pentium-np*2") - -;; Moves usually have one cycle penalty, but there are exceptions. -(define_insn_reservation "pent_fmov" 1 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "type" "fmov") - (eq_attr "memory" "none,load"))) - "(pentium-fp+pentium-np)") - -(define_insn_reservation "pent_fpmovxf" 3 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "type" "fmov") - (and (eq_attr "memory" "load,store") - (eq_attr "mode" "XF")))) - "(pentium-fp+pentium-np)*3") - -(define_insn_reservation "pent_fpstore" 2 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "type" "fmov") - (ior (match_operand 1 "immediate_operand" "") - (eq_attr "memory" "store")))) - "(pentium-fp+pentium-np)*2") - -(define_insn_reservation "pent_imov" 1 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "imov")) - "pentium-firstuv") - -;; Push and pop instructions have 1 cycle latency and special -;; hardware bypass allows them to be paired with other push,pop -;; and call instructions. 
-(define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call") -(define_insn_reservation "pent_push" 1 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "type" "push") - (eq_attr "memory" "store"))) - "pentium-firstuv") - -(define_insn_reservation "pent_pop" 1 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "pop")) - "pentium-firstuv") - -;; Call and branch instruction can execute in either pipe, but -;; they are only pairable when in the v pipe. -(define_insn_reservation "pent_call" 10 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "call,callv")) - "pentium-firstv,pentium-v*9") - -(define_insn_reservation "pent_branch" 1 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "ibr")) - "pentium-firstv") - -;; Floating point instruction dispatch in U pipe, but continue -;; in FP pipeline allowing other isntructions to be executed. -(define_insn_reservation "pent_fp" 3 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "fop,fistp")) - "(pentium-firstu+pentium-fp),nothing,nothing") - -;; First two cycles of fmul are not pipelined. -(define_insn_reservation "pent_fmul" 3 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "fmul")) - "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing") - -;; Long latency FP instructions overlap with integer instructions, -;; but only last 2 cycles with FP ones. -(define_insn_reservation "pent_fdiv" 39 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "fdiv")) - "(pentium-np+pentium-fp+pentium-fmul), - (pentium-fp+pentium-fmul)*36,pentium-fmul*2") - -(define_insn_reservation "pent_fpspc" 70 - (and (eq_attr "cpu" "pentium") - (eq_attr "type" "fpspc")) - "(pentium-np+pentium-fp+pentium-fmul), - (pentium-fp+pentium-fmul)*67,pentium-fmul*2") - -;; Integer instructions. Load/execute/store takes 3 cycles, -;; load/execute 2 cycles and execute only one cycle. 
-(define_insn_reservation "pent_uv_both" 3 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "uv") - (eq_attr "memory" "both"))) - "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv") - -(define_insn_reservation "pent_u_both" 3 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pu") - (eq_attr "memory" "both"))) - "pentium-firstuboth,pentium-u+pentium-memory,pentium-u") - -(define_insn_reservation "pent_v_both" 3 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pv") - (eq_attr "memory" "both"))) - "pentium-firstvboth,pentium-v+pentium-memory,pentium-v") - -(define_insn_reservation "pent_np_both" 3 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "np") - (eq_attr "memory" "both"))) - "pentium-np,pentium-np,pentium-np") - -(define_insn_reservation "pent_uv_load" 2 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "uv") - (eq_attr "memory" "load"))) - "pentium-firstuvload,pentium-uv") - -(define_insn_reservation "pent_u_load" 2 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pu") - (eq_attr "memory" "load"))) - "pentium-firstuload,pentium-u") - -(define_insn_reservation "pent_v_load" 2 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pv") - (eq_attr "memory" "load"))) - "pentium-firstvload,pentium-v") - -(define_insn_reservation "pent_np_load" 2 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "np") - (eq_attr "memory" "load"))) - "pentium-np,pentium-np") - -(define_insn_reservation "pent_uv" 1 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "uv") - (eq_attr "memory" "none"))) - "pentium-firstuv") - -(define_insn_reservation "pent_u" 1 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pu") - (eq_attr "memory" "none"))) - "pentium-firstu") - -(define_insn_reservation "pent_v" 1 - (and (eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "pv") - (eq_attr "memory" "none"))) - "pentium-firstv") - -(define_insn_reservation "pent_np" 1 - (and 
(eq_attr "cpu" "pentium") - (and (eq_attr "pent_pair" "np") - (eq_attr "memory" "none"))) - "pentium-np") - - -;; Pentium Pro/PII Scheduling -;; -;; The PPro has an out-of-order core, but the instruction decoders are -;; naturally in-order and asymmetric. We get best performance by scheduling -;; for the decoders, for in doing so we give the oo execution unit the -;; most choices. - -;; Categorize how many uops an ia32 instruction evaluates to: -;; one -- an instruction with 1 uop can be decoded by any of the -;; three decoders. -;; few -- an instruction with 1 to 4 uops can be decoded only by -;; decoder 0. -;; many -- a complex instruction may take an unspecified number of -;; cycles to decode in decoder 0. - -(define_attr "ppro_uops" "one,few,many" - (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str") - (const_string "many") - (eq_attr "type" "icmov,fcmov,str,cld") - (const_string "few") - (eq_attr "type" "imov") - (if_then_else (eq_attr "memory" "store,both") - (const_string "few") - (const_string "one")) - (eq_attr "memory" "!none") - (const_string "few") - ] - (const_string "one"))) - -;; Rough readiness numbers. Fine tuning happens in i386.c. -;; -;; p0 describes port 0. -;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either. -;; p2 describes port 2 for loads. -;; p34 describes ports 3 and 4 for stores. -;; fpu describes the fpu accessed via port 0. -;; ??? It is less than clear if there are separate fadd and fmul units -;; that could operate in parallel. -;; -;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "ishift,rotate,lea,ibr,cld")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imul")) - 4 1) - -;; ??? Does the divider lock out the pipe while it works, -;; or is there a disconnected unit? 
-(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "idiv")) - 17 17) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fop,fsgn,fistp")) - 3 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fcmov")) - 2 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fcmp")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmov")) - 1 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmul")) - 5 1) - -(define_function_unit "ppro_p0" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fdiv,fpspc")) - 56 1) - -(define_function_unit "ppro_p01" 2 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "!imov,fmov")) - 1 1) - -(define_function_unit "ppro_p01" 2 0 - (and (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imov,fmov")) - (eq_attr "memory" "none")) - 1 1) - -(define_function_unit "ppro_p2" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (ior (eq_attr "type" "pop") - (eq_attr "memory" "load,both"))) - 3 1) - -(define_function_unit "ppro_p34" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (ior (eq_attr "type" "push") - (eq_attr "memory" "store,both"))) - 1 1) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp")) - 1 1) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fmul")) - 5 2) - -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "fdiv,fpspc")) - 56 56) - -;; imul uses the fpu. ??? does it have the same throughput as fmul? -(define_function_unit "fpu" 1 0 - (and (eq_attr "cpu" "pentiumpro") - (eq_attr "type" "imul")) - 4 1) - -;; AMD K6/K6-2 Scheduling -;; -;; The K6 has similar architecture to PPro. 
Important difference is, that -;; there are only two decoders and they seems to be much slower than execution -;; units. So we have to pay much more attention to proper decoding for -;; schedulers. We share most of scheduler code for PPro in i386.c -;; -;; The fp unit is not pipelined and do one operation per two cycles including -;; the FXCH. -;; -;; alu describes both ALU units (ALU-X and ALU-Y). -;; alux describes X alu unit -;; fpu describes FPU unit -;; load describes load unit. -;; branch describes branch unit. -;; store decsribes store unit. This unit is not modelled completely and only -;; used to model lea operation. Otherwise it lie outside of the critical -;; path. -;; -;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. - -;; The decoder specification is in the PPro section above! - -;; Shift instructions and certain arithmetic are issued only to X pipe. -(define_function_unit "k6_alux" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "ishift,rotate,alu1,negnot,cld")) - 1 1) - -;; The QI mode arithmetic is issued to X pipe only. 
-(define_function_unit "k6_alux" 1 0 - (and (eq_attr "cpu" "k6") - (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec") - (match_operand:QI 0 "general_operand" ""))) - 1 1) - -(define_function_unit "k6_alu" 2 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea")) - 1 1) - -(define_function_unit "k6_alu" 2 0 - (and (eq_attr "cpu" "k6") - (and (eq_attr "type" "imov") - (eq_attr "memory" "none"))) - 1 1) - -(define_function_unit "k6_branch" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "call,callv,ibr")) - 1 1) - -;; Load unit have two cycle latency, but we take care for it in adjust_cost -(define_function_unit "k6_load" 1 0 - (and (eq_attr "cpu" "k6") - (ior (eq_attr "type" "pop") - (eq_attr "memory" "load,both"))) - 1 1) - -(define_function_unit "k6_load" 1 0 - (and (eq_attr "cpu" "k6") - (and (eq_attr "type" "str") - (eq_attr "memory" "load,both"))) - 10 10) - -;; Lea have two instructions, so latency is probably 2 -(define_function_unit "k6_store" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "lea")) - 2 1) - -(define_function_unit "k6_store" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "str")) - 10 10) - -(define_function_unit "k6_store" 1 0 - (and (eq_attr "cpu" "k6") - (ior (eq_attr "type" "push") - (eq_attr "memory" "store,both"))) - 1 1) - -(define_function_unit "k6_fpu" 1 1 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "fop,fmov,fcmp,fistp")) - 2 2) - -(define_function_unit "k6_fpu" 1 1 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "fmul")) - 2 2) - -;; ??? Guess -(define_function_unit "k6_fpu" 1 1 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "fdiv,fpspc")) - 56 56) - -(define_function_unit "k6_alu" 2 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "imul")) - 2 2) - -(define_function_unit "k6_alux" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "imul")) - 2 2) - -;; ??? 
Guess -(define_function_unit "k6_alu" 2 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "idiv")) - 17 17) - -(define_function_unit "k6_alux" 1 0 - (and (eq_attr "cpu" "k6") - (eq_attr "type" "idiv")) - 17 17) - -;; AMD Athlon Scheduling -;; -;; The Athlon does contain three pipelined FP units, three integer units and -;; three address generation units. -;; -;; The predecode logic is determining boundaries of instructions in the 64 -;; byte cache line. So the cache line straddling problem of K6 might be issue -;; here as well, but it is not noted in the documentation. -;; -;; Three DirectPath instructions decoders and only one VectorPath decoder -;; is available. They can decode three DirectPath instructions or one VectorPath -;; instruction per cycle. -;; Decoded macro instructions are then passed to 72 entry instruction control -;; unit, that passes -;; it to the specialized integer (18 entry) and fp (36 entry) schedulers. -;; -;; The load/store queue unit is not attached to the schedulers but -;; communicates with all the execution units separately instead. 
- -(define_attr "athlon_decode" "direct,vector" - (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov") - (const_string "vector") - (and (eq_attr "type" "push") - (match_operand 1 "memory_operand" "")) - (const_string "vector") - (and (eq_attr "type" "fmov") - (and (eq_attr "memory" "load,store") - (eq_attr "mode" "XF"))) - (const_string "vector")] - (const_string "direct"))) - -(define_function_unit "athlon_vectordec" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "vector")) - 1 1) - -(define_function_unit "athlon_directdec" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "direct")) - 1 1) - -(define_function_unit "athlon_vectordec" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "direct")) - 1 1 [(eq_attr "athlon_decode" "vector")]) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,rotate,ibr,call,callv,icmov,cld,pop,setcc,push,pop")) - 1 1) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "str")) - 15 15) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "imul")) - 5 0) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "idiv")) - 42 0) - -(define_function_unit "athlon_muldiv" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "imul")) - 5 0) - -(define_function_unit "athlon_muldiv" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "idiv")) - 42 42) - -(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any" - (cond [(eq_attr "type" "fop,fcmp,fistp") - (const_string "add") - (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov") - (const_string "mul") - (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both")) - (const_string "store") - (and (eq_attr "type" "fmov") (eq_attr "memory" "load")) - (const_string "any") - (and (eq_attr "type" "fmov") - (ior 
(match_operand:SI 1 "register_operand" "") - (match_operand 1 "immediate_operand" ""))) - (const_string "store") - (eq_attr "type" "fmov") - (const_string "muladd")] - (const_string "none"))) - -;; We use latencies 1 for definitions. This is OK to model colisions -;; in execution units. The real latencies are modeled in the "fp" pipeline. - -;; fsin, fcos: 96-192 -;; fsincos: 107-211 -;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode. -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fpspc")) - 100 1) - -;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode. -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fdiv")) - 24 1) - -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fop,fmul,fistp")) - 4 1) - -;; XFmode loads are slow. -;; XFmode store is slow too (8 cycles), but we don't need to model it, because -;; there are no dependent instructions. - -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fmov") - (and (eq_attr "memory" "load") - (eq_attr "mode" "XF")))) - 10 1) - -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fmov,fsgn")) - 2 1) - -;; fcmp and ftst instructions -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fcmp") - (eq_attr "athlon_decode" "direct"))) - 3 1) - -;; fcmpi instructions. 
-(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fcmp") - (eq_attr "athlon_decode" "vector"))) - 3 1) - -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fcmov")) - 7 1) - -(define_function_unit "athlon_fp_mul" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "mul")) - 1 1) - -(define_function_unit "athlon_fp_add" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "add")) - 1 1) - -(define_function_unit "athlon_fp_muladd" 2 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "muladd,mul,add")) - 1 1) - -(define_function_unit "athlon_fp_store" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "store")) - 1 1) - -;; We don't need to model the Address Generation Unit, since we don't model -;; the re-order buffer yet and thus we never schedule more than three operations -;; at time. Later we may want to experiment with MD_SCHED macros modeling the -;; decoders independently on the functional units. - -;(define_function_unit "athlon_agu" 3 0 -; (and (eq_attr "cpu" "athlon") -; (and (eq_attr "memory" "!none") -; (eq_attr "athlon_fpunits" "none"))) -; 1 1) - -;; Model load unit to avoid too long sequences of loads. We don't need to -;; model store queue, since it is hardly going to be bottleneck. - -(define_function_unit "athlon_load" 2 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "memory" "load,both")) - 1 1) - +(include "pentium.md") +(include "ppro.md") +(include "k6.md") +(include "athlon.md") ;; Compare instructions. diff --git a/gcc/config/i386/k6.md b/gcc/config/i386/k6.md new file mode 100644 index 00000000000..a68983a0d66 --- /dev/null +++ b/gcc/config/i386/k6.md @@ -0,0 +1,136 @@ +;; AMD K6/K6-2 Scheduling +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. 
+;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. +;; +;; The K6 has similar architecture to PPro. Important difference is, that +;; there are only two decoders and they seem to be much slower than execution +;; units. So we have to pay much more attention to proper decoding for +;; schedulers. We share most of scheduler code for PPro in i386.c +;; +;; The fp unit is not pipelined and does one operation per two cycles including +;; the FXCH. +;; +;; alu describes both ALU units (ALU-X and ALU-Y). +;; alux describes X alu unit +;; fpu describes FPU unit +;; load describes load unit. +;; branch describes branch unit. +;; store describes store unit. This unit is not modelled completely and only +;; used to model lea operation. Otherwise it lies outside of the critical +;; path. +;; +;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. + +;; The decoder specification is in the PPro description (ppro.md), which i386.md includes before this file. + +;; Shift instructions and certain arithmetic are issued only to X pipe. +(define_function_unit "k6_alux" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "ishift,rotate,alu1,negnot,cld")) + 1 1) + +;; The QI mode arithmetic is issued to X pipe only. 
+(define_function_unit "k6_alux" 1 0 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec") + (match_operand:QI 0 "general_operand" ""))) + 1 1) + +(define_function_unit "k6_alu" 2 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "ishift,rotate,alu1,negnot,alu,icmp,test,imovx,incdec,setcc,lea")) + 1 1) + +(define_function_unit "k6_alu" 2 0 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "imov") + (eq_attr "memory" "none"))) + 1 1) + +(define_function_unit "k6_branch" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "call,callv,ibr")) + 1 1) + +;; The load unit has a two cycle latency, but we take care of it in adjust_cost. +(define_function_unit "k6_load" 1 0 + (and (eq_attr "cpu" "k6") + (ior (eq_attr "type" "pop") + (eq_attr "memory" "load,both"))) + 1 1) + +(define_function_unit "k6_load" 1 0 + (and (eq_attr "cpu" "k6") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both"))) + 10 10) + +;; Lea consists of two instructions, so its latency is probably 2. +(define_function_unit "k6_store" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "lea")) + 2 1) + +(define_function_unit "k6_store" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "str")) + 10 10) + +(define_function_unit "k6_store" 1 0 + (and (eq_attr "cpu" "k6") + (ior (eq_attr "type" "push") + (eq_attr "memory" "store,both"))) + 1 1) + +(define_function_unit "k6_fpu" 1 1 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "fop,fmov,fcmp,fistp")) + 2 2) + +(define_function_unit "k6_fpu" 1 1 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "fmul")) + 2 2) + +;; ??? Guess +(define_function_unit "k6_fpu" 1 1 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "fdiv,fpspc")) + 56 56) + +(define_function_unit "k6_alu" 2 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "imul")) + 2 2) + +(define_function_unit "k6_alux" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "imul")) + 2 2) + +;; ???
Guess +(define_function_unit "k6_alu" 2 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "idiv")) + 17 17) + +(define_function_unit "k6_alux" 1 0 + (and (eq_attr "cpu" "k6") + (eq_attr "type" "idiv")) + 17 17) diff --git a/gcc/config/i386/pentium.md b/gcc/config/i386/pentium.md new file mode 100644 index 00000000000..2d418bdf119 --- /dev/null +++ b/gcc/config/i386/pentium.md @@ -0,0 +1,306 @@ +;; Pentium Scheduling +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. +;; +;; The Pentium is an in-order core with two integer pipelines. + +;; True for insns that behave like prefixed insns on the Pentium. +(define_attr "pent_prefix" "false,true" + (if_then_else (ior (eq_attr "prefix_0f" "1") + (ior (eq_attr "prefix_data16" "1") + (eq_attr "prefix_rep" "1"))) + (const_string "true") + (const_string "false"))) + +;; Categorize how an instruction slots. + +;; The non-MMX Pentium slots an instruction with prefixes on U pipe only, +;; while MMX Pentium can slot it on either U or V. Model non-MMX Pentium +;; rules, because it results in noticeably better code on non-MMX Pentium +;; and doesn't hurt much on MMX. (Prefixed instructions are not very +;; common, so the scheduler usually has a non-prefixed insn to pair).
+ +(define_attr "pent_pair" "uv,pu,pv,np" + (cond [(eq_attr "imm_disp" "true") + (const_string "np") + (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec") + (and (eq_attr "type" "pop,push") + (eq_attr "memory" "!both"))) + (if_then_else (eq_attr "pent_prefix" "true") + (const_string "pu") + (const_string "uv")) + (eq_attr "type" "ibr") + (const_string "pv") + (and (eq_attr "type" "ishift") + (match_operand 2 "const_int_operand" "")) + (const_string "pu") + (and (eq_attr "type" "rotate") + (match_operand 2 "const_int_1_operand" "")) + (const_string "pu") + (and (eq_attr "type" "call") + (match_operand 0 "constant_call_address_operand" "")) + (const_string "pv") + (and (eq_attr "type" "callv") + (match_operand 1 "constant_call_address_operand" "")) + (const_string "pv") + ] + (const_string "np"))) + +(define_automaton "pentium,pentium_fpu") + +;; The Pentium has U and V pipes. Instructions to both pipes +;; are always issued together, much like on VLIW. +;; +;; predecode +;; / \ +;; decodeu decodev +;; / | | +;; fpu executeu executev +;; | | | +;; fpu retire retire +;; | +;; fpu +;; We add dummy "port" pipes allocated only in the first cycle of an +;; instruction to specify this behaviour. + +(define_cpu_unit "pentium-portu,pentium-portv" "pentium") +(define_cpu_unit "pentium-u,pentium-v" "pentium") +(absence_set "pentium-portu" "pentium-u,pentium-v") +(presence_set "pentium-portv" "pentium-portu") + +;; Floating point instructions can overlap with new issue of integer +;; instructions. We model only the first cycle of the FP pipeline, as it is +;; fully pipelined. +(define_cpu_unit "pentium-fp" "pentium_fpu") + +;; There is a non-pipelined multiplier unit used for complex operations.
+(define_cpu_unit "pentium-fmul" "pentium_fpu") + +;; The Pentium preserves memory ordering, so when a load-execute-store +;; instruction is executed together with another instruction loading +;; data, the execution of the other instruction is delayed to the very +;; last cycle of the first instruction, when data are bypassed. +;; We model this by allocating the "memory" unit when a store is pending +;; and using conflicting load units together. + +(define_cpu_unit "pentium-memory" "pentium") +(define_cpu_unit "pentium-load0" "pentium") +(define_cpu_unit "pentium-load1" "pentium") +(absence_set "pentium-load0,pentium-load1" "pentium-memory") + +(define_reservation "pentium-load" "(pentium-load0 | pentium-load1)") +(define_reservation "pentium-np" "(pentium-u + pentium-v)") +(define_reservation "pentium-uv" "(pentium-u | pentium-v)") +(define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)") +(define_reservation "pentium-firstu" "(pentium-u + pentium-portu)") +(define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)") +(define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)") +(define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)") +(define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)") +(define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv) + | (pentium-firstv,pentium-v, + (pentium-load+pentium-firstv))") +(define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu + + pentium-memory)") +(define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstv + + pentium-memory)") +(define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv + + pentium-memory) + | (pentium-firstv,pentium-v, + (pentium-load+pentium-firstv))") + +;; A few common long latency instructions +(define_insn_reservation "pent_mul" 11 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "imul")) + "pentium-np*11") + +(define_insn_reservation "pent_str" 12 + (and (eq_attr "cpu" "pentium") +
(eq_attr "type" "str")) + "pentium-np*12") + +;; Integer division and some other long latency instructions block all +;; units, including the FP pipe. There is no value in modeling the +;; latency of these instructions and not modeling the latency +;; decreases the size of the DFA. +(define_insn_reservation "pent_block" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "idiv")) + "pentium-np+pentium-fp") + +(define_insn_reservation "pent_cld" 2 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "cld")) + "pentium-np*2") + +;; Moves usually have one cycle penalty, but there are exceptions. +(define_insn_reservation "pent_fmov" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "none,load"))) + "(pentium-fp+pentium-np)") + +(define_insn_reservation "pent_fpmovxf" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load,store") + (eq_attr "mode" "XF")))) + "(pentium-fp+pentium-np)*3") + +(define_insn_reservation "pent_fpstore" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "fmov") + (ior (match_operand 1 "immediate_operand" "") + (eq_attr "memory" "store")))) + "(pentium-fp+pentium-np)*2") + +(define_insn_reservation "pent_imov" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "imov")) + "pentium-firstuv") + +;; Push and pop instructions have 1 cycle latency and a special +;; hardware bypass allows them to be paired with other push, pop +;; and call instructions. +(define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call") +(define_insn_reservation "pent_push" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "type" "push") + (eq_attr "memory" "store"))) + "pentium-firstuv") + +(define_insn_reservation "pent_pop" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "pop")) + "pentium-firstuv") + +;; Call and branch instructions can execute in either pipe, but +;; they are only pairable when in the v pipe.
+(define_insn_reservation "pent_call" 10 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "call,callv")) + "pentium-firstv,pentium-v*9") + +(define_insn_reservation "pent_branch" 1 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "ibr")) + "pentium-firstv") + +;; Floating point instructions dispatch in the U pipe, but continue +;; in the FP pipeline, allowing other instructions to be executed. +(define_insn_reservation "pent_fp" 3 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fop,fistp")) + "(pentium-firstu+pentium-fp),nothing,nothing") + +;; The first two cycles of fmul are not pipelined. +(define_insn_reservation "pent_fmul" 3 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fmul")) + "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing") + +;; Long latency FP instructions overlap with integer instructions, +;; but only the last 2 cycles overlap with FP ones. +(define_insn_reservation "pent_fdiv" 39 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fdiv")) + "(pentium-np+pentium-fp+pentium-fmul), + (pentium-fp+pentium-fmul)*36,pentium-fmul*2") + +(define_insn_reservation "pent_fpspc" 70 + (and (eq_attr "cpu" "pentium") + (eq_attr "type" "fpspc")) + "(pentium-np+pentium-fp+pentium-fmul), + (pentium-fp+pentium-fmul)*67,pentium-fmul*2") + +;; Integer instructions. Load/execute/store takes 3 cycles, +;; load/execute 2 cycles and execute only one cycle.
+(define_insn_reservation "pent_uv_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "both"))) + "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv") + +(define_insn_reservation "pent_u_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "both"))) + "pentium-firstuboth,pentium-u+pentium-memory,pentium-u") + +(define_insn_reservation "pent_v_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "both"))) + "pentium-firstvboth,pentium-v+pentium-memory,pentium-v") + +(define_insn_reservation "pent_np_both" 3 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "both"))) + "pentium-np,pentium-np,pentium-np") + +(define_insn_reservation "pent_uv_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "load"))) + "pentium-firstuvload,pentium-uv") + +(define_insn_reservation "pent_u_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "load"))) + "pentium-firstuload,pentium-u") + +(define_insn_reservation "pent_v_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "load"))) + "pentium-firstvload,pentium-v") + +(define_insn_reservation "pent_np_load" 2 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "load"))) + "pentium-np,pentium-np") + +(define_insn_reservation "pent_uv" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "uv") + (eq_attr "memory" "none"))) + "pentium-firstuv") + +(define_insn_reservation "pent_u" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pu") + (eq_attr "memory" "none"))) + "pentium-firstu") + +(define_insn_reservation "pent_v" 1 + (and (eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "pv") + (eq_attr "memory" "none"))) + "pentium-firstv") + +(define_insn_reservation "pent_np" 1 + (and
(eq_attr "cpu" "pentium") + (and (eq_attr "pent_pair" "np") + (eq_attr "memory" "none"))) + "pentium-np") + diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md new file mode 100644 index 00000000000..f7afa4f644e --- /dev/null +++ b/gcc/config/i386/ppro.md @@ -0,0 +1,150 @@ +;; Pentium Pro/PII Scheduling +;; Copyright (C) 2002 Free Software Foundation, Inc. +;; +;; This file is part of GNU CC. +;; +;; GNU CC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GNU CC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GNU CC; see the file COPYING. If not, write to +;; the Free Software Foundation, 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. + +;; Categorize how many uops an ia32 instruction evaluates to: +;; one -- an instruction with 1 uop can be decoded by any of the +;; three decoders. +;; few -- an instruction with 1 to 4 uops can be decoded only by +;; decoder 0. +;; many -- a complex instruction may take an unspecified number of +;; cycles to decode in decoder 0. + +(define_attr "ppro_uops" "one,few,many" + (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str") + (const_string "many") + (eq_attr "type" "icmov,fcmov,str,cld") + (const_string "few") + (eq_attr "type" "imov") + (if_then_else (eq_attr "memory" "store,both") + (const_string "few") + (const_string "one")) + (eq_attr "memory" "!none") + (const_string "few") + ] + (const_string "one"))) + +;; +;; The PPro has an out-of-order core, but the instruction decoders are +;; naturally in-order and asymmetric.
We get best performance by scheduling +;; for the decoders, for in doing so we give the out-of-order execution unit the +;; most choices. +;; +;; Rough readiness numbers. Fine tuning happens in i386.c. +;; +;; p0 describes port 0. +;; p01 describes ports 0 and 1 as a pair; alu insns can issue to either. +;; p2 describes port 2 for loads. +;; p34 describes ports 3 and 4 for stores. +;; fpu describes the fpu accessed via port 0. +;; ??? It is less than clear if there are separate fadd and fmul units +;; that could operate in parallel. +;; +;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real. + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "ishift,rotate,lea,ibr,cld")) + 1 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "imul")) + 4 1) + +;; ??? Does the divider lock out the pipe while it works, +;; or is there a disconnected unit? +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "idiv")) + 17 17) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fop,fsgn,fistp")) + 3 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fcmov")) + 2 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fcmp")) + 1 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fmov")) + 1 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fmul")) + 5 1) + +(define_function_unit "ppro_p0" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fdiv,fpspc")) + 56 1) + +(define_function_unit "ppro_p01" 2 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "!imov,fmov")) + 1 1) + +(define_function_unit "ppro_p01" 2 0 + (and (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "imov,fmov")) + (eq_attr "memory" "none")) + 1 1) +
+(define_function_unit "ppro_p2" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (ior (eq_attr "type" "pop") + (eq_attr "memory" "load,both"))) + 3 1) + +(define_function_unit "ppro_p34" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (ior (eq_attr "type" "push") + (eq_attr "memory" "store,both"))) + 1 1) + +(define_function_unit "fpu" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fop,fsgn,fmov,fcmp,fcmov,fistp")) + 1 1) + +(define_function_unit "fpu" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fmul")) + 5 2) + +(define_function_unit "fpu" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "fdiv,fpspc")) + 56 56) + +;; imul uses the fpu. ??? Does it have the same throughput as fmul? +(define_function_unit "fpu" 1 0 + (and (eq_attr "cpu" "pentiumpro") + (eq_attr "type" "imul")) + 4 1) -- cgit v1.2.1