diff options
Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/athlon.md | 803 | ||||
-rw-r--r-- | gcc/config/i386/att.h | 7 | ||||
-rw-r--r-- | gcc/config/i386/bsd.h | 6 | ||||
-rw-r--r-- | gcc/config/i386/freebsd-aout.h | 3 | ||||
-rw-r--r-- | gcc/config/i386/i386-coff.h | 7 | ||||
-rw-r--r-- | gcc/config/i386/i386-interix.h | 29 | ||||
-rw-r--r-- | gcc/config/i386/i386-protos.h | 6 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 709 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 87 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 1422 | ||||
-rw-r--r-- | gcc/config/i386/k6.md | 2 | ||||
-rw-r--r-- | gcc/config/i386/lynx-ng.h | 7 | ||||
-rw-r--r-- | gcc/config/i386/lynx.h | 7 | ||||
-rw-r--r-- | gcc/config/i386/pentium.md | 2 | ||||
-rw-r--r-- | gcc/config/i386/ppro.md | 4 | ||||
-rw-r--r-- | gcc/config/i386/sco5.h | 6 | ||||
-rw-r--r-- | gcc/config/i386/t-cygwin | 4 | ||||
-rw-r--r-- | gcc/config/i386/t-interix | 5 | ||||
-rw-r--r-- | gcc/config/i386/vxi386.h | 66 | ||||
-rw-r--r-- | gcc/config/i386/winnt.c | 2 | ||||
-rw-r--r-- | gcc/config/i386/xm-i386-interix.h | 32 |
21 files changed, 2287 insertions, 929 deletions
diff --git a/gcc/config/i386/athlon.md b/gcc/config/i386/athlon.md index 548f2adf422..719046fcd61 100644 --- a/gcc/config/i386/athlon.md +++ b/gcc/config/i386/athlon.md @@ -1,34 +1,5 @@ ;; AMD Athlon Scheduling -;; Copyright (C) 2002 Free Software Foundation, Inc. ;; -;; This file is part of GNU CC. -;; -;; GNU CC is free software; you can redistribute it and/or modify -;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. -;; -;; GNU CC is distributed in the hope that it will be useful, -;; but WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;; GNU General Public License for more details. -;; -;; You should have received a copy of the GNU General Public License -;; along with GNU CC; see the file COPYING. If not, write to -;; the Free Software Foundation, 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. */ -(define_attr "athlon_decode" "direct,vector" - (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov") - (const_string "vector") - (and (eq_attr "type" "push") - (match_operand 1 "memory_operand" "")) - (const_string "vector") - (and (eq_attr "type" "fmov") - (and (eq_attr "memory" "load,store") - (eq_attr "mode" "XF"))) - (const_string "vector")] - (const_string "direct"))) - ;; The Athlon does contain three pipelined FP units, three integer units and ;; three address generation units. ;; @@ -46,161 +17,649 @@ ;; The load/store queue unit is not attached to the schedulers but ;; communicates with all the execution units separately instead. 
-(define_function_unit "athlon_vectordec" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "vector")) - 1 1) - -(define_function_unit "athlon_directdec" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "direct")) - 1 1) - -(define_function_unit "athlon_vectordec" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_decode" "direct")) - 1 1 [(eq_attr "athlon_decode" "vector")]) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "alu1,negnot,alu,icmp,test,imov,imovx,lea,incdec,ishift,ishift1,rotate,rotate1,ibr,call,callv,icmov,cld,pop,setcc,push,pop")) - 1 1) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "str")) - 15 15) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "imul")) - 5 0) - -(define_function_unit "athlon_ieu" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "idiv")) - 42 0) - -(define_function_unit "athlon_muldiv" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "imul")) - 5 0) - -(define_function_unit "athlon_muldiv" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "idiv")) - 42 42) - -(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any" - (cond [(eq_attr "type" "fop,fcmp,fistp") - (const_string "add") - (eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov") - (const_string "mul") - (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both")) - (const_string "store") - (and (eq_attr "type" "fmov") (eq_attr "memory" "load")) - (const_string "any") +(define_attr "athlon_decode" "direct,vector" + (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,leave") + (const_string "vector") + (and (eq_attr "type" "push") + (match_operand 1 "memory_operand" "")) + (const_string "vector") (and (eq_attr "type" "fmov") - (ior (match_operand:SI 1 "register_operand" "") - (match_operand 1 "immediate_operand" ""))) - (const_string "store") - (eq_attr "type" "fmov") - 
(const_string "muladd")] - (const_string "none"))) - -;; We use latencies 1 for definitions. This is OK to model colisions -;; in execution units. The real latencies are modeled in the "fp" pipeline. - -;; fsin, fcos: 96-192 -;; fsincos: 107-211 -;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode. -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fpspc")) - 100 1) - -;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode. -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fdiv")) - 24 1) - -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fop,fmul,fistp")) - 4 1) - -;; XFmode loads are slow. -;; XFmode store is slow too (8 cycles), but we don't need to model it, because -;; there are no dependent instructions. + (and (eq_attr "memory" "load,store") + (eq_attr "mode" "XF"))) + (const_string "vector")] + (const_string "direct"))) -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fmov") - (and (eq_attr "memory" "load") - (eq_attr "mode" "XF")))) - 10 1) +;; +;; decode0 decode1 decode2 +;; \ | / +;; instruction control unit (72 entry scheduler) +;; | | +;; integer scheduler (18) stack map +;; / | | | | \ stack rename +;; ieu0 agu0 ieu1 agu1 ieu2 agu2 scheduler +;; | agu0 | agu1 agu2 register file +;; | \ | | / | | | +;; \ /\ | / fadd fmul fstore +;; \ / \ | / fadd fmul fstore +;; imul load/store (2x) fadd fmul fstore -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fmov,fsgn")) - 2 1) +(define_automaton "athlon,athlon_load,athlon_mult,athlon_fp") +(define_cpu_unit "athlon-decode0" "athlon") +(define_cpu_unit "athlon-decode1" "athlon") +(define_cpu_unit "athlon-decode2" "athlon") +(define_cpu_unit "athlon-decodev" "athlon") +;; Model the fact that double decoded instruction may take 2 cycles +;; to decode when decoder2 and decoder0 in next cycle +;; is used (this 
is needed to allow troughput of 1.5 double decoded +;; instructions per cycle). +;; +;; In order to avoid dependnece between reservation of decoder +;; and other units, we model decoder as two stage fully pipelined unit +;; and only double decoded instruction may occupy unit in the first cycle. +;; With this scheme however two double instructions can be issued cycle0. +;; +;; Avoid this by using presence set requiring decoder0 to be allocated +;; too. Vector decoded instructions then can't be issued when +;; modeled as consuming decoder0+decoder1+decoder2. +;; We solve that by specialized vector decoder unit and exclusion set. +(presence_set "athlon-decode2" "athlon-decode0") +(exclusion_set "athlon-decodev" "athlon-decode0,athlon-decode1,athlon-decode2") +(define_reservation "athlon-vector" "nothing,athlon-decodev") +(define_reservation "athlon-direct0" "nothing,athlon-decode0") +(define_reservation "athlon-direct" "nothing, + (athlon-decode0 | athlon-decode1 + | athlon-decode2)") +;; Double instructions behaves like two direct instructions. +(define_reservation "athlon-double" "((athlon-decode2, athlon-decode0) + | (nothing,(athlon-decode0 + athlon-decode1)) + | (nothing,(athlon-decode1 + athlon-decode2)))") -;; fcmp and ftst instructions -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fcmp") - (eq_attr "athlon_decode" "direct"))) - 3 1) +;; Agu and ieu unit results in extremly large automatons and +;; in our approximation they are hardly filled in. Only ieu +;; unit can, as issue rate is 3 and agu unit is always used +;; first in the insn reservations. Skip the models. -;; fcmpi instructions. 
-(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (and (eq_attr "type" "fcmp") - (eq_attr "athlon_decode" "vector"))) - 3 1) +;(define_cpu_unit "athlon-ieu0" "athlon_ieu") +;(define_cpu_unit "athlon-ieu1" "athlon_ieu") +;(define_cpu_unit "athlon-ieu2" "athlon_ieu") +;(define_reservation "athlon-ieu" "(athlon-ieu0 | athlon-ieu1 | athlon-ieu2)") +(define_reservation "athlon-ieu" "nothing") +(define_cpu_unit "athlon-ieu0" "athlon") +;(define_cpu_unit "athlon-agu0" "athlon_agu") +;(define_cpu_unit "athlon-agu1" "athlon_agu") +;(define_cpu_unit "athlon-agu2" "athlon_agu") +;(define_reservation "athlon-agu" "(athlon-agu0 | athlon-agu1 | athlon-agu2)") +(define_reservation "athlon-agu" "nothing,nothing") -(define_function_unit "athlon_fp" 3 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "type" "fcmov")) - 7 1) +(define_cpu_unit "athlon-mult" "athlon_mult") -(define_function_unit "athlon_fp_mul" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "mul")) - 1 1) +(define_cpu_unit "athlon-load0" "athlon_load") +(define_cpu_unit "athlon-load1" "athlon_load") +(define_reservation "athlon-load" "athlon-agu, + (athlon-load0 | athlon-load1)") +(define_reservation "athlon-store" "nothing") -(define_function_unit "athlon_fp_add" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "add")) - 1 1) +;; The three fp units are fully pipelined with latency of 3 +(define_cpu_unit "athlon-fadd" "athlon_fp") +(define_cpu_unit "athlon-fmul" "athlon_fp") +(define_cpu_unit "athlon-fstore" "athlon_fp") +(define_reservation "athlon-fany" "(athlon-fadd | athlon-fmul | athlon-fstore)") +(define_reservation "athlon-faddmul" "(athlon-fadd | athlon-fmul)") -(define_function_unit "athlon_fp_muladd" 2 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "muladd,mul,add")) - 1 1) -(define_function_unit "athlon_fp_store" 1 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "athlon_fpunits" "store")) - 1 1) +;; Jump instructions are executed in the branch 
unit compltetely transparent to us +(define_insn_reservation "athlon_branch" 0 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "ibr")) + "athlon-direct") +(define_insn_reservation "athlon_call" 0 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "call,callv")) + "athlon-vector") -;; We don't need to model the Address Generation Unit, since we don't model -;; the re-order buffer yet and thus we never schedule more than three operations -;; at time. Later we may want to experiment with MD_SCHED macros modeling the -;; decoders independently on the functional units. +;; Latency of push operation is 3 cycles, but ESP value is available +;; earlier +(define_insn_reservation "athlon_push" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "push")) + "athlon-direct,nothing,athlon-store") +(define_insn_reservation "athlon_pop" 4 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "pop")) + "athlon-vector,athlon-ieu,athlon-load") +(define_insn_reservation "athlon_pop_k8" 3 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "pop")) + "athlon-double,athlon-ieu,athlon-load") +(define_insn_reservation "athlon_leave" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "leave")) + "athlon-vector,athlon-load") +(define_insn_reservation "athlon_leave_k8" 3 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "leave")) + "athlon-double,athlon-load") -;(define_function_unit "athlon_agu" 3 0 -; (and (eq_attr "cpu" "athlon") -; (and (eq_attr "memory" "!none") -; (eq_attr "athlon_fpunits" "none"))) -; 1 1) +;; Lea executes in AGU unit with 2 cycles latency. +(define_insn_reservation "athlon_lea" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu") -;; Model load unit to avoid too long sequences of loads. We don't need to -;; model store queue, since it is hardly going to be bottleneck. 
+;; Mul executes in special multiplier unit attached to IEU0 +(define_insn_reservation "athlon_imul" 5 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") +;; ??? Widening multiply is vector or double. +(define_insn_reservation "athlon_imul_k8_DI" 4 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "none,unknown")))) + "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") +(define_insn_reservation "athlon_imul_k8" 3 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") +(define_insn_reservation "athlon_imul_mem" 8 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") +(define_insn_reservation "athlon_imul_mem_k8_DI" 7 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "load,both")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") +(define_insn_reservation "athlon_imul_mem_k8" 6 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") +(define_insn_reservation "athlon_idiv" 42 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "none,unknown"))) + "athlon-vector,athlon-ieu*42") +(define_insn_reservation "athlon_idiv_mem" 45 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu*42") +(define_insn_reservation "athlon_str" 15 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both,store"))) + 
"athlon-vector,athlon-load,athlon-ieu*10") -(define_function_unit "athlon_load" 2 0 - (and (eq_attr "cpu" "athlon") - (eq_attr "memory" "load,both")) - 1 1) +(define_insn_reservation "athlon_idirect" 1 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_ivector" 2 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_idirect_loadmov" 3 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "imov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load") +(define_insn_reservation "athlon_idirect_load" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_ivector_load" 6 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_idirect_movstore" 1 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "imov") + (eq_attr "memory" "store"))) + "athlon-direct,athlon-agu,athlon-store") +(define_insn_reservation "athlon_idirect_both" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load,athlon-ieu, + athlon-store") +(define_insn_reservation "athlon_ivector_both" 6 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + 
"athlon-vector,athlon-load,athlon-ieu,athlon-ieu, + athlon-store") +(define_insn_reservation "athlon_idirect_store" 1 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,athlon-ieu, + athlon-store") +(define_insn_reservation "athlon_ivector_store" 2 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,athlon-ieu,athlon-ieu, + athlon-store") +;; Athlon floatin point unit +(define_insn_reservation "athlon_fldxf" 12 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fany") +(define_insn_reservation "athlon_fldxf_k8" 13 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fany") +(define_insn_reservation "athlon_fld" 6 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fany,nothing,athlon-load") +(define_insn_reservation "athlon_fld_k8" 4 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fany,athlon-load") +(define_insn_reservation "athlon_fstxf" 10 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fstore") +(define_insn_reservation "athlon_fstxf_k8" 8 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fstore") +(define_insn_reservation "athlon_fst" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "athlon-direct,athlon-fstore,nothing,athlon-store") +(define_insn_reservation 
"athlon_fst_k8" 2 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "athlon-direct,athlon-fstore,athlon-store") +(define_insn_reservation "athlon_fist" 4 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fistp")) + "athlon-direct,athlon-fstore,nothing") +(define_insn_reservation "athlon_fmov" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fmov")) + "athlon-direct,athlon-faddmul") +(define_insn_reservation "athlon_fadd_load" 7 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_fadd_load_k8" 6 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_fadd" 4 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fop")) + "athlon-direct,athlon-fadd") +(define_insn_reservation "athlon_fmul_load" 7 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fmul_load_k8" 6 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fmul" 4 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fmul")) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_fsgn" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fsgn")) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_fdiv_load" 24 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fdiv_load_k8" 13 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fdiv" 24 + (and 
(eq_attr "cpu" "athlon") + (eq_attr "type" "fdiv")) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_fdiv_k8" 11 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "fdiv")) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_fpspc_load" 103 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "fpspc") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fpspc" 100 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fpspc")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_fcmov_load" 10 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fcmov" 7 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_fcmov_load_k8" 17 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_fcmov_k8" 15 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_fcomi_load" 6 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "fcmp") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_fcomi" 3 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "type" "fcmp"))) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_fcom_load" 5 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "fcmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_fcom" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "fcmp")) + "athlon-direct,athlon-fadd") +(define_insn_reservation "athlon_fxch" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr 
"type" "fxch")) + "athlon-direct,athlon-fany") +;; Athlon handle MMX operations in the FPU unit with shorter latencies +(define_insn_reservation "athlon_movlpd_load" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssemov") + (match_operand:DF 1 "memory_operand" ""))) + "athlon-direct,athlon-load") +(define_insn_reservation "athlon_movaps_load" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load")))) + "athlon-double,athlon-load") +(define_insn_reservation "athlon_movss_load" 3 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "SF,DI") + (eq_attr "memory" "load")))) + "athlon-double,athlon-load") +(define_insn_reservation "athlon_mmxsseld" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fany,athlon-load") +(define_insn_reservation "athlon_mmxssest" 3 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "mmxmov,ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,athlon-store") +(define_insn_reservation "athlon_mmxssest_k8" 2 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,athlon-store") +(define_insn_reservation "athlon_movaps" 2 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssemov") + (eq_attr "mode" "V4SF,V2DF"))) + "athlon-double,athlon-faddmul,athlon-faddmul") +(define_insn_reservation "athlon_mmxssemov" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "mmxmov,ssemov")) + "athlon-direct,athlon-faddmul") +(define_insn_reservation "athlon_mmxmul_load" 6 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "mmxmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_mmxmul" 3 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "type" "mmxmul")) + 
"athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_mmx_load" 5 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "unit" "mmx") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load,athlon-faddmul") +(define_insn_reservation "athlon_mmx" 2 + (and (eq_attr "cpu" "athlon,k8") + (eq_attr "unit" "mmx")) + "athlon-direct,athlon-faddmul") +;; SSE operations are handled by the i387 unit as well. The latnecy +;; is same as for i387 operations for scalar operations +(define_insn_reservation "athlon_sselog_load" 6 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_sselog_load_k8" 5 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "load"))) + "athlon-double,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_sselog" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sselog")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_sselog_k8" 3 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "sselog")) + "athlon-double,athlon-fmul") +(define_insn_reservation "athlon_ssecmp_load" 5 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssecmp") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_ssecmp" 2 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssecmp") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector_load" 6 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector_load_k8" 5 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssecmp")) + 
"athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_ssecmpvector_k8" 3 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "ssecmp")) + "athlon-double,athlon-fadd") +(define_insn_reservation "athlon_sseadd_load" 7 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseadd") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_sseadd_load_k8" 6 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "sseadd") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_sseadd" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "sseadd") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector_load" 8 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector_load_k8" 7 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sseadd")) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_sseaddvector_k8" 4 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "sseadd")) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_ssecvt_load" 5 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_ssecvt_load_k8" 4 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_ssecvt" 2 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssecvt") 
+ (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fadd") +(define_insn_reservation "athlon_ssecvtvector_load" 6 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecvt") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_ssecvtvector_load_k8" 5 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssecvt") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fadd") +(define_insn_reservation "athlon_ssecvtvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssecvt")) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_ssecvtvector_k8" 3 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "ssecvt")) + "athlon-vector,athlon-fadd") +(define_insn_reservation "athlon_ssemul_load" 7 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemul") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssemul_load_k8" 6 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssemul") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssemul" 4 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector_load" 8 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector_load_k8" 7 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-double,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssemul")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_ssemulvector_k8" 5 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "ssemul")) + 
"athlon-double,athlon-fmul") +(define_insn_reservation "athlon_ssediv_load" 19 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssediv_load_k8" 18 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssediv" 16 + (and (eq_attr "cpu" "athlon,k8") + (and (eq_attr "type" "ssediv") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fmul") +(define_insn_reservation "athlon_ssedivvector_load" 32 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssedivvector_load_k8" 35 + (and (eq_attr "cpu" "k8") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-load,athlon-fmul") +(define_insn_reservation "athlon_ssedivvector" 29 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssediv")) + "athlon-vector,athlon-fmul") +(define_insn_reservation "athlon_ssedivvector_k8" 33 + (and (eq_attr "cpu" "k8") + (eq_attr "type" "ssediv")) + "athlon-vector,athlon-fmul") diff --git a/gcc/config/i386/att.h b/gcc/config/i386/att.h index 70ae1641365..8d9930852f7 100644 --- a/gcc/config/i386/att.h +++ b/gcc/config/i386/att.h @@ -90,13 +90,6 @@ do \ #define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ sprintf ((BUF), "%s%s%ld", LOCAL_LABEL_PREFIX, (PREFIX), (long)(NUMBER)) -/* This is how to output an internal numbered label where - PREFIX is the class of label and NUM is the number within the class. */ - -#undef ASM_OUTPUT_INTERNAL_LABEL -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, "%s%s%d:\n", LOCAL_LABEL_PREFIX, PREFIX, NUM) - /* The prefix to add to user-visible assembler symbols. 
*/ #undef USER_LABEL_PREFIX diff --git a/gcc/config/i386/bsd.h b/gcc/config/i386/bsd.h index 69ad1688bfb..9f396ec46ae 100644 --- a/gcc/config/i386/bsd.h +++ b/gcc/config/i386/bsd.h @@ -88,12 +88,6 @@ Boston, MA 02111-1307, USA. */ #define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ sprintf ((BUF), "*%s%ld", (PREFIX), (long)(NUMBER)) -/* This is how to output an internal numbered label where - PREFIX is the class of label and NUM is the number within the class. */ - -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, "%s%d:\n", PREFIX, NUM) - /* The prefix to add to user-visible assembler symbols. */ #define USER_LABEL_PREFIX "_" diff --git a/gcc/config/i386/freebsd-aout.h b/gcc/config/i386/freebsd-aout.h index a2b616e700b..646cf13b5d8 100644 --- a/gcc/config/i386/freebsd-aout.h +++ b/gcc/config/i386/freebsd-aout.h @@ -198,7 +198,8 @@ do { \ ASM_OUTPUT_MEASURED_SIZE (FILE, FNAME); \ } while (0) -#define ASM_SPEC " %| %{fpic:-k} %{fPIC:-k}" +#define AS_NEEDS_DASH_FOR_PIPED_INPUT +#define ASM_SPEC "%{fpic:-k} %{fPIC:-k}" #define LINK_SPEC \ "%{p:%e`-p' not supported; use `-pg' and gprof(1)} \ %{shared:-Bshareable} \ diff --git a/gcc/config/i386/i386-coff.h b/gcc/config/i386/i386-coff.h index e8c5de9c65c..a4bb04ad729 100644 --- a/gcc/config/i386/i386-coff.h +++ b/gcc/config/i386/i386-coff.h @@ -60,11 +60,4 @@ Boston, MA 02111-1307, USA. */ #define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ sprintf ((BUF), ".%s%ld", (PREFIX), (long)(NUMBER)) -/* This is how to output an internal numbered label where - PREFIX is the class of label and NUM is the number within the class. 
*/ - -#undef ASM_OUTPUT_INTERNAL_LABEL -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, ".%s%d:\n", PREFIX, NUM) - /* end of i386-coff.h */ diff --git a/gcc/config/i386/i386-interix.h b/gcc/config/i386/i386-interix.h index d309087217d..7e2290f2d3a 100644 --- a/gcc/config/i386/i386-interix.h +++ b/gcc/config/i386/i386-interix.h @@ -35,11 +35,12 @@ Boston, MA 02111-1307, USA. */ /* By default, target has a 80387, uses IEEE compatible arithmetic, and returns float values in the 387 and needs stack probes - We also align doubles to 64-bits for MSVC default compatibility */ + We also align doubles to 64-bits for MSVC default compatibility + We do bitfields MSVC-compatably by default, too. */ #undef TARGET_SUBTARGET_DEFAULT #define TARGET_SUBTARGET_DEFAULT \ (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_STACK_PROBE | \ - MASK_ALIGN_DOUBLE) + MASK_ALIGN_DOUBLE | MASK_MS_BITFIELD_LAYOUT) #undef TARGET_CPU_DEFAULT #define TARGET_CPU_DEFAULT 2 /* 486 */ @@ -243,6 +244,28 @@ Boston, MA 02111-1307, USA. */ #define TARGET_NOP_FUN_DLLIMPORT 1 #define drectve_section() /* nothing */ +/* Objective C has its own packing rules... + Objc tries to parallel the code in stor-layout.c at runtime + (see libobjc/encoding.c). This (compile-time) packing info isn't + available at runtime, so it's hopeless to try. + + And if the user tries to set the flag for objc, give an error + so he has some clue. 
*/ + +#undef SUBTARGET_OVERRIDE_OPTIONS +#define SUBTARGET_OVERRIDE_OPTIONS \ +do { \ + if (strcmp (lang_hooks.name, "GNU Objective-C") == 0) \ + { \ + if ((target_flags & MASK_MS_BITFIELD_LAYOUT) != 0 \ + && (target_flags_explicit & MASK_MS_BITFIELD_LAYOUT) != 0) \ + { \ + error ("ms-bitfields not supported for objc"); \ + } \ + target_flags &= ~MASK_MS_BITFIELD_LAYOUT; \ + } \ +} while (0) + #define EH_FRAME_IN_DATA_SECTION #define READONLY_DATA_SECTION_ASM_OP "\t.section\t.rdata,\"r\"" @@ -273,8 +296,6 @@ while (0) #define HOST_PTR_AS_INT unsigned long #define PCC_BITFIELD_TYPE_MATTERS 1 -#define PCC_BITFIELD_TYPE_TEST TYPE_NATIVE(rec) -#define GROUP_BITFIELDS_BY_ALIGN TYPE_NATIVE(rec) /* The following two flags are usually "off" for i386, because some non-gnu tools (for the i386) don't handle them. However, we don't have that diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 4afdf668bd8..e403950ab32 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -41,6 +41,9 @@ extern int ix86_aligned_p PARAMS ((rtx)); extern int standard_80387_constant_p PARAMS ((rtx)); extern int standard_sse_constant_p PARAMS ((rtx)); extern int symbolic_reference_mentioned_p PARAMS ((rtx)); +extern bool extended_reg_mentioned_p PARAMS ((rtx)); +extern bool x86_extended_QIreg_mentioned_p PARAMS ((rtx)); +extern bool x86_extended_reg_mentioned_p PARAMS ((rtx)); extern int any_fp_register_operand PARAMS ((rtx, enum machine_mode)); extern int register_and_not_any_fp_reg_operand PARAMS ((rtx, enum machine_mode)); @@ -63,6 +66,7 @@ extern int initial_exec_symbolic_operand PARAMS ((rtx, enum machine_mode)); extern int local_exec_symbolic_operand PARAMS ((rtx, enum machine_mode)); extern int pic_symbolic_operand PARAMS ((rtx, enum machine_mode)); extern int call_insn_operand PARAMS ((rtx, enum machine_mode)); +extern int sibcall_insn_operand PARAMS ((rtx, enum machine_mode)); extern int constant_call_address_operand PARAMS ((rtx, enum 
machine_mode)); extern int const0_operand PARAMS ((rtx, enum machine_mode)); extern int const1_operand PARAMS ((rtx, enum machine_mode)); @@ -137,7 +141,7 @@ extern void ix86_expand_branch PARAMS ((enum rtx_code, rtx)); extern int ix86_expand_setcc PARAMS ((enum rtx_code, rtx)); extern int ix86_expand_int_movcc PARAMS ((rtx[])); extern int ix86_expand_fp_movcc PARAMS ((rtx[])); -extern void ix86_expand_call PARAMS ((rtx, rtx, rtx, rtx, rtx)); +extern void ix86_expand_call PARAMS ((rtx, rtx, rtx, rtx, rtx, int)); extern void x86_initialize_trampoline PARAMS ((rtx, rtx, rtx)); extern rtx ix86_zero_extend_to_Pmode PARAMS ((rtx)); extern void ix86_split_long_move PARAMS ((rtx[])); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 42f6d93d3c3..2eaa1c54875 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21,6 +21,8 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" +#include "coretypes.h" +#include "tm.h" #include "rtl.h" #include "tree.h" #include "tm_p.h" @@ -55,9 +57,9 @@ struct processor_costs size_cost = { /* costs for tunning for size */ 3, /* cost of a lea instruction */ 2, /* variable shift costs */ 3, /* constant shift costs */ - 3, /* cost of starting a multiply */ + {3, 3, 3, 3, 5}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 3, /* cost of a divide/mod */ + {3, 3, 3, 3, 5}, /* cost of a divide/mod */ 3, /* cost of movsx */ 3, /* cost of movzx */ 0, /* "large" insn */ @@ -84,6 +86,7 @@ struct processor_costs size_cost = { /* costs for tunning for size */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 2, /* cost of FADD and FSUB insns. */ 2, /* cost of FMUL instruction. */ 2, /* cost of FDIV instruction. 
*/ @@ -99,9 +102,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 1, /* cost of a lea instruction */ 3, /* variable shift costs */ 2, /* constant shift costs */ - 6, /* cost of starting a multiply */ + {6, 6, 6, 6, 6}, /* cost of starting a multiply */ 1, /* cost of multiply per each bit set */ - 23, /* cost of a divide/mod */ + {23, 23, 23, 23, 23}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 15, /* "large" insn */ @@ -128,6 +131,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 23, /* cost of FADD and FSUB insns. */ 27, /* cost of FMUL instruction. */ 88, /* cost of FDIV instruction. */ @@ -142,9 +146,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 1, /* cost of a lea instruction */ 3, /* variable shift costs */ 2, /* constant shift costs */ - 12, /* cost of starting a multiply */ + {12, 12, 12, 12, 12}, /* cost of starting a multiply */ 1, /* cost of multiply per each bit set */ - 40, /* cost of a divide/mod */ + {40, 40, 40, 40, 40}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 15, /* "large" insn */ @@ -171,6 +175,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ 8, /* cost of FADD and FSUB insns. */ 16, /* cost of FMUL instruction. */ 73, /* cost of FDIV instruction. 
*/ @@ -185,9 +190,9 @@ struct processor_costs pentium_cost = { 1, /* cost of a lea instruction */ 4, /* variable shift costs */ 1, /* constant shift costs */ - 11, /* cost of starting a multiply */ + {11, 11, 11, 11, 11}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 25, /* cost of a divide/mod */ + {25, 25, 25, 25, 25}, /* cost of a divide/mod */ 3, /* cost of movsx */ 2, /* cost of movzx */ 8, /* "large" insn */ @@ -214,6 +219,7 @@ struct processor_costs pentium_cost = { 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 2, /* Branch cost */ 3, /* cost of FADD and FSUB insns. */ 3, /* cost of FMUL instruction. */ 39, /* cost of FDIV instruction. */ @@ -228,9 +234,9 @@ struct processor_costs pentiumpro_cost = { 1, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 4, /* cost of starting a multiply */ + {4, 4, 4, 4, 4}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 17, /* cost of a divide/mod */ + {17, 17, 17, 17, 17}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 8, /* "large" insn */ @@ -257,6 +263,7 @@ struct processor_costs pentiumpro_cost = { 3, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 3, /* cost of FADD and FSUB insns. */ 5, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. 
*/ @@ -271,9 +278,9 @@ struct processor_costs k6_cost = { 2, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 3, /* cost of starting a multiply */ + {3, 3, 3, 3, 3}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 18, /* cost of a divide/mod */ + {18, 18, 18, 18, 18}, /* cost of a divide/mod */ 2, /* cost of movsx */ 2, /* cost of movzx */ 8, /* "large" insn */ @@ -300,6 +307,7 @@ struct processor_costs k6_cost = { 6, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ + 1, /* Branch cost */ 2, /* cost of FADD and FSUB insns. */ 2, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. */ @@ -314,9 +322,9 @@ struct processor_costs athlon_cost = { 2, /* cost of a lea instruction */ 1, /* variable shift costs */ 1, /* constant shift costs */ - 5, /* cost of starting a multiply */ + {5, 5, 5, 5, 5}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 42, /* cost of a divide/mod */ + {18, 26, 42, 74, 74}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 8, /* "large" insn */ @@ -343,6 +351,7 @@ struct processor_costs athlon_cost = { 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. */ 24, /* cost of FDIV instruction. 
*/ @@ -352,14 +361,58 @@ struct processor_costs athlon_cost = { }; static const +struct processor_costs k8_cost = { + 1, /* cost of an add instruction */ + 2, /* cost of a lea instruction */ + 1, /* variable shift costs */ + 1, /* constant shift costs */ + {3, 4, 3, 4, 5}, /* cost of starting a multiply */ + 0, /* cost of multiply per each bit set */ + {18, 26, 42, 74, 74}, /* cost of a divide/mod */ + 1, /* cost of movsx */ + 1, /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of loading integer registers */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 3, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + 4, /* cost of FADD and FSUB insns. */ + 4, /* cost of FMUL instruction. */ + 19, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 35, /* cost of FSQRT instruction. 
*/ +}; + +static const struct processor_costs pentium4_cost = { 1, /* cost of an add instruction */ 1, /* cost of a lea instruction */ - 8, /* variable shift costs */ - 8, /* constant shift costs */ - 30, /* cost of starting a multiply */ + 4, /* variable shift costs */ + 4, /* constant shift costs */ + {15, 15, 15, 15, 15}, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 112, /* cost of a divide/mod */ + {56, 56, 56, 56, 56}, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 16, /* "large" insn */ @@ -386,6 +439,7 @@ struct processor_costs pentium4_cost = { 10, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ 5, /* cost of FADD and FSUB insns. */ 7, /* cost of FMUL instruction. */ 43, /* cost of FDIV instruction. */ @@ -404,52 +458,66 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_K6 (1<<PROCESSOR_K6) #define m_ATHLON (1<<PROCESSOR_ATHLON) #define m_PENT4 (1<<PROCESSOR_PENTIUM4) +#define m_K8 (1<<PROCESSOR_K8) +#define m_ATHLON_K8 (m_K8 | m_ATHLON) -const int x86_use_leave = m_386 | m_K6 | m_ATHLON; -const int x86_push_memory = m_386 | m_K6 | m_ATHLON | m_PENT4; +const int x86_use_leave = m_386 | m_K6 | m_ATHLON_K8; +const int x86_push_memory = m_386 | m_K6 | m_ATHLON_K8 | m_PENT4; const int x86_zero_extend_with_and = m_486 | m_PENT; -const int x86_movx = m_ATHLON | m_PPRO | m_PENT4 /* m_386 | m_K6 */; +const int x86_movx = m_ATHLON_K8 | m_PPRO | m_PENT4 /* m_386 | m_K6 */; const int x86_double_with_add = ~m_386; const int x86_use_bit_test = m_386; -const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6; -const int x86_cmove = m_PPRO | m_ATHLON | m_PENT4; -const int x86_3dnow_a = m_ATHLON; -const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON | m_PENT4; +const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8 | m_K6; +const int x86_cmove = m_PPRO | m_ATHLON_K8 | m_PENT4; +const 
int x86_3dnow_a = m_ATHLON_K8; +const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON_K8 | m_PENT4; const int x86_branch_hints = m_PENT4; const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4; const int x86_partial_reg_stall = m_PPRO; const int x86_use_loop = m_K6; -const int x86_use_fiop = ~(m_PPRO | m_ATHLON | m_PENT); +const int x86_use_fiop = ~(m_PPRO | m_ATHLON_K8 | m_PENT); const int x86_use_mov0 = m_K6; const int x86_use_cltd = ~(m_PENT | m_K6); const int x86_read_modify_write = ~m_PENT; const int x86_read_modify = ~(m_PENT | m_PPRO); const int x86_split_long_moves = m_PPRO; -const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON; +const int x86_promote_QImode = m_K6 | m_PENT | m_386 | m_486 | m_ATHLON_K8; const int x86_fast_prefix = ~(m_PENT | m_486 | m_386); const int x86_single_stringop = m_386 | m_PENT4; const int x86_qimode_math = ~(0); const int x86_promote_qi_regs = 0; const int x86_himode_math = ~(m_PPRO); const int x86_promote_hi_regs = m_PPRO; -const int x86_sub_esp_4 = m_ATHLON | m_PPRO | m_PENT4; -const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486 | m_PENT4; -const int x86_add_esp_4 = m_ATHLON | m_K6 | m_PENT4; -const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4; -const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4 | m_PPRO); -const int x86_partial_reg_dependency = m_ATHLON | m_PENT4; -const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4; -const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO; -const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO; -const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO; +const int x86_sub_esp_4 = m_ATHLON_K8 | m_PPRO | m_PENT4; +const int x86_sub_esp_8 = m_ATHLON_K8 | m_PPRO | m_386 | m_486 | m_PENT4; +const int x86_add_esp_4 = m_ATHLON_K8 | m_K6 | m_PENT4; +const int x86_add_esp_8 = m_ATHLON_K8 | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4; +const int x86_integer_DFmode_moves = ~(m_ATHLON_K8 | m_PENT4 | m_PPRO); +const int 
x86_partial_reg_dependency = m_ATHLON_K8 | m_PENT4; +const int x86_memory_mismatch_stall = m_ATHLON_K8 | m_PENT4; +const int x86_accumulate_outgoing_args = m_ATHLON_K8 | m_PENT4 | m_PPRO; +const int x86_prologue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO; +const int x86_epilogue_using_move = m_ATHLON_K8 | m_PENT4 | m_PPRO; const int x86_decompose_lea = m_PENT4; const int x86_shift1 = ~m_486; -const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON | m_PENT4; +const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON_K8 | m_PENT4; +const int x86_sse_partial_reg_dependency = m_PENT4 | m_PPRO; +/* Set for machines where the type and dependencies are resolved on SSE register + parts insetad of whole registers, so we may maintain just lower part of + scalar values in proper format leaving the upper part undefined. */ +const int x86_sse_partial_regs = m_ATHLON_K8; +/* Athlon optimizes partial-register FPS special case, thus avoiding the + need for extra instructions beforehand */ +const int x86_sse_partial_regs_for_cvtsd2ss = 0; +const int x86_sse_typeless_stores = m_ATHLON_K8; +const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4; +const int x86_use_ffreep = m_ATHLON_K8; +const int x86_rep_movl_optimal = m_386 | m_PENT | m_PPRO | m_K6; /* In case the avreage insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and epilogue code. */ -#define FAST_PROLOGUE_INSN_COUNT 30 +#define FAST_PROLOGUE_INSN_COUNT 20 /* Set by prologue expander and used by epilogue expander to determine the style used. 
*/ @@ -755,6 +823,7 @@ static void x86_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, tree)); static bool x86_can_output_mi_thunk PARAMS ((tree, HOST_WIDE_INT, HOST_WIDE_INT, tree)); +bool ix86_expand_carry_flag_compare PARAMS ((enum rtx_code, rtx, rtx, rtx*)); struct ix86_address { @@ -796,9 +865,12 @@ static void ix86_compute_frame_layout PARAMS ((struct ix86_frame *)); static int ix86_comp_type_attributes PARAMS ((tree, tree)); static int ix86_fntype_regparm PARAMS ((tree)); const struct attribute_spec ix86_attribute_table[]; +static bool ix86_function_ok_for_sibcall PARAMS ((tree, tree)); static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *)); static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *)); static int ix86_value_regno PARAMS ((enum machine_mode)); +static bool ix86_ms_bitfield_layout_p PARAMS ((tree)); +static int extended_reg_mentioned_1 PARAMS ((rtx *, void *)); #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION) static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int)); @@ -897,6 +969,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class, #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ ia32_multipass_dfa_lookahead +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall + #ifdef HAVE_AS_TLS #undef TARGET_HAVE_TLS #define TARGET_HAVE_TLS true @@ -904,6 +979,9 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class, #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem +#undef TARGET_MS_BITFIELD_LAYOUT_P +#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p + #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK @@ -937,17 +1015,17 @@ override_options () const int align_jump; const int align_jump_max_skip; const int 
align_func; - const int branch_cost; } const processor_target_table[PROCESSOR_max] = { - {&i386_cost, 0, 0, 4, 3, 4, 3, 4, 1}, - {&i486_cost, 0, 0, 16, 15, 16, 15, 16, 1}, - {&pentium_cost, 0, 0, 16, 7, 16, 7, 16, 1}, - {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16, 1}, - {&k6_cost, 0, 0, 32, 7, 32, 7, 32, 1}, - {&athlon_cost, 0, 0, 16, 7, 64, 7, 16, 1}, - {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0, 1} + {&i386_cost, 0, 0, 4, 3, 4, 3, 4}, + {&i486_cost, 0, 0, 16, 15, 16, 15, 16}, + {&pentium_cost, 0, 0, 16, 7, 16, 7, 16}, + {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16}, + {&k6_cost, 0, 0, 32, 7, 32, 7, 32}, + {&athlon_cost, 0, 0, 16, 7, 16, 7, 16}, + {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0}, + {&k8_cost, 0, 0, 16, 7, 16, 7, 16} }; static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES; @@ -962,7 +1040,8 @@ override_options () PTA_MMX = 4, PTA_PREFETCH_SSE = 8, PTA_3DNOW = 16, - PTA_3DNOW_A = 64 + PTA_3DNOW_A = 64, + PTA_64BIT = 128 } flags; } const processor_alias_table[] = @@ -994,6 +1073,8 @@ override_options () | PTA_3DNOW_A | PTA_SSE}, {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE}, + {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT + | PTA_3DNOW_A | PTA_SSE | PTA_SSE2}, }; int const pta_size = ARRAY_SIZE (processor_alias_table); @@ -1033,7 +1114,7 @@ override_options () if (!ix86_cpu_string) ix86_cpu_string = cpu_names [TARGET_CPU_DEFAULT]; if (!ix86_arch_string) - ix86_arch_string = TARGET_64BIT ? "athlon-4" : "i386"; + ix86_arch_string = TARGET_64BIT ? "k8" : "i386"; if (ix86_cmodel_string != 0) { @@ -1099,6 +1180,8 @@ override_options () target_flags |= MASK_SSE2; if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) x86_prefetch_sse = true; + if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + error ("CPU you selected does not support x86-64 instruction set"); break; } @@ -1109,6 +1192,8 @@ override_options () if (! 
strcmp (ix86_cpu_string, processor_alias_table[i].name)) { ix86_cpu = processor_alias_table[i].processor; + if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) + error ("CPU you selected does not support x86-64 instruction set"); break; } if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) @@ -1215,7 +1300,7 @@ override_options () } /* Validate -mbranch-cost= value, or provide default. */ - ix86_branch_cost = processor_target_table[ix86_cpu].branch_cost; + ix86_branch_cost = processor_target_table[ix86_cpu].cost->branch_cost; if (ix86_branch_cost_string) { i = atoi (ix86_branch_cost_string); @@ -1371,6 +1456,60 @@ const struct attribute_spec ix86_attribute_table[] = { NULL, 0, 0, false, false, false, NULL } }; +/* If PIC, we cannot make sibling calls to global functions + because the PLT requires %ebx live. + If we are returning floats on the register stack, we cannot make + sibling calls to functions that return floats. (The stack adjust + instruction will wind up after the sibcall jump, and not be executed.) */ + +static bool +ix86_function_ok_for_sibcall (decl, exp) + tree decl; + tree exp; +{ + /* If we are generating position-independent code, we cannot sibcall + optimize any indirect call, or a direct call to a global function, + as the PLT requires %ebx be live. */ + if (!TARGET_64BIT && flag_pic && (!decl || TREE_PUBLIC (decl))) + return false; + + /* If we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does; the necessary stack adjustment will not be + executed. */ + if (STACK_REG_P (ix86_function_value (TREE_TYPE (exp))) + && ! STACK_REG_P (ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl))))) + return false; + + /* If this call is indirect, we'll need to be able to use a call-clobbered + register for the address of the target function. Make sure that all + such registers are not used for passing parameters. 
*/ + if (!decl && !TARGET_64BIT) + { + int regparm = ix86_regparm; + tree attr, type; + + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + + if (regparm >= 3) + { + /* ??? Need to count the actual number of registers to be used, + not the possible number of registers. Fix later. */ + return false; + } + } + + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; +} + /* Handle a "cdecl" or "stdcall" attribute; arguments as in struct attribute_spec.handler. */ static tree @@ -3209,6 +3348,32 @@ call_insn_operand (op, mode) return general_operand (op, Pmode); } +/* Test for a valid operand for a call instruction. Don't allow the + arg pointer register or virtual regs since they may decay into + reg + const, which the patterns can't handle. */ + +int +sibcall_insn_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + /* Disallow indirect through a virtual register. This leads to + compiler aborts when trying to eliminate them. */ + if (GET_CODE (op) == REG + && (op == arg_pointer_rtx + || op == frame_pointer_rtx + || (REGNO (op) >= FIRST_PSEUDO_REGISTER + && REGNO (op) <= LAST_VIRTUAL_REGISTER))) + return 0; + + /* Explicitly allow SYMBOL_REF even if pic. */ + if (GET_CODE (op) == SYMBOL_REF) + return 1; + + /* Otherwise we can only allow register operands. */ + return register_operand (op, Pmode); +} + int constant_call_address_operand (op, mode) rtx op; @@ -3387,6 +3552,18 @@ q_regs_operand (op, mode) return ANY_QI_REG_P (op); } +/* Return true if op is an flags register. 
*/ + +int +flags_reg_operand (op, mode) + register rtx op; + enum machine_mode mode; +{ + if (mode != VOIDmode && GET_MODE (op) != mode) + return 0; + return REG_P (op) && REGNO (op) == FLAGS_REG && GET_MODE (op) != VOIDmode; +} + /* Return true if op is a NON_Q_REGS class register. */ int @@ -3401,6 +3578,31 @@ non_q_regs_operand (op, mode) return NON_QI_REG_P (op); } +int +zero_extended_scalar_load_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + unsigned n_elts; + if (GET_CODE (op) != MEM) + return 0; + op = maybe_get_pool_constant (op); + if (!op) + return 0; + if (GET_CODE (op) != CONST_VECTOR) + return 0; + n_elts = + (GET_MODE_SIZE (GET_MODE (op)) / + GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op)))); + for (n_elts--; n_elts > 0; n_elts--) + { + rtx elt = CONST_VECTOR_ELT (op, n_elts); + if (elt != CONST0_RTX (GET_MODE_INNER (GET_MODE (op)))) + return 0; + } + return 1; +} + /* Return 1 if OP is a comparison that can be used in the CMPSS/CMPPS insns. */ int @@ -4160,7 +4362,7 @@ output_set_got (dest) is what will be referred to by the Mach-O PIC subsystem. */ ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ()); #endif - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", + (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (XEXP (xops[2], 0))); if (flag_pic) @@ -4456,14 +4658,32 @@ ix86_expand_prologue () int use_mov = 0; HOST_WIDE_INT allocate; + ix86_compute_frame_layout (&frame); if (!optimize_size) { - use_fast_prologue_epilogue - = !expensive_function_p (FAST_PROLOGUE_INSN_COUNT); + int count = frame.nregs; + + /* The fast prologue uses move instead of push to save registers. This + is significantly longer, but also executes faster as modern hardware + can execute the moves in parallel, but can't do that for push/pop. 
+ + Be curefull about choosing what prologue to emit: When function takes + many instructions to execute we may use slow version as well as in + case function is known to be outside hot spot (this is known with + feedback only). Weight the size of function by number of registers + to save as it is cheap to use one or two push instructions but very + slow to use many of them. */ + if (count) + count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; + if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL + || (flag_branch_probabilities + && cfun->function_frequency < FUNCTION_FREQUENCY_HOT)) + use_fast_prologue_epilogue = 0; + else + use_fast_prologue_epilogue = !expensive_function_p (count); if (TARGET_PROLOGUE_USING_MOVE) use_mov = use_fast_prologue_epilogue; } - ix86_compute_frame_layout (&frame); /* Note: AT&T enter does NOT have reversed args. Enter is probably slower on all targets. Also sdb doesn't like it. */ @@ -8960,6 +9180,84 @@ ix86_expand_setcc (code, dest) return 1; /* DONE */ } +/* Expand comparison setting or clearing carry flag. Return true when sucesfull + and set pop for the operation. */ +bool +ix86_expand_carry_flag_compare (code, op0, op1, pop) + rtx op0, op1, *pop; + enum rtx_code code; +{ + enum machine_mode mode = + GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); + + /* Do not handle DImode compares that go trought special path. Also we can't + deal with FP compares yet. This is possible to add. */ + if ((mode == DImode && !TARGET_64BIT) || !INTEGRAL_MODE_P (mode)) + return false; + switch (code) + { + case LTU: + case GEU: + break; + + /* Convert a==0 into (unsigned)a<1. */ + case EQ: + case NE: + if (op1 != const0_rtx) + return false; + op1 = const1_rtx; + code = (code == EQ ? LTU : GEU); + break; + + /* Convert a>b into b<a or a>=b-1. */ + case GTU: + case LEU: + if (GET_CODE (op1) == CONST_INT) + { + op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); + /* Bail out on overflow. 
We still can swap operands but that + would force loading of the constant into register. */ + if (op1 == const0_rtx + || !x86_64_immediate_operand (op1, GET_MODE (op1))) + return false; + code = (code == GTU ? GEU : LTU); + } + else + { + rtx tmp = op1; + op1 = op0; + op0 = tmp; + code = (code == GTU ? LTU : GEU); + } + break; + + /* Convert a>0 into (unsigned)a<0x7fffffff. */ + case LT: + case GE: + if (mode == DImode || op1 != const0_rtx) + return false; + op1 = gen_int_mode (~(1 << (GET_MODE_BITSIZE (mode) - 1)), mode); + code = (code == LT ? GEU : LTU); + break; + case LE: + case GT: + if (mode == DImode || op1 != constm1_rtx) + return false; + op1 = gen_int_mode (~(1 << (GET_MODE_BITSIZE (mode) - 1)), mode); + code = (code == LE ? GEU : LTU); + break; + + default: + return false; + } + ix86_compare_op0 = op0; + ix86_compare_op1 = op1; + *pop = ix86_expand_compare (code, NULL, NULL); + if (GET_CODE (*pop) != LTU && GET_CODE (*pop) != GEU) + abort (); + return true; +} + int ix86_expand_int_movcc (operands) rtx operands[]; @@ -8968,30 +9266,7 @@ ix86_expand_int_movcc (operands) rtx compare_seq, compare_op; rtx second_test, bypass_test; enum machine_mode mode = GET_MODE (operands[0]); - - /* When the compare code is not LTU or GEU, we can not use sbbl case. - In case comparsion is done with immediate, we can convert it to LTU or - GEU by altering the integer. */ - - if ((code == LEU || code == GTU) - && GET_CODE (ix86_compare_op1) == CONST_INT - && mode != HImode - && INTVAL (ix86_compare_op1) != -1 - /* For x86-64, the immediate field in the instruction is 32-bit - signed, so we can't increment a DImode value above 0x7fffffff. 
*/ - && (!TARGET_64BIT - || GET_MODE (ix86_compare_op0) != DImode - || INTVAL (ix86_compare_op1) != 0x7fffffff) - && GET_CODE (operands[2]) == CONST_INT - && GET_CODE (operands[3]) == CONST_INT) - { - if (code == LEU) - code = LTU; - else - code = GEU; - ix86_compare_op1 = gen_int_mode (INTVAL (ix86_compare_op1) + 1, - GET_MODE (ix86_compare_op0)); - } + bool sign_bit_compare_p = false;; start_sequence (); compare_op = ix86_expand_compare (code, &second_test, &bypass_test); @@ -9000,10 +9275,14 @@ ix86_expand_int_movcc (operands) compare_code = GET_CODE (compare_op); + if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT)) + || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE))) + sign_bit_compare_p = true; + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 HImode insns, we'd be swallowed in word prefix ops. */ - if (mode != HImode + if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != DImode || TARGET_64BIT) && GET_CODE (operands[2]) == CONST_INT && GET_CODE (operands[3]) == CONST_INT) @@ -9013,32 +9292,53 @@ ix86_expand_int_movcc (operands) HOST_WIDE_INT cf = INTVAL (operands[3]); HOST_WIDE_INT diff; - if ((compare_code == LTU || compare_code == GEU) - && !second_test && !bypass_test) + diff = ct - cf; + /* Sign bit compares are better done using shifts than we do by using + sbb. */ + if (sign_bit_compare_p + || ix86_expand_carry_flag_compare (code, ix86_compare_op0, + ix86_compare_op1, &compare_op)) { /* Detect overlap between destination and compare sources. */ rtx tmp = out; - /* To simplify rest of code, restrict to the GEU case. */ - if (compare_code == LTU) + if (!sign_bit_compare_p) { - HOST_WIDE_INT tmp = ct; - ct = cf; - cf = tmp; - compare_code = reverse_condition (compare_code); - code = reverse_condition (code); - } - diff = ct - cf; + compare_code = GET_CODE (compare_op); + + /* To simplify rest of code, restrict to the GEU case. 
*/ + if (compare_code == LTU) + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + diff = ct - cf; - if (reg_overlap_mentioned_p (out, ix86_compare_op0) - || reg_overlap_mentioned_p (out, ix86_compare_op1)) - tmp = gen_reg_rtx (mode); + if (reg_overlap_mentioned_p (out, ix86_compare_op0) + || reg_overlap_mentioned_p (out, ix86_compare_op1)) + tmp = gen_reg_rtx (mode); - emit_insn (compare_seq); - if (mode == DImode) - emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp)); + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp)); + else + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp))); + } else - emit_insn (gen_x86_movsicc_0_m1 (tmp)); + { + if (code == GT || code == GE) + code = reverse_condition (code); + else + { + HOST_WIDE_INT tmp = ct; + ct = cf; + cf = tmp; + } + tmp = emit_store_flag (tmp, code, ix86_compare_op0, + ix86_compare_op1, VOIDmode, 0, -1); + } if (diff == 1) { @@ -9052,7 +9352,7 @@ ix86_expand_int_movcc (operands) if (ct) tmp = expand_simple_binop (mode, PLUS, tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); } else if (cf == -1) { @@ -9065,7 +9365,7 @@ ix86_expand_int_movcc (operands) */ tmp = expand_simple_binop (mode, IOR, tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); } else if (diff == -1 && ct) { @@ -9077,11 +9377,11 @@ ix86_expand_int_movcc (operands) * * Size 8 - 11. 
*/ - tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1); + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); if (cf) tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (cf), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), GEN_INT (cf), + copy_rtx (tmp), 1, OPTAB_DIRECT); } else { @@ -9099,26 +9399,25 @@ ix86_expand_int_movcc (operands) { cf = ct; ct = 0; - tmp = expand_simple_unop (mode, NOT, tmp, tmp, 1); + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); } tmp = expand_simple_binop (mode, AND, - tmp, + copy_rtx (tmp), gen_int_mode (cf - ct, mode), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), 1, OPTAB_DIRECT); if (ct) tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (ct), - tmp, 1, OPTAB_DIRECT); + copy_rtx (tmp), GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); } - if (tmp != out) - emit_move_insn (out, tmp); + if (!rtx_equal_p (tmp, out)) + emit_move_insn (copy_rtx (out), copy_rtx (tmp)); return 1; /* DONE */ } - diff = ct - cf; if (diff < 0) { HOST_WIDE_INT tmp; @@ -9194,8 +9493,10 @@ ix86_expand_int_movcc (operands) } } + if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 || diff == 3 || diff == 5 || diff == 9) + && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) && (mode != DImode || x86_64_sign_extended_value (GEN_INT (cf)))) { /* @@ -9237,15 +9538,14 @@ ix86_expand_int_movcc (operands) tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); nops++; } - if (tmp != out - && (GET_CODE (tmp) != SUBREG || SUBREG_REG (tmp) != out)) + if (!rtx_equal_p (tmp, out)) { if (nops == 1) out = force_operand (tmp, out); else - emit_insn (gen_rtx_SET (VOIDmode, out, tmp)); + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp))); } - if (out != operands[0]) + if (!rtx_equal_p (out, operands[0])) emit_move_insn (operands[0], copy_rtx (out)); return 1; /* DONE */ @@ -9265,12 +9565,10 @@ ix86_expand_int_movcc (operands) * This is reasonably steep, but branch mispredict costs are * high on modern cpus, so consider failing only if 
optimizing * for space. - * - * %%% Parameterize branch_cost on the tuning architecture, then - * use that. The 80386 couldn't care less about mispredicts. */ - if (!optimize_size && !TARGET_CMOVE) + if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + && BRANCH_COST >= 2) { if (cf == 0) { @@ -9324,31 +9622,31 @@ ix86_expand_int_movcc (operands) out = emit_store_flag (out, code, ix86_compare_op0, ix86_compare_op1, VOIDmode, 0, 1); - out = expand_simple_binop (mode, PLUS, out, constm1_rtx, - out, 1, OPTAB_DIRECT); + out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx, + copy_rtx (out), 1, OPTAB_DIRECT); } - out = expand_simple_binop (mode, AND, out, + out = expand_simple_binop (mode, AND, copy_rtx (out), gen_int_mode (cf - ct, mode), - out, 1, OPTAB_DIRECT); + copy_rtx (out), 1, OPTAB_DIRECT); if (ct) - out = expand_simple_binop (mode, PLUS, out, GEN_INT (ct), - out, 1, OPTAB_DIRECT); - if (out != operands[0]) - emit_move_insn (operands[0], out); + out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), + copy_rtx (out), 1, OPTAB_DIRECT); + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); return 1; /* DONE */ } } - if (!TARGET_CMOVE) + if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) { /* Try a few things more with specific constants and a variable. 
*/ optab op; rtx var, orig_out, out, tmp; - if (optimize_size) + if (BRANCH_COST <= 2) return 0; /* FAIL */ /* If one of the two operands is an interesting constant, load a @@ -9357,9 +9655,9 @@ ix86_expand_int_movcc (operands) if (GET_CODE (operands[2]) == CONST_INT) { var = operands[3]; - if (INTVAL (operands[2]) == 0) + if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) operands[3] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[2]) == -1) + else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) operands[3] = const0_rtx, op = ior_optab; else return 0; /* FAIL */ @@ -9367,9 +9665,9 @@ ix86_expand_int_movcc (operands) else if (GET_CODE (operands[3]) == CONST_INT) { var = operands[2]; - if (INTVAL (operands[3]) == 0) + if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) operands[2] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[3]) == -1) + else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) operands[2] = const0_rtx, op = ior_optab; else return 0; /* FAIL */ @@ -9388,8 +9686,8 @@ ix86_expand_int_movcc (operands) /* Mask in the interesting variable. */ out = expand_binop (mode, op, var, tmp, orig_out, 0, OPTAB_WIDEN); - if (out != orig_out) - emit_move_insn (orig_out, out); + if (!rtx_equal_p (out, orig_out)) + emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); return 1; /* DONE */ } @@ -9422,27 +9720,33 @@ ix86_expand_int_movcc (operands) emit_move_insn (tmp, operands[2]); operands[2] = tmp; } + if (! register_operand (operands[2], VOIDmode) - && ! register_operand (operands[3], VOIDmode)) + && (mode == QImode + || ! register_operand (operands[3], VOIDmode))) operands[2] = force_reg (mode, operands[2]); + if (mode == QImode + && ! 
register_operand (operands[3], VOIDmode)) + operands[3] = force_reg (mode, operands[3]); + emit_insn (compare_seq); emit_insn (gen_rtx_SET (VOIDmode, operands[0], gen_rtx_IF_THEN_ELSE (mode, compare_op, operands[2], operands[3]))); if (bypass_test) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]), gen_rtx_IF_THEN_ELSE (mode, bypass_test, - operands[3], - operands[0]))); + copy_rtx (operands[3]), + copy_rtx (operands[0])))); if (second_test) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], + emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]), gen_rtx_IF_THEN_ELSE (mode, second_test, - operands[2], - operands[0]))); + copy_rtx (operands[2]), + copy_rtx (operands[0])))); return 1; /* DONE */ } @@ -9483,8 +9787,14 @@ ix86_expand_fp_movcc (operands) if (rtx_equal_p (operands[2], op0) && rtx_equal_p (operands[3], op1)) { /* Check for min operation. */ - if (code == LT) + if (code == LT || code == UNLE) { + if (code == UNLE) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + } operands[0] = force_reg (GET_MODE (operands[0]), operands[0]); if (memory_operand (op0, VOIDmode)) op0 = force_reg (GET_MODE (operands[0]), op0); @@ -9495,8 +9805,14 @@ ix86_expand_fp_movcc (operands) return 1; } /* Check for max operation. */ - if (code == GT) + if (code == GT || code == UNGE) { + if (code == UNGE) + { + rtx tmp = op0; + op0 = op1; + op1 = tmp; + } operands[0] = force_reg (GET_MODE (operands[0]), operands[0]); if (memory_operand (op0, VOIDmode)) op0 = force_reg (GET_MODE (operands[0]), op0); @@ -10242,8 +10558,12 @@ ix86_expand_movstr (dst, src, count_exp, align_exp) /* In case we don't know anything about the alignment, default to library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + shorter code. + + Also emit call when we know that the count is large and call overhead + will not be important. 
*/ + if (!TARGET_INLINE_ALL_STRINGOPS + && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) { end_sequence (); return 0; @@ -10457,8 +10777,12 @@ ix86_expand_clrstr (src, count_exp, align_exp) /* In case we don't know anything about the alignment, default to library version, since it is usually equally fast and result in - shorter code. */ - if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + shorter code. + + Also emit call when we know that the count is large and call overhead + will not be important. */ + if (!TARGET_INLINE_ALL_STRINGOPS + && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) return 0; if (TARGET_SINGLE_STRINGOP) @@ -10828,8 +11152,9 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx) } void -ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop) +ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop, sibcall) rtx retval, fnaddr, callarg1, callarg2, pop; + int sibcall; { rtx use = NULL, call; @@ -10861,6 +11186,15 @@ ix86_expand_call (retval, fnaddr, callarg1, callarg2, pop) fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); fnaddr = gen_rtx_MEM (QImode, fnaddr); } + if (sibcall && TARGET_64BIT + && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode)) + { + rtx addr; + addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); + fnaddr = gen_rtx_REG (Pmode, 40); + emit_move_insn (fnaddr, addr); + fnaddr = gen_rtx_MEM (QImode, fnaddr); + } call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); if (retval) @@ -11060,6 +11394,7 @@ ix86_issue_rate () case PROCESSOR_PENTIUMPRO: case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: + case PROCESSOR_K8: return 3; default: @@ -11271,16 +11606,10 @@ ix86_adjust_cost (insn, link, dep_insn, cost) break; case PROCESSOR_ATHLON: + case PROCESSOR_K8: memory = get_attr_memory (insn); dep_memory = get_attr_memory (dep_insn); - if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH) - { - if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV) - cost += 2; - else - cost += 3; - } /* Show 
ability of reorder buffer to hide latency of load by executing in parallel with previous instruction in case previous instruction is not needed to compute the address. */ @@ -11554,7 +11883,7 @@ ix86_variable_issue (dump, sched_verbose, insn, can_issue_more) static int ia32_use_dfa_pipeline_interface () { - if (ix86_cpu == PROCESSOR_PENTIUM) + if (TARGET_PENTIUM || TARGET_ATHLON_K8) return 1; return 0; } @@ -12764,7 +13093,8 @@ safe_vector_operand (x, mode) : gen_rtx_SUBREG (DImode, x, 0))); else emit_insn (gen_sse_clrv4sf (mode == V4SFmode ? x - : gen_rtx_SUBREG (V4SFmode, x, 0))); + : gen_rtx_SUBREG (V4SFmode, x, 0), + CONST0_RTX (V4SFmode))); return x; } @@ -13434,7 +13764,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_SSE_ZERO: target = gen_reg_rtx (V4SFmode); - emit_insn (gen_sse_clrv4sf (target)); + emit_insn (gen_sse_clrv4sf (target, CONST0_RTX (V4SFmode))); return target; case IX86_BUILTIN_MMX_ZERO: @@ -14058,6 +14388,17 @@ x86_order_regs_for_local_alloc () reg_alloc_order [pos++] = 0; } +#ifndef TARGET_USE_MS_BITFIELD_LAYOUT +#define TARGET_USE_MS_BITFIELD_LAYOUT 0 +#endif + +static bool +ix86_ms_bitfield_layout_p (record_type) + tree record_type ATTRIBUTE_UNUSED; +{ + return TARGET_USE_MS_BITFIELD_LAYOUT; +} + /* Returns an expression indicating where the this parameter is located on entry to the FUNCTION. 
*/ @@ -14317,7 +14658,7 @@ x86_machine_dependent_reorg (first) { edge e; - if (!TARGET_ATHLON || !optimize || optimize_size) + if (!TARGET_ATHLON_K8 || !optimize || optimize_size) return; for (e = EXIT_BLOCK_PTR->pred; e; e = e->pred_next) { @@ -14328,25 +14669,69 @@ x86_machine_dependent_reorg (first) if (!returnjump_p (ret) || !maybe_hot_bb_p (bb)) continue; - prev = prev_nonnote_insn (ret); + for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) + if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL) + break; if (prev && GET_CODE (prev) == CODE_LABEL) { edge e; for (e = bb->pred; e; e = e->pred_next) - if (EDGE_FREQUENCY (e) && e->src->index > 0 + if (EDGE_FREQUENCY (e) && e->src->index >= 0 && !(e->flags & EDGE_FALLTHRU)) insert = 1; } if (!insert) { - prev = prev_real_insn (ret); + prev = prev_active_insn (ret); if (prev && GET_CODE (prev) == JUMP_INSN && any_condjump_p (prev)) insert = 1; + /* Empty functions get a branch mispredict even when the jump destination + is not visible to us. */ + if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED) + insert = 1; } if (insert) emit_insn_before (gen_nop (), ret); } } +/* Return nonzero when a QImode register that must be represented via REX prefix + is used. */ +bool +x86_extended_QIreg_mentioned_p (insn) + rtx insn; +{ + int i; + extract_insn_cached (insn); + for (i = 0; i < recog_data.n_operands; i++) + if (REG_P (recog_data.operand[i]) + && REGNO (recog_data.operand[i]) >= 4) + return true; + return false; +} + +/* Return nonzero when P points to register encoded via REX prefix. + Called via for_each_rtx. */ +static int +extended_reg_mentioned_1 (p, data) + rtx *p; + void *data ATTRIBUTE_UNUSED; +{ + unsigned int regno; + if (!REG_P (*p)) + return 0; + regno = REGNO (*p); + return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno); +} + +/* Return true when INSN mentions register that must be encoded using REX + prefix. 
*/ +bool +x86_extended_reg_mentioned_p (insn) + rtx insn; +{ + return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL); +} + #include "gt-i386.h" diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 493a2b5bf9c..c1f40dc209c 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -41,9 +41,11 @@ struct processor_costs { const int lea; /* cost of a lea instruction */ const int shift_var; /* variable shift costs */ const int shift_const; /* constant shift costs */ - const int mult_init; /* cost of starting a multiply */ + const int mult_init[5]; /* cost of starting a multiply + in QImode, HImode, SImode, DImode, TImode*/ const int mult_bit; /* cost of multiply per each bit set */ - const int divide; /* cost of a divide/mod */ + const int divide[5]; /* cost of a divide/mod + in QImode, HImode, SImode, DImode, TImode*/ int movsx; /* The cost of movsx operation. */ int movzx; /* The cost of movzx operation. */ const int large_insn; /* insns larger than this cost more */ @@ -75,6 +77,7 @@ struct processor_costs { const int prefetch_block; /* bytes moved to cache for prefetch. */ const int simultaneous_prefetches; /* number of parallel prefetch operations. */ + const int branch_cost; /* Default value for BRANCH_COST. */ const int fadd; /* cost of FADD and FSUB instructions. */ const int fmul; /* cost of FMUL instruction. */ const int fdiv; /* cost of FDIV instruction. */ @@ -118,8 +121,9 @@ extern int target_flags; #define MASK_3DNOW_A 0x00020000 /* Support Athlon 3Dnow builtins */ #define MASK_128BIT_LONG_DOUBLE 0x00040000 /* long double size is 128bit */ #define MASK_64BIT 0x00080000 /* Produce 64bit code */ +#define MASK_MS_BITFIELD_LAYOUT 0x00100000 /* Use native (MS) bitfield layout */ -/* Unused: 0x03f0000 */ +/* Unused: 0x03e0000 */ /* ... overlap with subtarget options starts by 0x04000000. 
*/ #define MASK_NO_RED_ZONE 0x04000000 /* Do not use red zone */ @@ -204,6 +208,8 @@ extern int target_flags; #define TARGET_K6 (ix86_cpu == PROCESSOR_K6) #define TARGET_ATHLON (ix86_cpu == PROCESSOR_ATHLON) #define TARGET_PENTIUM4 (ix86_cpu == PROCESSOR_PENTIUM4) +#define TARGET_K8 (ix86_cpu == PROCESSOR_K8) +#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON) #define CPUMASK (1 << ix86_cpu) extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and; @@ -221,6 +227,9 @@ extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall; extern const int x86_accumulate_outgoing_args, x86_prologue_using_move; extern const int x86_epilogue_using_move, x86_decompose_lea; extern const int x86_arch_always_fancy_math_387, x86_shift1; +extern const int x86_sse_partial_reg_dependency, x86_sse_partial_regs; +extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor; +extern const int x86_use_ffreep, x86_sse_partial_regs_for_cvtsd2ss; extern int x86_prefetch_sse; #define TARGET_USE_LEAVE (x86_use_leave & CPUMASK) @@ -257,12 +266,22 @@ extern int x86_prefetch_sse; #define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK) #define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK) #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK) +#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ + (x86_sse_partial_reg_dependency & CPUMASK) +#define TARGET_SSE_PARTIAL_REGS (x86_sse_partial_regs & CPUMASK) +#define TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS \ + (x86_sse_partial_regs_for_cvtsd2ss & CPUMASK) +#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & CPUMASK) +#define TARGET_SSE_TYPELESS_LOAD0 (x86_sse_typeless_load0 & CPUMASK) +#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & CPUMASK) #define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK) #define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & CPUMASK) #define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & CPUMASK) #define 
TARGET_DECOMPOSE_LEA (x86_decompose_lea & CPUMASK) #define TARGET_PREFETCH_SSE (x86_prefetch_sse) #define TARGET_SHIFT1 (x86_shift1 & CPUMASK) +#define TARGET_USE_FFREEP (x86_use_ffreep & CPUMASK) +#define TARGET_REP_MOVL_OPTIMAL (x86_rep_movl_optimal & CPUMASK) #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE) @@ -282,6 +301,8 @@ extern int x86_prefetch_sse; #define TARGET_RED_ZONE (!(target_flags & MASK_NO_RED_ZONE)) +#define TARGET_USE_MS_BITFIELD_LAYOUT (target_flags & MASK_MS_BITFIELD_LAYOUT) + #define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU) #define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN) @@ -374,6 +395,10 @@ extern int x86_prefetch_sse; N_("Generate 64bit x86-64 code") }, \ { "32", -MASK_64BIT, \ N_("Generate 32bit i386 code") }, \ + { "ms-bitfields", MASK_MS_BITFIELD_LAYOUT, \ + N_("Use native (MS) bitfield layout") }, \ + { "no-ms-bitfields", -MASK_MS_BITFIELD_LAYOUT, \ + N_("Use gcc default bitfield layout") }, \ { "red-zone", -MASK_NO_RED_ZONE, \ N_("Use red-zone in the x86-64 code") }, \ { "no-red-zone", MASK_NO_RED_ZONE, \ @@ -540,6 +565,8 @@ extern int x86_prefetch_sse; if (last_cpu_char != 'n') \ builtin_define ("__tune_athlon_sse__"); \ } \ + else if (TARGET_K8) \ + builtin_define ("__tune_k8__"); \ else if (TARGET_PENTIUM4) \ builtin_define ("__tune_pentium4__"); \ \ @@ -598,6 +625,11 @@ extern int x86_prefetch_sse; if (last_arch_char != 'n') \ builtin_define ("__athlon_sse__"); \ } \ + else if (ix86_arch == PROCESSOR_K8) \ + { \ + builtin_define ("__k8"); \ + builtin_define ("__k8__"); \ + } \ else if (ix86_arch == PROCESSOR_PENTIUM4) \ { \ builtin_define ("__pentium4"); \ @@ -619,11 +651,12 @@ extern int x86_prefetch_sse; #define TARGET_CPU_DEFAULT_k6_3 10 #define TARGET_CPU_DEFAULT_athlon 11 #define TARGET_CPU_DEFAULT_athlon_sse 12 +#define TARGET_CPU_DEFAULT_k8 13 #define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\ "pentiumpro", "pentium2", "pentium3", \ "pentium4", "k6", "k6-2", 
"k6-3",\ - "athlon", "athlon-4"} + "athlon", "athlon-4", "k8"} #ifndef CC1_SPEC #define CC1_SPEC "%(cc1_cpu) " @@ -1335,6 +1368,9 @@ enum reg_class (((N) >= FIRST_SSE_REG && (N) <= LAST_SSE_REG) \ || ((N) >= FIRST_REX_SSE_REG && (N) <= LAST_REX_SSE_REG)) +#define REX_SSE_REGNO_P(N) \ + ((N) >= FIRST_REX_SSE_REG && (N) <= LAST_REX_SSE_REG) + #define SSE_REGNO(N) \ ((N) < 8 ? FIRST_SSE_REG + (N) : FIRST_REX_SSE_REG + (N) - 8) #define SSE_REG_P(N) (REG_P (N) && SSE_REGNO_P (REGNO (N))) @@ -1716,18 +1752,6 @@ typedef struct ix86_args { #define FUNCTION_ARG_PARTIAL_NREGS(CUM, MODE, TYPE, NAMED) 0 -/* If PIC, we cannot make sibling calls to global functions - because the PLT requires %ebx live. - If we are returning floats on the register stack, we cannot make - sibling calls to functions that return floats. (The stack adjust - instruction will wind up after the sibcall jump, and not be executed.) */ -#define FUNCTION_OK_FOR_SIBCALL(DECL) \ - ((DECL) \ - && (! flag_pic || ! TREE_PUBLIC (DECL)) \ - && (! TARGET_FLOAT_RETURNS_IN_80387 \ - || ! FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (DECL)))) \ - || FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (cfun->decl)))))) - /* Perform any needed actions needed for a function that is receiving a variable number of arguments. @@ -1839,12 +1863,6 @@ typedef struct ix86_args { /* Addressing modes, and classification of registers for them. */ -/* #define HAVE_POST_INCREMENT 0 */ -/* #define HAVE_POST_DECREMENT 0 */ - -/* #define HAVE_PRE_DECREMENT 0 */ -/* #define HAVE_PRE_INCREMENT 0 */ - /* Macros to check register numbers against specific register classes. */ /* These assume that REGNO is a hard or pseudo reg number. @@ -2602,6 +2620,14 @@ do { \ #define TOPLEVEL_COSTS_N_INSNS(N) \ do { total = COSTS_N_INSNS (N); goto egress_rtx_costs; } while (0) +/* Return index of given mode in mult and division cost tables. */ +#define MODE_INDEX(mode) \ + ((mode) == QImode ? 0 \ + : (mode) == HImode ? 1 \ + : (mode) == SImode ? 
2 \ + : (mode) == DImode ? 3 \ + : 4) + /* Like `CONST_COSTS' but applies to nonconstant RTL expressions. This can be used, for example, to indicate how costly a multiply instruction is. In writing this macro, you can use the construct @@ -2687,10 +2713,12 @@ do { \ } \ \ TOPLEVEL_COSTS_N_INSNS (ix86_cost->mult_init \ + [MODE_INDEX (GET_MODE (X))] \ + nbits * ix86_cost->mult_bit); \ } \ else /* This is arbitrary */ \ TOPLEVEL_COSTS_N_INSNS (ix86_cost->mult_init \ + [MODE_INDEX (GET_MODE (X))] \ + 7 * ix86_cost->mult_bit); \ \ case DIV: \ @@ -2700,7 +2728,8 @@ do { \ if (FLOAT_MODE_P (GET_MODE (X))) \ TOPLEVEL_COSTS_N_INSNS (ix86_cost->fdiv); \ else \ - TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide); \ + TOPLEVEL_COSTS_N_INSNS (ix86_cost->divide \ + [MODE_INDEX (GET_MODE (X))]); \ break; \ \ case PLUS: \ @@ -3040,14 +3069,6 @@ extern int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER]; ? ((GLOBAL) ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | DW_EH_PE_sdata4\ : DW_EH_PE_absptr) -/* Store in OUTPUT a string (made with alloca) containing - an assembler-name for a local static variable named NAME. - LABELNO is an integer which is different for each call. */ - -#define ASM_FORMAT_PRIVATE_NAME(OUTPUT, NAME, LABELNO) \ -( (OUTPUT) = (char *) alloca (strlen ((NAME)) + 10), \ - sprintf ((OUTPUT), "%s.%d", (NAME), (LABELNO))) - /* This is how to output an insn to push a register on the stack. It need not be very fast code. 
*/ @@ -3241,6 +3262,7 @@ do { \ LABEL_REF, SUBREG, REG, MEM}}, \ {"pic_symbolic_operand", {CONST}}, \ {"call_insn_operand", {REG, SUBREG, MEM, SYMBOL_REF}}, \ + {"sibcall_insn_operand", {REG, SUBREG, SYMBOL_REF}}, \ {"constant_call_address_operand", {SYMBOL_REF, CONST}}, \ {"const0_operand", {CONST_INT, CONST_DOUBLE}}, \ {"const1_operand", {CONST_INT}}, \ @@ -3252,6 +3274,7 @@ do { \ SYMBOL_REF, LABEL_REF, SUBREG, REG, MEM}}, \ {"nonmemory_no_elim_operand", {CONST_INT, REG, SUBREG}}, \ {"index_register_operand", {SUBREG, REG}}, \ + {"flags_reg_operand", {REG}}, \ {"q_regs_operand", {SUBREG, REG}}, \ {"non_q_regs_operand", {SUBREG, REG}}, \ {"fcmov_comparison_operator", {EQ, NE, LTU, GTU, LEU, GEU, UNORDERED, \ @@ -3286,6 +3309,7 @@ do { \ {"register_and_not_any_fp_reg_operand", {REG}}, \ {"fp_register_operand", {REG}}, \ {"register_and_not_fp_reg_operand", {REG}}, \ + {"zero_extended_scalar_load_operand", {MEM}}, \ /* A list of predicates that do special things with modes, and so should not elicit warnings for VOIDmode match_operand. */ @@ -3305,6 +3329,7 @@ enum processor_type PROCESSOR_K6, PROCESSOR_ATHLON, PROCESSOR_PENTIUM4, + PROCESSOR_K8, PROCESSOR_max }; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index befbfe49569..d625f586d46 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -132,7 +132,7 @@ ;; Processor type. This attribute must exactly match the processor_type ;; enumeration in i386.h. -(define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon,pentium4" +(define_attr "cpu" "i386,i486,pentium,pentiumpro,k6,athlon,pentium4,k8" (const (symbol_ref "ix86_cpu"))) ;; A basic instruction type. 
Refinements due to arguments to be @@ -142,7 +142,7 @@ alu,alu1,negnot,imov,imovx,lea, incdec,ishift,ishift1,rotate,rotate1,imul,idiv, icmp,test,ibr,setcc,icmov, - push,pop,call,callv, + push,pop,call,callv,leave, str,cld, fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp, sselog,sseiadd,sseishft,sseimul, @@ -152,7 +152,7 @@ ;; Main data type used by the insn (define_attr "mode" - "unknown,none,QI,HI,SI,DI,unknownfp,SF,DF,XF,TI,V4SF,V2DF,V2SF" + "unknown,none,QI,HI,SI,DI,SF,DF,XF,TI,V4SF,V2DF,V2SF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -170,7 +170,7 @@ ;; The (bounding maximum) length of an instruction immediate. (define_attr "length_immediate" "" - (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv") + (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave") (const_int 0) (eq_attr "unit" "i387,sse,mmx") (const_int 0) @@ -232,9 +232,24 @@ (const_int 1) (const_int 0))) +;; Set when REX opcode prefix is used. +(define_attr "prefix_rex" "" + (cond [(and (eq_attr "mode" "DI") + (eq_attr "type" "!push,pop,call,callv,leave,ibr")) + (const_int 1) + (and (eq_attr "mode" "QI") + (ne (symbol_ref "x86_extended_QIreg_mentioned_p (insn)") + (const_int 0))) + (const_int 1) + (ne (symbol_ref "x86_extended_reg_mentioned_p (insn)") + (const_int 0)) + (const_int 1) + ] + (const_int 0))) + ;; Set when modrm byte is used. 
(define_attr "modrm" "" - (cond [(eq_attr "type" "str,cld") + (cond [(eq_attr "type" "str,cld,leave") (const_int 0) (eq_attr "unit" "i387") (const_int 0) @@ -273,7 +288,8 @@ (attr "length_address")))] (plus (plus (attr "modrm") (plus (attr "prefix_0f") - (const_int 1))) + (plus (attr "prefix_rex") + (const_int 1)))) (plus (attr "prefix_rep") (plus (attr "prefix_data16") (plus (attr "length_immediate") @@ -288,7 +304,7 @@ (const_string "unknown") (eq_attr "type" "lea,fcmov,fpspc,cld") (const_string "none") - (eq_attr "type" "fistp") + (eq_attr "type" "fistp,leave") (const_string "both") (eq_attr "type" "push") (if_then_else (match_operand 1 "memory_operand" "") @@ -754,7 +770,13 @@ return "ftst\;fnstsw\t%0"; } [(set_attr "type" "multi") - (set_attr "mode" "unknownfp")]) + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) ;; We may not use "#" to split and emit these, since the REG_DEAD notes ;; used to manage the reg stack popping would not be preserved. @@ -857,7 +879,13 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 0, 1);" [(set_attr "type" "fcmp") - (set_attr "mode" "unknownfp")]) + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) (define_insn "*cmpfp_2u_1" [(set (match_operand:HI 0 "register_operand" "=a") @@ -871,7 +899,13 @@ && GET_MODE (operands[1]) == GET_MODE (operands[2])" "* return output_fp_compare (insn, operands, 2, 1);" [(set_attr "type" "multi") - (set_attr "mode" "unknownfp")]) + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF")))]) ;; Patterns to match the SImode-in-memory ficom instructions. ;; @@ -911,7 +945,7 @@ ;; FP compares, step 2 ;; Move the fpsw to ax. 
-(define_insn "x86_fnstsw_1" +(define_insn "*x86_fnstsw_1" [(set (match_operand:HI 0 "register_operand" "=a") (unspec:HI [(reg 18)] UNSPEC_FNSTSW))] "TARGET_80387" @@ -946,7 +980,13 @@ && GET_MODE (operands[0]) == GET_MODE (operands[0])" "* return output_fp_compare (insn, operands, 1, 0);" [(set_attr "type" "fcmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF"))) (set_attr "athlon_decode" "vector")]) (define_insn "*cmpfp_i_sse" @@ -958,7 +998,10 @@ && GET_MODE (operands[0]) == GET_MODE (operands[0])" "* return output_fp_compare (insn, operands, 1, 0);" [(set_attr "type" "fcmp,ssecmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) (set_attr "athlon_decode" "vector")]) (define_insn "*cmpfp_i_sse_only" @@ -969,7 +1012,10 @@ && GET_MODE (operands[0]) == GET_MODE (operands[0])" "* return output_fp_compare (insn, operands, 1, 0);" [(set_attr "type" "ssecmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) (set_attr "athlon_decode" "vector")]) (define_insn "*cmpfp_iu" @@ -982,7 +1028,13 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 1);" [(set_attr "type" "fcmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (cond [(match_operand:SF 1 "" "") + (const_string "SF") + (match_operand:DF 1 "" "") + (const_string "DF") + ] + (const_string "XF"))) (set_attr "athlon_decode" "vector")]) (define_insn "*cmpfp_iu_sse" @@ -994,7 +1046,10 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 1);" [(set_attr "type" "fcmp,ssecmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + 
(const_string "DF"))) (set_attr "athlon_decode" "vector")]) (define_insn "*cmpfp_iu_sse_only" @@ -1005,7 +1060,10 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 1);" [(set_attr "type" "ssecmp") - (set_attr "mode" "unknownfp") + (set (attr "mode") + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) (set_attr "athlon_decode" "vector")]) ;; Move instructions. @@ -2011,22 +2069,11 @@ { switch (which_alternative) { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (SFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (4); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - case 1: return "push{l}\t%1"; - case 2: - return "#"; default: + /* This insn should be already splitted before reg-stack. */ abort (); } } @@ -2040,23 +2087,11 @@ { switch (which_alternative) { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (SFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (8); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{q}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{q}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - case 1: return "push{q}\t%q1"; - case 2: - return "#"; - default: + /* This insn should be already splitted before reg-stack. 
*/ abort (); } } @@ -2104,7 +2139,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2130,12 +2170,12 @@ case 4: return "mov{l}\t{%1, %0|%0, %1}"; case 5: - if (TARGET_SSE2 && !TARGET_ATHLON) + if (get_attr_mode (insn) == MODE_TI) return "pxor\t%0, %0"; else return "xorps\t%0, %0"; case 6: - if (TARGET_PARTIAL_REG_DEPENDENCY) + if (get_attr_mode (insn) == MODE_V4SF) return "movaps\t{%1, %0|%0, %1}"; else return "movss\t{%1, %0|%0, %1}"; @@ -2155,7 +2195,40 @@ } } [(set_attr "type" "fmov,fmov,fmov,imov,imov,ssemov,ssemov,ssemov,ssemov,mmxmov,mmxmov,mmxmov") - (set_attr "mode" "SF,SF,SF,SI,SI,TI,SF,SF,SF,SI,SI,DI")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "3,4,9,10") + (const_string "SI") + (eq_attr "alternative" "5") + (if_then_else + (and (and (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (ne (symbol_ref "TARGET_SSE2") + (const_int 0))) + (eq (symbol_ref "optimize_size") + (const_int 0))) + (const_string "TI") + (const_string "V4SF")) + /* For architectures resolving dependencies on + whole SSE registers use APS move to break dependency + chains, otherwise use short move to avoid extra work. + + Do the same for architectures resolving dependencies on + the parts. While in DF mode it is better to always handle + just register parts, the SF mode is different due to lack + of instructions to load just part of the register. It is + better to maintain the whole registers in single format + to avoid problems on using packed logical operations. 
*/ + (eq_attr "alternative" "6") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (ne (symbol_ref "TARGET_SSE_PARTIAL_REGS") + (const_int 0))) + (const_string "V4SF") + (const_string "SF")) + (eq_attr "alternative" "11") + (const_string "DI")] + (const_string "SF")))]) (define_insn "*swapsf" [(set (match_operand:SF 0 "register_operand" "+f") @@ -2188,26 +2261,8 @@ (match_operand:DF 1 "general_no_elim_operand" "f#Y,Fo#fY,*r#fY,Y#f"))] "!TARGET_64BIT && !TARGET_INTEGER_DFMODE_MOVES" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (8); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - case 1: - case 2: - case 3: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. */ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "DF,SI,SI,DF")]) @@ -2217,32 +2272,8 @@ (match_operand:DF 1 "general_no_elim_operand" "f#rY,rFo#fY,Y#rf"))] "TARGET_64BIT || TARGET_INTEGER_DFMODE_MOVES" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. 
*/ - operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (8); - if (TARGET_64BIT) - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{q}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{q}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - else - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - - case 1: - case 2: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. */ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "DF,SI,DF")]) @@ -2279,7 +2310,7 @@ [(set (match_operand:DF 0 "nonimmediate_operand" "=f#Y,m,f#Y,*r,o,Y#f,Y#f,Y#f,m") (match_operand:DF 1 "general_operand" "fm#Y,f#Y,G,*roF,F*r,C,Y#f,YHm#f,Y#f"))] "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM) - && (optimize_size || !TARGET_INTEGER_DFMODE_MOVES) + && ((optimize_size || !TARGET_INTEGER_DFMODE_MOVES) && !TARGET_64BIT) && (reload_in_progress || reload_completed || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) || GET_CODE (operands[1]) != CONST_DOUBLE @@ -2290,7 +2321,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2316,31 +2352,84 @@ case 4: return "#"; case 5: - if (TARGET_ATHLON) - return "xorpd\t%0, %0"; - else - return "pxor\t%0, %0"; + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "xorps\t%0, %0"; + case MODE_V2DF: + return "xorpd\t%0, %0"; + case MODE_TI: + return "pxor\t%0, %0"; + default: + abort (); + } case 6: - if (TARGET_PARTIAL_REG_DEPENDENCY) - return "movapd\t{%1, %0|%0, %1}"; + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return 
"movaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + return "movapd\t{%1, %0|%0, %1}"; + case MODE_DF: + return "movsd\t{%1, %0|%0, %1}"; + default: + abort (); + } + case 7: + if (get_attr_mode (insn) == MODE_V2DF) + return "movlpd\t{%1, %0|%0, %1}"; else return "movsd\t{%1, %0|%0, %1}"; - case 7: case 8: - return "movsd\t{%1, %0|%0, %1}"; + return "movsd\t{%1, %0|%0, %1}"; default: abort(); } } [(set_attr "type" "fmov,fmov,fmov,multi,multi,ssemov,ssemov,ssemov,ssemov") - (set_attr "mode" "DF,DF,DF,SI,SI,TI,DF,DF,DF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "3,4") + (const_string "SI") + /* xorps is one byte shorter. */ + (eq_attr "alternative" "5") + (cond [(ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (const_string "TI")] + (const_string "V2DF")) + /* For architectures resolving dependencies on + whole SSE registers use APD move to break dependency + chains, otherwise use short move to avoid extra work. + + movaps encodes one byte shorter. */ + (eq_attr "alternative" "6") + (cond + [(ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (const_string "V2DF")] + (const_string "DF")) + /* For achitectures resolving dependencies on register + parts we may avoid extra work to zero out upper part + of register. 
*/ + (eq_attr "alternative" "7") + (if_then_else + (ne (symbol_ref "TARGET_SSE_PARTIAL_REGS") + (const_int 0)) + (const_string "V2DF") + (const_string "DF"))] + (const_string "DF")))]) (define_insn "*movdf_integer" [(set (match_operand:DF 0 "nonimmediate_operand" "=f#Yr,m,f#Yr,r#Yf,o,Y#rf,Y#rf,Y#rf,m") (match_operand:DF 1 "general_operand" "fm#Yr,f#Yr,G,roF#Yf,Fr#Yf,C,Y#rf,Ym#rf,Y#rf"))] "(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM) - && !optimize_size && TARGET_INTEGER_DFMODE_MOVES + && ((!optimize_size && TARGET_INTEGER_DFMODE_MOVES) || TARGET_64BIT) && (reload_in_progress || reload_completed || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE) || GET_CODE (operands[1]) != CONST_DOUBLE @@ -2351,7 +2440,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2378,16 +2472,34 @@ return "#"; case 5: - if (TARGET_ATHLON) - return "xorpd\t%0, %0"; - else - return "pxor\t%0, %0"; + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "xorps\t%0, %0"; + case MODE_V2DF: + return "xorpd\t%0, %0"; + case MODE_TI: + return "pxor\t%0, %0"; + default: + abort (); + } case 6: - if (TARGET_PARTIAL_REG_DEPENDENCY) - return "movapd\t{%1, %0|%0, %1}"; + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return "movaps\t{%1, %0|%0, %1}"; + case MODE_V2DF: + return "movapd\t{%1, %0|%0, %1}"; + case MODE_DF: + return "movsd\t{%1, %0|%0, %1}"; + default: + abort (); + } + case 7: + if (get_attr_mode (insn) == MODE_V2DF) + return "movlpd\t{%1, %0|%0, %1}"; else return "movsd\t{%1, %0|%0, %1}"; - case 7: case 8: return "movsd\t{%1, %0|%0, %1}"; @@ -2396,7 +2508,42 @@ } } [(set_attr "type" "fmov,fmov,fmov,multi,multi,ssemov,ssemov,ssemov,ssemov") - (set_attr "mode" "DF,DF,DF,SI,SI,TI,DF,DF,DF")]) + (set 
(attr "mode") + (cond [(eq_attr "alternative" "3,4") + (const_string "SI") + /* xorps is one byte shorter. */ + (eq_attr "alternative" "5") + (cond [(ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (const_string "TI")] + (const_string "V2DF")) + /* For architectures resolving dependencies on + whole SSE registers use APD move to break dependency + chains, otherwise use short move to avoid extra work. + + movaps encodes one byte shorter. */ + (eq_attr "alternative" "6") + (cond + [(ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (ne (symbol_ref "TARGET_SSE_PARTIAL_REG_DEPENDENCY") + (const_int 0)) + (const_string "V2DF")] + (const_string "DF")) + /* For achitectures resolving dependencies on register + parts we may avoid extra work to zero out upper part + of register. */ + (eq_attr "alternative" "7") + (if_then_else + (ne (symbol_ref "TARGET_SSE_PARTIAL_REGS") + (const_int 0)) + (const_string "V2DF") + (const_string "DF"))] + (const_string "DF")))]) (define_split [(set (match_operand:DF 0 "nonimmediate_operand" "") @@ -2451,25 +2598,8 @@ (match_operand:XF 1 "general_no_elim_operand" "f,Fo,*r"))] "!TARGET_64BIT && optimize_size" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (XFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (12); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - case 1: - case 2: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. 
*/ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "XF,SI,SI")]) @@ -2479,25 +2609,8 @@ (match_operand:TF 1 "general_no_elim_operand" "f,Fo,*r"))] "optimize_size" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (XFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (16); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - case 1: - case 2: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. */ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "XF,SI,SI")]) @@ -2507,24 +2620,8 @@ (match_operand:XF 1 "general_no_elim_operand" "f#r,ro#f"))] "!TARGET_64BIT && !optimize_size" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. */ - operands[0] = gen_rtx_MEM (XFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (12); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - case 1: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. */ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "XF,SI")]) @@ -2534,30 +2631,8 @@ (match_operand:TF 1 "general_no_elim_operand" "f#r,rFo#f"))] "!optimize_size" { - switch (which_alternative) - { - case 0: - /* %%% We loose REG_DEAD notes for controling pops if we split late. 
*/ - operands[0] = gen_rtx_MEM (XFmode, stack_pointer_rtx); - operands[2] = stack_pointer_rtx; - operands[3] = GEN_INT (16); - if (TARGET_64BIT) - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{q}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{q}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - else - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "sub{l}\t{%3, %2|%2, %3}\;fstp%z0\t%y0"; - else - return "sub{l}\t{%3, %2|%2, %3}\;fst%z0\t%y0"; - - case 1: - return "#"; - - default: - abort (); - } + /* This insn should be already splitted before reg-stack. */ + abort (); } [(set_attr "type" "multi") (set_attr "mode" "XF,SI")]) @@ -2610,7 +2685,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2657,7 +2737,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2704,7 +2789,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -2751,7 +2841,12 @@ case 0: if (REG_P (operands[1]) && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp\t%y0"; + { + if (REGNO (operands[0]) == FIRST_STACK_REG + && TARGET_USE_FFREEP) + return "ffreep\t%y0"; + return "fstp\t%y0"; + } else if (STACK_TOP_P (operands[0])) return "fld%z1\t%y1"; else @@ -3699,11 +3794,11 @@ (set_attr "mode" 
"SF,SF,SF,SF")]) (define_insn "*truncdfsf2_1_sse" - [(set (match_operand:SF 0 "nonimmediate_operand" "=*!m,?f#rx,?r#fx,?x#rf,Y") + [(set (match_operand:SF 0 "nonimmediate_operand" "=*!m#fxr,?f#xr,?r#fx,?x#fr,Y#fr") (float_truncate:SF - (match_operand:DF 1 "nonimmediate_operand" "f,f,f,f,mY"))) + (match_operand:DF 1 "nonimmediate_operand" "f#Y,f#Y,f#Y,f#Y,mY#f"))) (clobber (match_operand:SF 2 "memory_operand" "=X,m,m,m,X"))] - "TARGET_80387 && TARGET_SSE2" + "TARGET_80387 && TARGET_SSE2 && !TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS" { switch (which_alternative) { @@ -3713,7 +3808,30 @@ else return "fst%z0\t%y0"; case 4: - return "cvtsd2ss\t{%1, %0|%0, %1}"; + return "#"; + default: + abort (); + } +} + [(set_attr "type" "fmov,multi,multi,multi,ssecvt") + (set_attr "mode" "SF,SF,SF,SF,DF")]) + +(define_insn "*truncdfsf2_1_sse_nooverlap" + [(set (match_operand:SF 0 "nonimmediate_operand" "=*!m,?f#rx,?r#fx,?x#rf,&Y") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "f#Y,f#Y,f#Y,f#Y,mY#f"))) + (clobber (match_operand:SF 2 "memory_operand" "=X,m,m,m,X"))] + "TARGET_80387 && TARGET_SSE2 && TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS" +{ + switch (which_alternative) + { + case 0: + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%z0\t%y0"; + else + return "fst%z0\t%y0"; + case 4: + return "#"; default: abort (); } @@ -3724,8 +3842,8 @@ (define_insn "*truncdfsf2_2" [(set (match_operand:SF 0 "nonimmediate_operand" "=Y,!m") (float_truncate:SF - (match_operand:DF 1 "nonimmediate_operand" "mY,f")))] - "TARGET_80387 && TARGET_SSE2 + (match_operand:DF 1 "nonimmediate_operand" "mY,f#Y")))] + "TARGET_80387 && TARGET_SSE2 && !TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)" { switch (which_alternative) @@ -3744,7 +3862,30 @@ [(set_attr "type" "ssecvt,fmov") (set_attr "mode" "DF,SF")]) -(define_insn "truncdfsf2_3" +(define_insn "*truncdfsf2_2_nooverlap" + [(set (match_operand:SF 0 
"nonimmediate_operand" "=&Y,!m") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "mY,f")))] + "TARGET_80387 && TARGET_SSE2 && TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS + && (GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)" +{ + switch (which_alternative) + { + case 0: + return "#"; + case 1: + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%z0\t%y0"; + else + return "fst%z0\t%y0"; + default: + abort (); + } +} + [(set_attr "type" "ssecvt,fmov") + (set_attr "mode" "DF,SF")]) + +(define_insn "*truncdfsf2_3" [(set (match_operand:SF 0 "memory_operand" "=m") (float_truncate:SF (match_operand:DF 1 "register_operand" "f")))] @@ -3762,11 +3903,20 @@ [(set (match_operand:SF 0 "register_operand" "=Y") (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand" "mY")))] - "!TARGET_80387 && TARGET_SSE2" + "!TARGET_80387 && TARGET_SSE2 && !TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS" "cvtsd2ss\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "mode" "DF")]) +(define_insn "*truncdfsf2_sse_only_nooverlap" + [(set (match_operand:SF 0 "register_operand" "=&Y") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "mY")))] + "!TARGET_80387 && TARGET_SSE2 && TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS" + "#" + [(set_attr "type" "ssecvt") + (set_attr "mode" "DF")]) + (define_split [(set (match_operand:SF 0 "memory_operand" "") (float_truncate:SF @@ -3776,15 +3926,56 @@ [(set (match_dup 0) (float_truncate:SF (match_dup 1)))] "") +; Avoid possible reformating penalty on the destination by first +; zeroing it out (define_split - [(set (match_operand:SF 0 "nonimmediate_operand" "") + [(set (match_operand:SF 0 "register_operand" "") (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand" ""))) (clobber (match_operand 2 "" ""))] "TARGET_80387 && reload_completed - && !FP_REG_P (operands[0]) && !FP_REG_P (operands[1])" - [(set (match_dup 0) (float_truncate:SF (match_dup 1)))] - "") + && SSE_REG_P (operands[0]) + && 
!STACK_REG_P (operands[1])" + [(const_int 0)] +{ + rtx src, dest; + if (!TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS) + emit_insn (gen_truncdfsf2_sse_only (operands[0], operands[1])); + else + { + dest = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0); + src = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0); + /* simplify_gen_subreg refuses to widen memory references. */ + if (GET_CODE (src) == SUBREG) + alter_subreg (&src); + if (reg_overlap_mentioned_p (operands[0], operands[1])) + abort (); + emit_insn (gen_sse_clrv4sf (dest, CONST0_RTX (V4SFmode))); + emit_insn (gen_cvtsd2ss (dest, dest, src)); + } + DONE; +}) + +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand" "")))] + "TARGET_80387 && reload_completed + && SSE_REG_P (operands[0]) && TARGET_SSE_PARTIAL_REGS_FOR_CVTSD2SS" + [(const_int 0)] +{ + rtx src, dest; + dest = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0); + src = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0); + /* simplify_gen_subreg refuses to widen memory references. 
*/ + if (GET_CODE (src) == SUBREG) + alter_subreg (&src); + if (reg_overlap_mentioned_p (operands[0], operands[1])) + abort (); + emit_insn (gen_sse_clrv4sf (dest, CONST0_RTX (V4SFmode))); + emit_insn (gen_cvtsd2ss (dest, dest, src)); + DONE; +}) (define_split [(set (match_operand:SF 0 "register_operand" "") @@ -4468,7 +4659,7 @@ "") (define_insn "*floatsisf2_i387" - [(set (match_operand:SF 0 "register_operand" "=f,?f,x") + [(set (match_operand:SF 0 "register_operand" "=f#x,?f#x,x#f") (float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,mr")))] "TARGET_80387 && (!TARGET_SSE || TARGET_MIX_SSE_I387)" "@ @@ -4488,6 +4679,22 @@ (set_attr "mode" "SF") (set_attr "fp_int_src" "true")]) +; Avoid possible reformating penalty on the destination by first +; zeroing it out +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (float:SF (match_operand:SI 1 "nonimmediate_operand" "")))] + "TARGET_80387 && reload_completed && TARGET_SSE_PARTIAL_REGS + && SSE_REG_P (operands[0])" + [(const_int 0)] +{ + rtx dest; + dest = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0); + emit_insn (gen_sse_clrv4sf (dest, CONST0_RTX (V4SFmode))); + emit_insn (gen_cvtsi2ss (dest, dest, operands[1])); + DONE; +}) + (define_expand "floatdisf2" [(set (match_operand:SF 0 "register_operand" "") (float:SF (match_operand:DI 1 "nonimmediate_operand" "")))] @@ -4506,7 +4713,7 @@ (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_i387" - [(set (match_operand:SF 0 "register_operand" "=f,?f,x") + [(set (match_operand:SF 0 "register_operand" "=f#x,?f#x,x#f") (float:SF (match_operand:DI 1 "nonimmediate_operand" "m,r,mr")))] "TARGET_64BIT && TARGET_80387 && (!TARGET_SSE || TARGET_MIX_SSE_I387)" "@ @@ -4526,6 +4733,22 @@ (set_attr "mode" "SF") (set_attr "fp_int_src" "true")]) +; Avoid possible reformating penalty on the destination by first +; zeroing it out +(define_split + [(set (match_operand:SF 0 "register_operand" "") + (float:SF (match_operand:DI 1 "nonimmediate_operand" 
"")))] + "TARGET_80387 && reload_completed && TARGET_SSE_PARTIAL_REGS + && SSE_REG_P (operands[0])" + [(const_int 0)] +{ + rtx dest; + dest = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0); + emit_insn (gen_sse_clrv4sf (dest, CONST0_RTX (V4SFmode))); + emit_insn (gen_cvtsi2ssq (dest, dest, operands[1])); + DONE; +}) + (define_insn "floathidf2" [(set (match_operand:DF 0 "register_operand" "=f,f") (float:DF (match_operand:HI 1 "nonimmediate_operand" "m,r")))] @@ -4544,7 +4767,7 @@ "") (define_insn "*floatsidf2_i387" - [(set (match_operand:DF 0 "register_operand" "=f,?f,Y") + [(set (match_operand:DF 0 "register_operand" "=f#Y,?f#Y,Y#f") (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,mr")))] "TARGET_80387 && (!TARGET_SSE2 || TARGET_MIX_SSE_I387)" "@ @@ -4582,7 +4805,7 @@ (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_i387" - [(set (match_operand:DF 0 "register_operand" "=f,?f,Y") + [(set (match_operand:DF 0 "register_operand" "=f#Y,?f#Y,Y#f") (float:DF (match_operand:DI 1 "nonimmediate_operand" "m,r,mr")))] "TARGET_64BIT && TARGET_80387 && (!TARGET_SSE2 || TARGET_MIX_SSE_I387)" "@ @@ -9269,12 +9492,15 @@ in register. 
*/ rtx reg = gen_reg_rtx (SFmode); rtx dest = operands[0]; + rtx imm = gen_lowpart (SFmode, gen_int_mode (0x80000000, SImode)); operands[1] = force_reg (SFmode, operands[1]); operands[0] = force_reg (SFmode, operands[0]); - emit_move_insn (reg, - gen_lowpart (SFmode, - gen_int_mode (0x80000000, SImode))); + reg = force_reg (V4SFmode, + gen_rtx_CONST_VECTOR (V4SFmode, + gen_rtvec (4, imm, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), + CONST0_RTX (SFmode)))); emit_insn (gen_negsf2_ifs (operands[0], operands[1], reg)); if (dest != operands[0]) emit_move_insn (dest, operands[0]); @@ -9293,7 +9519,7 @@ (define_insn "negsf2_ifs" [(set (match_operand:SF 0 "nonimmediate_operand" "=x#fr,x#fr,f#xr,rm#xf") (neg:SF (match_operand:SF 1 "nonimmediate_operand" "0,x#fr,0,0"))) - (use (match_operand:SF 2 "nonmemory_operand" "x,0#x,*g#x,*g#x")) + (use (match_operand:V4SF 2 "nonimmediate_operand" "xm,0,xm*r,xm*r")) (clobber (reg:CC 17))] "TARGET_SSE && (reload_in_progress || reload_completed @@ -9314,7 +9540,7 @@ (define_split [(set (match_operand:SF 0 "register_operand" "") (neg:SF (match_operand:SF 1 "register_operand" ""))) - (use (match_operand:SF 2 "" "")) + (use (match_operand:V4SF 2 "" "")) (clobber (reg:CC 17))] "reload_completed && !SSE_REG_P (operands[0])" [(parallel [(set (match_dup 0) @@ -9324,13 +9550,15 @@ (define_split [(set (match_operand:SF 0 "register_operand" "") (neg:SF (match_operand:SF 1 "register_operand" ""))) - (use (match_operand:SF 2 "register_operand" "")) + (use (match_operand:V4SF 2 "nonimmediate_operand" "")) (clobber (reg:CC 17))] "reload_completed && SSE_REG_P (operands[0])" [(set (subreg:TI (match_dup 0) 0) - (xor:TI (subreg:TI (match_dup 1) 0) - (subreg:TI (match_dup 2) 0)))] + (xor:TI (match_dup 1) + (match_dup 2)))] { + operands[1] = simplify_gen_subreg (TImode, operands[1], SFmode, 0); + operands[2] = simplify_gen_subreg (TImode, operands[2], V4SFmode, 0); if (operands_match_p (operands[0], operands[2])) { rtx tmp; @@ -9403,7 +9631,7 @@ { /* Using 
SSE is tricky, since we need bitwise negation of -0 in register. */ - rtx reg = gen_reg_rtx (DFmode); + rtx reg; #if HOST_BITS_PER_WIDE_INT >= 64 rtx imm = gen_int_mode (((HOST_WIDE_INT)1) << 63, DImode); #else @@ -9413,7 +9641,10 @@ operands[1] = force_reg (DFmode, operands[1]); operands[0] = force_reg (DFmode, operands[0]); - emit_move_insn (reg, gen_lowpart (DFmode, imm)); + imm = gen_lowpart (DFmode, imm); + reg = force_reg (V2DFmode, + gen_rtx_CONST_VECTOR (V2DFmode, + gen_rtvec (2, imm, CONST0_RTX (DFmode)))); emit_insn (gen_negdf2_ifs (operands[0], operands[1], reg)); if (dest != operands[0]) emit_move_insn (dest, operands[0]); @@ -9432,7 +9663,7 @@ (define_insn "negdf2_ifs" [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,Y#fr,f#Yr,rm#Yf") (neg:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y#fr,0,0"))) - (use (match_operand:DF 2 "nonmemory_operand" "Y,0,*g#Y,*g#Y")) + (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,0,Ym*r,Ym*r")) (clobber (reg:CC 17))] "!TARGET_64BIT && TARGET_SSE2 && (reload_in_progress || reload_completed @@ -9442,8 +9673,8 @@ (define_insn "*negdf2_ifs_rex64" [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#f,Y#f,fm#Y") - (neg:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y#f,0"))) - (use (match_operand:DF 2 "general_operand" "Y,0,*g#Y*r")) + (neg:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y#fr,0"))) + (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,0,Ym*r")) (clobber (reg:CC 17))] "TARGET_64BIT && TARGET_SSE2 && (reload_in_progress || reload_completed @@ -9454,7 +9685,7 @@ (define_split [(set (match_operand:DF 0 "memory_operand" "") (neg:DF (match_operand:DF 1 "memory_operand" ""))) - (use (match_operand:DF 2 "" "")) + (use (match_operand:V2DF 2 "" "")) (clobber (reg:CC 17))] "" [(parallel [(set (match_dup 0) @@ -9464,7 +9695,7 @@ (define_split [(set (match_operand:DF 0 "register_operand" "") (neg:DF (match_operand:DF 1 "register_operand" ""))) - (use (match_operand:DF 2 "" "")) + (use 
(match_operand:V2DF 2 "" "")) (clobber (reg:CC 17))] "reload_completed && !SSE_REG_P (operands[0]) && (!TARGET_64BIT || FP_REG_P (operands[0]))" @@ -9475,7 +9706,7 @@ (define_split [(set (match_operand:DF 0 "register_operand" "") (neg:DF (match_operand:DF 1 "register_operand" ""))) - (use (match_operand:DF 2 "" "")) + (use (match_operand:V2DF 2 "" "")) (clobber (reg:CC 17))] "TARGET_64BIT && reload_completed && GENERAL_REG_P (operands[0])" [(parallel [(set (match_dup 0) @@ -9488,13 +9719,19 @@ (define_split [(set (match_operand:DF 0 "register_operand" "") (neg:DF (match_operand:DF 1 "register_operand" ""))) - (use (match_operand:DF 2 "register_operand" "")) + (use (match_operand:V2DF 2 "nonimmediate_operand" "")) (clobber (reg:CC 17))] "reload_completed && SSE_REG_P (operands[0])" [(set (subreg:TI (match_dup 0) 0) - (xor:TI (subreg:TI (match_dup 1) 0) - (subreg:TI (match_dup 2) 0)))] + (xor:TI (match_dup 1) + (match_dup 2)))] { + operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0); + operands[1] = simplify_gen_subreg (TImode, operands[1], DFmode, 0); + operands[2] = simplify_gen_subreg (TImode, operands[2], V2DFmode, 0); + /* Avoid possible reformating on the operands. */ + if (TARGET_SSE_PARTIAL_REGS && !optimize_size) + emit_insn (gen_sse2_unpcklpd (operands[0], operands[0], operands[0])); if (operands_match_p (operands[0], operands[2])) { rtx tmp; @@ -9727,14 +9964,18 @@ { /* Using SSE is tricky, since we need bitwise negation of -0 in register. 
*/ - rtx reg = gen_reg_rtx (SFmode); + rtx reg = gen_reg_rtx (V4SFmode); rtx dest = operands[0]; + rtx imm; operands[1] = force_reg (SFmode, operands[1]); operands[0] = force_reg (SFmode, operands[0]); - emit_move_insn (reg, - gen_lowpart (SFmode, - gen_int_mode (0x80000000, SImode))); + imm = gen_lowpart (SFmode, gen_int_mode(~0x80000000, SImode)); + reg = force_reg (V4SFmode, + gen_rtx_CONST_VECTOR (V4SFmode, + gen_rtvec (4, imm, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), + CONST0_RTX (SFmode)))); emit_insn (gen_abssf2_ifs (operands[0], operands[1], reg)); if (dest != operands[0]) emit_move_insn (dest, operands[0]); @@ -9751,20 +9992,20 @@ "#") (define_insn "abssf2_ifs" - [(set (match_operand:SF 0 "nonimmediate_operand" "=x#fr,f#xr,rm#xf") - (abs:SF (match_operand:SF 1 "nonimmediate_operand" "x,0,0"))) - (use (match_operand:SF 2 "nonmemory_operand" "*0#x,*g#x,*g#x")) + [(set (match_operand:SF 0 "nonimmediate_operand" "=x#fr,x#fr,f#xr,rm#xf") + (abs:SF (match_operand:SF 1 "nonimmediate_operand" "0,x#fr,0,0"))) + (use (match_operand:V4SF 2 "nonimmediate_operand" "xm,0,xm*r,xm*r")) (clobber (reg:CC 17))] "TARGET_SSE && (reload_in_progress || reload_completed || (register_operand (operands[0], VOIDmode) - && register_operand (operands[1], VOIDmode)))" + && register_operand (operands[1], VOIDmode)))" "#") (define_split [(set (match_operand:SF 0 "memory_operand" "") (abs:SF (match_operand:SF 1 "memory_operand" ""))) - (use (match_operand:SF 2 "" "")) + (use (match_operand:V4SF 2 "" "")) (clobber (reg:CC 17))] "" [(parallel [(set (match_dup 0) @@ -9774,7 +10015,7 @@ (define_split [(set (match_operand:SF 0 "register_operand" "") (abs:SF (match_operand:SF 1 "register_operand" ""))) - (use (match_operand:SF 2 "" "")) + (use (match_operand:V4SF 2 "" "")) (clobber (reg:CC 17))] "reload_completed && !SSE_REG_P (operands[0])" [(parallel [(set (match_dup 0) @@ -9784,12 +10025,23 @@ (define_split [(set (match_operand:SF 0 "register_operand" "") (abs:SF (match_operand:SF 1 
"register_operand" ""))) - (use (match_operand:SF 2 "register_operand" "")) + (use (match_operand:V4SF 2 "nonimmediate_operand" "")) (clobber (reg:CC 17))] "reload_completed && SSE_REG_P (operands[0])" [(set (subreg:TI (match_dup 0) 0) - (and:TI (not:TI (subreg:TI (match_dup 2) 0)) - (subreg:TI (match_dup 1) 0)))]) + (and:TI (match_dup 1) + (match_dup 2)))] +{ + operands[1] = simplify_gen_subreg (TImode, operands[1], SFmode, 0); + operands[2] = simplify_gen_subreg (TImode, operands[2], V4SFmode, 0); + if (operands_match_p (operands[0], operands[2])) + { + rtx tmp; + tmp = operands[1]; + operands[1] = operands[2]; + operands[2] = tmp; + } +}) ;; Keep 'f' and 'r' in separate alternatives to avoid reload problems ;; because of secondary memory needed to reload from class FLOAT_INT_REGS @@ -9852,17 +10104,22 @@ { /* Using SSE is tricky, since we need bitwise negation of -0 in register. */ - rtx reg = gen_reg_rtx (DFmode); + rtx reg = gen_reg_rtx (V2DFmode); #if HOST_BITS_PER_WIDE_INT >= 64 - rtx imm = gen_int_mode (((HOST_WIDE_INT)1) << 63, DImode); + rtx imm = gen_int_mode (~(((HOST_WIDE_INT)1) << 63), DImode); #else - rtx imm = immed_double_const (0, 0x80000000, DImode); + rtx imm = immed_double_const (~0, ~0x80000000, DImode); #endif rtx dest = operands[0]; operands[1] = force_reg (DFmode, operands[1]); operands[0] = force_reg (DFmode, operands[0]); - emit_move_insn (reg, gen_lowpart (DFmode, imm)); + + /* Produce LONG_DOUBLE with the proper immediate argument. 
*/ + imm = gen_lowpart (DFmode, imm); + reg = force_reg (V2DFmode, + gen_rtx_CONST_VECTOR (V2DFmode, + gen_rtvec (2, imm, CONST0_RTX (DFmode)))); emit_insn (gen_absdf2_ifs (operands[0], operands[1], reg)); if (dest != operands[0]) emit_move_insn (dest, operands[0]); @@ -9879,9 +10136,9 @@ "#") (define_insn "absdf2_ifs" - [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,mf#Yr,mr#Yf") - (abs:DF (match_operand:DF 1 "nonimmediate_operand" "Y,0,0"))) - (use (match_operand:DF 2 "nonmemory_operand" "*0#Y,*g#Y,*g#Y")) + [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,Y#fr,mf#Yr,mr#Yf") + (abs:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y#fr,0,0"))) + (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,0,Ym*r,Ym*r")) (clobber (reg:CC 17))] "!TARGET_64BIT && TARGET_SSE2 && (reload_in_progress || reload_completed @@ -9890,9 +10147,9 @@ "#") (define_insn "*absdf2_ifs_rex64" - [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,mf#Yr") - (abs:DF (match_operand:DF 1 "nonimmediate_operand" "Y,0"))) - (use (match_operand:DF 2 "nonmemory_operand" "*0#Y,*g#Y")) + [(set (match_operand:DF 0 "nonimmediate_operand" "=Y#fr,Y#fr,mf#Yr") + (abs:DF (match_operand:DF 1 "nonimmediate_operand" "0,Y#fr,0"))) + (use (match_operand:V2DF 2 "nonimmediate_operand" "Ym,0,Ym*r")) (clobber (reg:CC 17))] "TARGET_64BIT && TARGET_SSE2 && (reload_in_progress || reload_completed @@ -9903,7 +10160,7 @@ (define_split [(set (match_operand:DF 0 "memory_operand" "") (abs:DF (match_operand:DF 1 "memory_operand" ""))) - (use (match_operand:DF 2 "" "")) + (use (match_operand:V2DF 2 "" "")) (clobber (reg:CC 17))] "" [(parallel [(set (match_dup 0) @@ -9913,7 +10170,7 @@ (define_split [(set (match_operand:DF 0 "register_operand" "") (abs:DF (match_operand:DF 1 "register_operand" ""))) - (use (match_operand:DF 2 "" "")) + (use (match_operand:V2DF 2 "" "")) (clobber (reg:CC 17))] "reload_completed && !SSE_REG_P (operands[0])" [(parallel [(set (match_dup 0) @@ -9923,12 +10180,27 @@ (define_split 
[(set (match_operand:DF 0 "register_operand" "") (abs:DF (match_operand:DF 1 "register_operand" ""))) - (use (match_operand:DF 2 "register_operand" "")) + (use (match_operand:V2DF 2 "nonimmediate_operand" "")) (clobber (reg:CC 17))] "reload_completed && SSE_REG_P (operands[0])" [(set (subreg:TI (match_dup 0) 0) - (and:TI (not:TI (subreg:TI (match_dup 2) 0)) - (subreg:TI (match_dup 1) 0)))]) + (and:TI (match_dup 1) + (match_dup 2)))] +{ + operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0); + operands[1] = simplify_gen_subreg (TImode, operands[1], DFmode, 0); + operands[2] = simplify_gen_subreg (TImode, operands[2], V2DFmode, 0); + /* Avoid possible reformating on the operands. */ + if (TARGET_SSE_PARTIAL_REGS && !optimize_size) + emit_insn (gen_sse2_unpcklpd (operands[0], operands[0], operands[0])); + if (operands_match_p (operands[0], operands[2])) + { + rtx tmp; + tmp = operands[1]; + operands[1] = operands[2]; + operands[2] = tmp; + } +}) ;; Keep 'f' and 'r' in separate alternatives to avoid reload problems @@ -13367,7 +13639,7 @@ (match_operand:SI 3 "" "")))])] "!TARGET_64BIT" { - ix86_expand_call (NULL, operands[0], operands[1], operands[2], operands[3]); + ix86_expand_call (NULL, operands[0], operands[1], operands[2], operands[3], 0); DONE; }) @@ -13412,7 +13684,17 @@ (use (match_operand 2 "" ""))] "" { - ix86_expand_call (NULL, operands[0], operands[1], operands[2], NULL); + ix86_expand_call (NULL, operands[0], operands[1], operands[2], NULL, 0); + DONE; +}) + +(define_expand "sibcall" + [(call (match_operand:QI 0 "" "") + (match_operand 1 "" "")) + (use (match_operand 2 "" ""))] + "" +{ + ix86_expand_call (NULL, operands[0], operands[1], operands[2], NULL, 1); DONE; }) @@ -13431,41 +13713,51 @@ (define_insn "*call_1" [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "rsm")) (match_operand 1 "" ""))] - "!TARGET_64BIT" + "!SIBLING_CALL_P (insn) && !TARGET_64BIT" { if (constant_call_address_operand (operands[0], QImode)) - { - if 
(SIBLING_CALL_P (insn)) - return "jmp\t%P0"; - else - return "call\t%P0"; - } - if (SIBLING_CALL_P (insn)) - return "jmp\t%A0"; - else - return "call\t%A0"; + return "call\t%P0"; + return "call\t%A0"; +} + [(set_attr "type" "call")]) + +(define_insn "*sibcall_1" + [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "s,c,d,a")) + (match_operand 1 "" ""))] + "SIBLING_CALL_P (insn) && !TARGET_64BIT" +{ + if (constant_call_address_operand (operands[0], QImode)) + return "jmp\t%P0"; + return "jmp\t%A0"; } [(set_attr "type" "call")]) (define_insn "*call_1_rex64" [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rsm")) (match_operand 1 "" ""))] - "TARGET_64BIT" + "!SIBLING_CALL_P (insn) && TARGET_64BIT" { if (constant_call_address_operand (operands[0], QImode)) - { - if (SIBLING_CALL_P (insn)) - return "jmp\t%P0"; - else - return "call\t%P0"; - } - if (SIBLING_CALL_P (insn)) - return "jmp\t%A0"; - else - return "call\t%A0"; + return "call\t%P0"; + return "call\t%A0"; } [(set_attr "type" "call")]) +(define_insn "*sibcall_1_rex64" + [(call (mem:QI (match_operand:DI 0 "constant_call_address_operand" "")) + (match_operand 1 "" ""))] + "SIBLING_CALL_P (insn) && TARGET_64BIT" + "jmp\t%P0" + [(set_attr "type" "call")]) + +(define_insn "*sibcall_1_rex64_v" + [(call (mem:QI (reg:DI 40)) + (match_operand 0 "" ""))] + "SIBLING_CALL_P (insn) && TARGET_64BIT" + "jmp\t*%%r11" + [(set_attr "type" "call")]) + + ;; Call subroutine, returning value in operand 0 (define_expand "call_value_pop" @@ -13478,7 +13770,7 @@ "!TARGET_64BIT" { ix86_expand_call (operands[0], operands[1], operands[2], - operands[3], operands[4]); + operands[3], operands[4], 0); DONE; }) @@ -13490,7 +13782,19 @@ ;; Operand 2 not used on the i386. 
"" { - ix86_expand_call (operands[0], operands[1], operands[2], operands[3], NULL); + ix86_expand_call (operands[0], operands[1], operands[2], operands[3], NULL, 0); + DONE; +}) + +(define_expand "sibcall_value" + [(set (match_operand 0 "" "") + (call (match_operand:QI 1 "" "") + (match_operand:SI 2 "" ""))) + (use (match_operand:SI 3 "" ""))] + ;; Operand 2 not used on the i386. + "" +{ + ix86_expand_call (operands[0], operands[1], operands[2], operands[3], NULL, 1); DONE; }) @@ -13513,7 +13817,7 @@ ix86_expand_call ((TARGET_FLOAT_RETURNS_IN_80387 ? gen_rtx_REG (XCmode, FIRST_FLOAT_REG) : NULL), operands[0], const0_rtx, GEN_INT (SSE_REGPARM_MAX - 1), - NULL); + NULL, 0); for (i = 0; i < XVECLEN (operands[2], 0); i++) { @@ -13662,11 +13966,7 @@ (clobber (mem:BLK (scratch)))] "!TARGET_64BIT" "leave" - [(set_attr "length_immediate" "0") - (set_attr "length" "1") - (set_attr "modrm" "0") - (set_attr "athlon_decode" "vector") - (set_attr "ppro_uops" "few")]) + [(set_attr "type" "leave")]) (define_insn "leave_rex64" [(set (reg:DI 7) (plus:DI (reg:DI 6) (const_int 8))) @@ -13674,11 +13974,7 @@ (clobber (mem:BLK (scratch)))] "TARGET_64BIT" "leave" - [(set_attr "length_immediate" "0") - (set_attr "length" "1") - (set_attr "modrm" "0") - (set_attr "athlon_decode" "vector") - (set_attr "ppro_uops" "few")]) + [(set_attr "type" "leave")]) (define_expand "ffssi2" [(set (match_operand:SI 0 "nonimmediate_operand" "") @@ -14312,6 +14608,24 @@ (const_string "fop"))) (set_attr "mode" "SF")]) +(define_insn "*fop_df_6" + [(set (match_operand:DF 0 "register_operand" "=f,f") + (match_operator:DF 3 "binary_fp_operator" + [(float_extend:DF + (match_operand:SF 1 "register_operand" "0,f")) + (float_extend:DF + (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))] + "TARGET_80387 && !(TARGET_SSE2 && TARGET_SSE_MATH)" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:DF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:DF 3 
"div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "mode" "SF")]) + (define_insn "*fop_xf_1" [(set (match_operand:XF 0 "register_operand" "=f,f") (match_operator:XF 3 "binary_fp_operator" @@ -14421,7 +14735,7 @@ (define_insn "*fop_xf_4" [(set (match_operand:XF 0 "register_operand" "=f,f") (match_operator:XF 3 "binary_fp_operator" - [(float_extend:XF (match_operand:SF 1 "nonimmediate_operand" "fm,0")) + [(float_extend:XF (match_operand 1 "nonimmediate_operand" "fm,0")) (match_operand:XF 2 "register_operand" "0,f")]))] "!TARGET_64BIT && TARGET_80387" "* return output_387_binary_op (insn, operands);" @@ -14437,7 +14751,7 @@ (define_insn "*fop_tf_4" [(set (match_operand:TF 0 "register_operand" "=f,f") (match_operator:TF 3 "binary_fp_operator" - [(float_extend:TF (match_operand:SF 1 "nonimmediate_operand" "fm,0")) + [(float_extend:TF (match_operand 1 "nonimmediate_operand" "fm,0")) (match_operand:TF 2 "register_operand" "0,f")]))] "TARGET_80387" "* return output_387_binary_op (insn, operands);" @@ -14455,7 +14769,7 @@ (match_operator:XF 3 "binary_fp_operator" [(match_operand:XF 1 "register_operand" "0,f") (float_extend:XF - (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))] + (match_operand 2 "nonimmediate_operand" "fm,0"))]))] "!TARGET_64BIT && TARGET_80387" "* return output_387_binary_op (insn, operands);" [(set (attr "type") @@ -14472,7 +14786,7 @@ (match_operator:TF 3 "binary_fp_operator" [(match_operand:TF 1 "register_operand" "0,f") (float_extend:TF - (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))] + (match_operand 2 "nonimmediate_operand" "fm,0"))]))] "TARGET_80387" "* return output_387_binary_op (insn, operands);" [(set (attr "type") @@ -14487,41 +14801,10 @@ (define_insn "*fop_xf_6" [(set (match_operand:XF 0 "register_operand" "=f,f") (match_operator:XF 3 "binary_fp_operator" - [(float_extend:XF (match_operand:DF 1 "nonimmediate_operand" "fm,0")) - (match_operand:XF 2 "register_operand" "0,f")]))] - "!TARGET_64BIT 
&& TARGET_80387" - "* return output_387_binary_op (insn, operands);" - [(set (attr "type") - (cond [(match_operand:XF 3 "mult_operator" "") - (const_string "fmul") - (match_operand:XF 3 "div_operator" "") - (const_string "fdiv") - ] - (const_string "fop"))) - (set_attr "mode" "DF")]) - -(define_insn "*fop_tf_6" - [(set (match_operand:TF 0 "register_operand" "=f,f") - (match_operator:TF 3 "binary_fp_operator" - [(float_extend:TF (match_operand:DF 1 "nonimmediate_operand" "fm,0")) - (match_operand:TF 2 "register_operand" "0,f")]))] - "TARGET_80387" - "* return output_387_binary_op (insn, operands);" - [(set (attr "type") - (cond [(match_operand:TF 3 "mult_operator" "") - (const_string "fmul") - (match_operand:TF 3 "div_operator" "") - (const_string "fdiv") - ] - (const_string "fop"))) - (set_attr "mode" "DF")]) - -(define_insn "*fop_xf_7" - [(set (match_operand:XF 0 "register_operand" "=f,f") - (match_operator:XF 3 "binary_fp_operator" - [(match_operand:XF 1 "register_operand" "0,f") + [(float_extend:XF + (match_operand 1 "register_operand" "0,f")) (float_extend:XF - (match_operand:DF 2 "nonimmediate_operand" "fm,0"))]))] + (match_operand 2 "nonimmediate_operand" "fm,0"))]))] "!TARGET_64BIT && TARGET_80387" "* return output_387_binary_op (insn, operands);" [(set (attr "type") @@ -14531,14 +14814,15 @@ (const_string "fdiv") ] (const_string "fop"))) - (set_attr "mode" "DF")]) + (set_attr "mode" "SF")]) -(define_insn "*fop_tf_7" +(define_insn "*fop_tf_6" [(set (match_operand:TF 0 "register_operand" "=f,f") (match_operator:TF 3 "binary_fp_operator" - [(match_operand:TF 1 "register_operand" "0,f") + [(float_extend:TF + (match_operand 1 "register_operand" "0,f")) (float_extend:TF - (match_operand:DF 2 "nonimmediate_operand" "fm,0"))]))] + (match_operand 2 "nonimmediate_operand" "fm,0"))]))] "TARGET_80387" "* return output_387_binary_op (insn, operands);" [(set (attr "type") @@ -14548,7 +14832,7 @@ (const_string "fdiv") ] (const_string "fop"))) - (set_attr "mode" "DF")]) + 
(set_attr "mode" "SF")]) (define_split [(set (match_operand 0 "register_operand" "") @@ -15949,9 +16233,9 @@ (define_expand "movhicc" [(set (match_operand:HI 0 "register_operand" "") (if_then_else:HI (match_operand 1 "comparison_operator" "") - (match_operand:HI 2 "nonimmediate_operand" "") - (match_operand:HI 3 "nonimmediate_operand" "")))] - "TARGET_CMOVE && TARGET_HIMODE_MATH" + (match_operand:HI 2 "general_operand" "") + (match_operand:HI 3 "general_operand" "")))] + "TARGET_HIMODE_MATH" "if (!ix86_expand_int_movcc (operands)) FAIL; DONE;") (define_insn "*movhicc_noc" @@ -15968,6 +16252,33 @@ [(set_attr "type" "icmov") (set_attr "mode" "HI")]) +(define_expand "movqicc" + [(set (match_operand:QI 0 "register_operand" "") + (if_then_else:QI (match_operand 1 "comparison_operator" "") + (match_operand:QI 2 "general_operand" "") + (match_operand:QI 3 "general_operand" "")))] + "TARGET_QIMODE_MATH" + "if (!ix86_expand_int_movcc (operands)) FAIL; DONE;") + +(define_insn_and_split "*movqicc_noc" + [(set (match_operand:QI 0 "register_operand" "=r,r") + (if_then_else:QI (match_operator 1 "ix86_comparison_operator" + [(match_operand 4 "flags_reg_operand" "") (const_int 0)]) + (match_operand:QI 2 "register_operand" "r,0") + (match_operand:QI 3 "register_operand" "0,r")))] + "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL" + "#" + "&& reload_completed" + [(set (match_dup 0) + (if_then_else:SI (match_op_dup 1 [(match_dup 4) (const_int 0)]) + (match_dup 2) + (match_dup 3)))] + "operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_lowpart (SImode, operands[2]); + operands[3] = gen_lowpart (SImode, operands[3]);" + [(set_attr "type" "icmov") + (set_attr "mode" "SI")]) + (define_expand "movsfcc" [(set (match_operand:SF 0 "register_operand" "") (if_then_else:SF (match_operand 1 "comparison_operator" "") @@ -16035,7 +16346,7 @@ (define_split [(set (match_operand:DF 0 "register_and_not_any_fp_reg_operand" "") (if_then_else:DF (match_operator 1 "fcmov_comparison_operator" 
- [(match_operand 4 "" "") (const_int 0)]) + [(match_operand 4 "flags_reg_operand" "") (const_int 0)]) (match_operand:DF 2 "nonimmediate_operand" "") (match_operand:DF 3 "nonimmediate_operand" "")))] "!TARGET_64BIT && reload_completed" @@ -16549,6 +16860,12 @@ (clobber (reg:CC 17))] "TARGET_SSE && (GET_CODE (operands[2]) != MEM || GET_CODE (operands[3]) != MEM) + /* Avoid combine from being smart and converting min/max + instruction patterns into conditional moves. */ + && ((GET_CODE (operands[1]) != LT && GET_CODE (operands[1]) != GT + && GET_CODE (operands[1]) != UNLE && GET_CODE (operands[1]) != UNGE) + || !rtx_equal_p (operands[4], operands[2]) + || !rtx_equal_p (operands[5], operands[3])) && (!TARGET_IEEE_FP || (GET_CODE (operands[1]) != EQ && GET_CODE (operands[1]) != NE))" "#") @@ -16576,6 +16893,12 @@ (clobber (reg:CC 17))] "TARGET_SSE2 && (GET_CODE (operands[2]) != MEM || GET_CODE (operands[3]) != MEM) + /* Avoid combine from being smart and converting min/max + instruction patterns into conditional moves. */ + && ((GET_CODE (operands[1]) != LT && GET_CODE (operands[1]) != GT + && GET_CODE (operands[1]) != UNLE && GET_CODE (operands[1]) != UNGE) + || !rtx_equal_p (operands[4], operands[2]) + || !rtx_equal_p (operands[5], operands[3])) && (!TARGET_IEEE_FP || (GET_CODE (operands[1]) != EQ && GET_CODE (operands[1]) != NE))" "#") @@ -16637,6 +16960,14 @@ (set (subreg:TI (match_dup 0) 0) (ior:TI (subreg:TI (match_dup 6) 0) (subreg:TI (match_dup 7) 0)))] { + if (GET_MODE (operands[2]) == DFmode + && TARGET_SSE_PARTIAL_REGS && !optimize_size) + { + rtx op = simplify_gen_subreg (V2DFmode, operands[2], DFmode, 0); + emit_insn (gen_sse2_unpcklpd (op, op, op)); + op = simplify_gen_subreg (V2DFmode, operands[3], DFmode, 0); + emit_insn (gen_sse2_unpcklpd (op, op, op)); + } /* If op2 == op3, op3 will be clobbered before it is used. This should be optimized out though. 
*/ if (operands_match_p (operands[2], operands[3])) @@ -16743,8 +17074,22 @@ || const0_operand (operands[3], GET_MODE (operands[0])))" [(set (match_dup 0) (match_op_dup 1 [(match_dup 0) (match_dup 5)])) (set (subreg:TI (match_dup 0) 0) (and:TI (match_dup 6) - (subreg:TI (match_dup 7) 0)))] + (match_dup 7)))] { + if (TARGET_SSE_PARTIAL_REGS && !optimize_size + && GET_MODE (operands[2]) == DFmode) + { + if (REG_P (operands[2])) + { + rtx op = simplify_gen_subreg (V2DFmode, operands[2], DFmode, 0); + emit_insn (gen_sse2_unpcklpd (op, op, op)); + } + if (REG_P (operands[3])) + { + rtx op = simplify_gen_subreg (V2DFmode, operands[3], DFmode, 0); + emit_insn (gen_sse2_unpcklpd (op, op, op)); + } + } PUT_MODE (operands[1], GET_MODE (operands[0])); if (!sse_comparison_operator (operands[1], VOIDmode)) { @@ -16764,6 +17109,8 @@ operands[7] = operands[2]; operands[6] = gen_rtx_SUBREG (TImode, operands[0], 0); } + operands[7] = simplify_gen_subreg (TImode, operands[7], + GET_MODE (operands[7]), 0); }) (define_expand "allocate_stack_worker" @@ -17759,19 +18106,23 @@ [(set (match_operand 0 "" "") (call (mem:QI (match_operand:SI 1 "call_insn_operand" "rsm")) (match_operand:SI 2 "" "")))] - "!TARGET_64BIT" + "!SIBLING_CALL_P (insn) && !TARGET_64BIT" { if (constant_call_address_operand (operands[1], QImode)) - { - if (SIBLING_CALL_P (insn)) - return "jmp\t%P1"; - else - return "call\t%P1"; - } - if (SIBLING_CALL_P (insn)) - return "jmp\t%*%1"; - else - return "call\t%*%1"; + return "call\t%P1"; + return "call\t%*%1"; +} + [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_1" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "s,c,d,a")) + (match_operand:SI 2 "" "")))] + "SIBLING_CALL_P (insn) && !TARGET_64BIT" +{ + if (constant_call_address_operand (operands[1], QImode)) + return "jmp\t%P1"; + return "jmp\t%*%1"; } [(set_attr "type" "callv")]) @@ -17779,21 +18130,29 @@ [(set (match_operand 0 "" "") (call (mem:QI (match_operand:DI 1 
"call_insn_operand" "rsm")) (match_operand:DI 2 "" "")))] - "TARGET_64BIT" + "!SIBLING_CALL_P (insn) && TARGET_64BIT" { if (constant_call_address_operand (operands[1], QImode)) - { - if (SIBLING_CALL_P (insn)) - return "jmp\t%P1"; - else - return "call\t%P1"; - } - if (SIBLING_CALL_P (insn)) - return "jmp\t%A1"; - else - return "call\t%A1"; + return "call\t%P1"; + return "call\t%A1"; } [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_1_rex64" + [(set (match_operand 0 "" "") + (call (mem:QI (match_operand:DI 1 "constant_call_address_operand" "")) + (match_operand:DI 2 "" "")))] + "SIBLING_CALL_P (insn) && TARGET_64BIT" + "jmp\t%P1" + [(set_attr "type" "callv")]) + +(define_insn "*sibcall_value_1_rex64_v" + [(set (match_operand 0 "" "") + (call (mem:QI (reg:DI 40)) + (match_operand:DI 1 "" "")))] + "SIBLING_CALL_P (insn) && TARGET_64BIT" + "jmp\t*%%r11" + [(set_attr "type" "callv")]) (define_insn "trap" [(trap_if (const_int 1) (const_int 5))] @@ -17838,7 +18197,7 @@ { operands[2] = gen_label_rtx (); output_asm_insn ("j%c0\t%l2\; int\t%1", operands); - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", + (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (operands[2])); RET; }) @@ -17851,28 +18210,93 @@ [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,m") (match_operand:V4SF 1 "nonimmediate_operand" "xm,x"))] "TARGET_SSE" - ;; @@@ let's try to use movaps here. 
"movaps\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "mode" "V4SF")]) +(define_split + [(set (match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "zero_extended_scalar_load_operand" ""))] + "TARGET_SSE" + [(set (match_dup 0) + (vec_merge:V4SF + (vec_duplicate:V4SF (match_dup 1)) + (match_dup 2) + (const_int 1)))] +{ + operands[1] = simplify_gen_subreg (SFmode, operands[1], V4SFmode, 0); + operands[2] = CONST0_RTX (V4SFmode); +}) + (define_insn "movv4si_internal" [(set (match_operand:V4SI 0 "nonimmediate_operand" "=x,m") (match_operand:V4SI 1 "nonimmediate_operand" "xm,x"))] "TARGET_SSE" - ;; @@@ let's try to use movaps here. - "movaps\t{%1, %0|%0, %1}" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; +} [(set_attr "type" "ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "TI")))]) (define_insn "movv2di_internal" [(set (match_operand:V2DI 0 "nonimmediate_operand" "=x,m") (match_operand:V2DI 1 "nonimmediate_operand" "xm,x"))] - "TARGET_SSE" - ;; @@@ let's try to use movaps here. 
- "movdqa\t{%1, %0|%0, %1}" + "TARGET_SSE2" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; +} [(set_attr "type" "ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "TI")))]) + +(define_split + [(set (match_operand:V2DF 0 "register_operand" "") + (match_operand:V2DF 1 "zero_extended_scalar_load_operand" ""))] + "TARGET_SSE2" + [(set (match_dup 0) + (vec_merge:V2DF + (vec_duplicate:V2DF (match_dup 1)) + (match_dup 2) + (const_int 1)))] +{ + operands[1] = simplify_gen_subreg (DFmode, operands[1], V2DFmode, 0); + operands[2] = CONST0_RTX (V2DFmode); +}) (define_insn "movv8qi_internal" [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m") @@ -17922,28 +18346,85 @@ [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,m") (match_operand:V2DF 1 "nonimmediate_operand" "xm,x"))] "TARGET_SSE2" - ;; @@@ let's try to use movaps here. 
- "movapd\t{%1, %0|%0, %1}" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movapd\t{%1, %0|%0, %1}"; +} [(set_attr "type" "ssemov") - (set_attr "mode" "V2DF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "V2DF")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "V2DF"))] + (const_string "V2DF")))]) (define_insn "movv8hi_internal" [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") (match_operand:V8HI 1 "nonimmediate_operand" "xm,x"))] "TARGET_SSE2" - ;; @@@ let's try to use movaps here. - "movaps\t{%1, %0|%0, %1}" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; +} [(set_attr "type" "ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "TI")))]) (define_insn "movv16qi_internal" [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") (match_operand:V16QI 1 "nonimmediate_operand" "xm,x"))] "TARGET_SSE2" - ;; @@@ let's try to use movaps here. 
- "movaps\t{%1, %0|%0, %1}" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; +} [(set_attr "type" "ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "1") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "TI")))]) (define_expand "movv2df" [(set (match_operand:V2DF 0 "general_operand" "") @@ -18160,26 +18641,83 @@ [(set (match_operand:TI 0 "nonimmediate_operand" "=x,x,m") (match_operand:TI 1 "general_operand" "C,xm,x"))] "TARGET_SSE && !TARGET_64BIT" - "@ - xorps\t%0, %0 - movaps\t{%1, %0|%0, %1} - movaps\t{%1, %0|%0, %1}" +{ + switch (which_alternative) + { + case 0: + if (get_attr_mode (insn) == MODE_V4SF) + return "xorps\t%0, %0"; + else + return "pxor\t%0, %0"; + case 1: + case 2: + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; + default: + abort (); + } +} [(set_attr "type" "ssemov,ssemov,ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "0,1") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "2") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI"))] + (const_string "TI")))]) (define_insn "*movti_rex64" - [(set (match_operand:TI 0 "nonimmediate_operand" "=r,o,x,mx,x") - (match_operand:TI 1 "general_operand" "riFo,riF,O,x,m"))] + [(set (match_operand:TI 0 "nonimmediate_operand" "=r,o,x,x,xm") + (match_operand:TI 1 "general_operand" "riFo,riF,O,xm,x"))] "TARGET_64BIT && (GET_CODE (operands[0]) != 
MEM || GET_CODE (operands[1]) != MEM)" - "@ - # - # - xorps\t%0, %0 - movaps\\t{%1, %0|%0, %1} - movaps\\t{%1, %0|%0, %1}" +{ + switch (which_alternative) + { + case 0: + case 1: + return "#"; + case 2: + if (get_attr_mode (insn) == MODE_V4SF) + return "xorps\t%0, %0"; + else + return "pxor\t%0, %0"; + case 3: + case 4: + if (get_attr_mode (insn) == MODE_V4SF) + return "movaps\t{%1, %0|%0, %1}"; + else + return "movdqa\t{%1, %0|%0, %1}"; + default: + abort (); + } +} [(set_attr "type" "*,*,ssemov,ssemov,ssemov") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (cond [(eq_attr "alternative" "2,3") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")) + (eq_attr "alternative" "4") + (if_then_else + (ior (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES") + (const_int 0)) + (ne (symbol_ref "optimize_size") + (const_int 0))) + (const_string "V4SF") + (const_string "TI"))] + (const_string "DI")))]) (define_split [(set (match_operand:TI 0 "nonimmediate_operand" "") @@ -18329,11 +18867,21 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "V4SF")]) -(define_insn "sse_loadss" +(define_expand "sse_loadss" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:SF 1 "memory_operand" "")] + "TARGET_SSE" +{ + emit_insn (gen_sse_loadss_1 (operands[0], operands[1], + CONST0_RTX (V4SFmode))); + DONE; +}) + +(define_insn "sse_loadss_1" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF - (match_operand:V4SF 1 "memory_operand" "m") - (vec_duplicate:V4SF (float:SF (const_int 0))) + (vec_duplicate:V4SF (match_operand:SF 1 "memory_operand" "m")) + (match_operand:V4SF 2 "const0_operand" "X") (const_int 1)))] "TARGET_SSE" "movss\t{%1, %0|%0, %1}" @@ -18856,12 +19404,26 @@ ;; this insn. 
(define_insn "sse_clrv4sf" [(set (match_operand:V4SF 0 "register_operand" "=x") - (unspec:V4SF [(const_int 0)] UNSPEC_NOP))] + (match_operand:V4SF 1 "const0_operand" "X"))] "TARGET_SSE" - "xorps\t{%0, %0|%0, %0}" +{ + if (get_attr_mode (insn) == MODE_TI) + return "pxor\t{%0, %0|%0, %0}"; + else + return "xorps\t{%0, %0|%0, %0}"; +} [(set_attr "type" "sselog") (set_attr "memory" "none") - (set_attr "mode" "V4SF")]) + (set (attr "mode") + (if_then_else + (and (and (ne (symbol_ref "TARGET_SSE_LOAD0_BY_PXOR") + (const_int 0)) + (ne (symbol_ref "TARGET_SSE2") + (const_int 0))) + (eq (symbol_ref "optimize_size") + (const_int 0))) + (const_string "TI") + (const_string "V4SF")))]) ;; Use xor, but don't show input operands so they aren't live before ;; this insn. @@ -19093,6 +19655,18 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "SF")]) +(define_insn "cvtsi2ssq" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (match_operand:V4SF 1 "register_operand" "0") + (vec_duplicate:V4SF + (float:SF (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (const_int 14)))] + "TARGET_SSE && TARGET_64BIT" + "cvtsi2ssq\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "SF")]) + (define_insn "cvtss2si" [(set (match_operand:SI 0 "register_operand" "=r") (vec_select:SI @@ -19883,7 +20457,7 @@ output_asm_insn (\"rex\", operands); output_asm_insn (\"movaps\\t{%5, %4|%4, %5}\", operands); } - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\", + (*targetm.asm_out.internal_label) (asm_out_file, \"L\", CODE_LABEL_NUMBER (operands[3])); RET; } @@ -20720,7 +21294,7 @@ (vec_merge:V4SF (match_operand:V4SF 1 "register_operand" "0") (vec_duplicate:V4SF (float_truncate:V2SF - (match_operand:V2DF 2 "register_operand" "xm"))) + (match_operand:V2DF 2 "nonimmediate_operand" "xm"))) (const_int 14)))] "TARGET_SSE2" "cvtsd2ss\t{%2, %0|%0, %2}" @@ -20732,7 +21306,7 @@ (vec_merge:V2DF (match_operand:V2DF 1 "register_operand" "0") (float_extend:V2DF (vec_select:V2SF - 
(match_operand:V4SF 2 "register_operand" "xm") + (match_operand:V4SF 2 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))) (const_int 2)))] @@ -21008,10 +21582,20 @@ (define_insn "sse2_clrti" [(set (match_operand:TI 0 "register_operand" "=x") (const_int 0))] "TARGET_SSE2" - "pxor\t{%0, %0|%0, %0}" - [(set_attr "type" "sseiadd") +{ + if (get_attr_mode (insn) == MODE_TI) + return "pxor\t%0, %0"; + else + return "xorps\t%0, %0"; +} + [(set_attr "type" "ssemov") (set_attr "memory" "none") - (set_attr "mode" "TI")]) + (set (attr "mode") + (if_then_else + (ne (symbol_ref "optimize_size") + (const_int 0)) + (const_string "V4SF") + (const_string "TI")))]) ;; MMX unsigned averages/sum of absolute differences @@ -21716,11 +22300,21 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "V2DF")]) -(define_insn "sse2_loadsd" +(define_expand "sse2_loadsd" + [(match_operand:V2DF 0 "register_operand" "") + (match_operand:DF 1 "memory_operand" "")] + "TARGET_SSE2" +{ + emit_insn (gen_sse2_loadsd_1 (operands[0], operands[1], + CONST0_RTX (V2DFmode))); + DONE; +}) + +(define_insn "sse2_loadsd_1" [(set (match_operand:V2DF 0 "register_operand" "=x") (vec_merge:V2DF - (match_operand:DF 1 "memory_operand" "m") - (vec_duplicate:DF (float:DF (const_int 0))) + (vec_duplicate:V2DF (match_operand:DF 1 "memory_operand" "m")) + (match_operand:V2DF 2 "const0_operand" "X") (const_int 1)))] "TARGET_SSE2" "movsd\t{%1, %0|%0, %1}" diff --git a/gcc/config/i386/k6.md b/gcc/config/i386/k6.md index af128bfe037..d9f6d84b022 100644 --- a/gcc/config/i386/k6.md +++ b/gcc/config/i386/k6.md @@ -71,7 +71,7 @@ ;; Load unit have two cycle latency, but we take care for it in adjust_cost (define_function_unit "k6_load" 1 0 (and (eq_attr "cpu" "k6") - (ior (eq_attr "type" "pop") + (ior (eq_attr "type" "pop,leave") (eq_attr "memory" "load,both"))) 1 1) diff --git a/gcc/config/i386/lynx-ng.h b/gcc/config/i386/lynx-ng.h index 08fa60f430c..8d41add6cee 100644 --- a/gcc/config/i386/lynx-ng.h +++ 
b/gcc/config/i386/lynx-ng.h @@ -68,10 +68,3 @@ Boston, MA 02111-1307, USA. */ #undef ASM_GENERATE_INTERNAL_LABEL #define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ sprintf ((BUF), ".%s%ld", (PREFIX), (long)(NUMBER)) - -/* This is how to output an internal numbered label where - PREFIX is the class of label and NUM is the number within the class. */ - -#undef ASM_OUTPUT_INTERNAL_LABEL -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, ".%s%d:\n", PREFIX, NUM) diff --git a/gcc/config/i386/lynx.h b/gcc/config/i386/lynx.h index 7835f2713f2..bdbfbe65f0f 100644 --- a/gcc/config/i386/lynx.h +++ b/gcc/config/i386/lynx.h @@ -69,10 +69,3 @@ Boston, MA 02111-1307, USA. */ #undef ASM_GENERATE_INTERNAL_LABEL #define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER) \ sprintf ((BUF), ".%s%ld", (PREFIX), (long)(NUMBER)) - -/* This is how to output an internal numbered label where - PREFIX is the class of label and NUM is the number within the class. */ - -#undef ASM_OUTPUT_INTERNAL_LABEL -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, ".%s%d:\n", PREFIX, NUM) diff --git a/gcc/config/i386/pentium.md b/gcc/config/i386/pentium.md index b4c5ece3678..24f8becb02a 100644 --- a/gcc/config/i386/pentium.md +++ b/gcc/config/i386/pentium.md @@ -194,7 +194,7 @@ (define_insn_reservation "pent_pop" 1 (and (eq_attr "cpu" "pentium") - (eq_attr "type" "pop")) + (eq_attr "type" "pop,leave")) "pentium-firstuv") ;; Call and branch instruction can execute in either pipe, but diff --git a/gcc/config/i386/ppro.md b/gcc/config/i386/ppro.md index 86906856469..67cc1f1e293 100644 --- a/gcc/config/i386/ppro.md +++ b/gcc/config/i386/ppro.md @@ -29,7 +29,7 @@ (define_attr "ppro_uops" "one,few,many" (cond [(eq_attr "type" "other,multi,call,callv,fpspc,str") (const_string "many") - (eq_attr "type" "icmov,fcmov,str,cld") + (eq_attr "type" "icmov,fcmov,str,cld,leave") (const_string "few") (eq_attr "type" "imov") (if_then_else (eq_attr "memory" "store,both") @@ -118,7 
+118,7 @@ (define_function_unit "ppro_p2" 1 0 (and (eq_attr "cpu" "pentiumpro") - (ior (eq_attr "type" "pop") + (ior (eq_attr "type" "pop,leave") (eq_attr "memory" "load,both"))) 3 1) diff --git a/gcc/config/i386/sco5.h b/gcc/config/i386/sco5.h index 815e45771d6..3b25b65d826 100644 --- a/gcc/config/i386/sco5.h +++ b/gcc/config/i386/sco5.h @@ -343,7 +343,7 @@ do { \ do { \ if (TARGET_ELF) \ ASM_OUTPUT_ALIGN ((FILE), 2); \ - ASM_OUTPUT_INTERNAL_LABEL((FILE),(PREFIX),(NUM)); \ + (*targetm.asm_out.internal_label)((FILE),(PREFIX),(NUM)); \ } while (0) #undef ASM_OUTPUT_IDENT @@ -354,10 +354,6 @@ do { \ #define ASM_OUTPUT_EXTERNAL_LIBCALL(FILE, FUN) \ if (TARGET_ELF) (*targetm.asm_out.globalize_label) (FILE, XSTR (FUN, 0)) -#undef ASM_OUTPUT_INTERNAL_LABEL -#define ASM_OUTPUT_INTERNAL_LABEL(FILE,PREFIX,NUM) \ - fprintf (FILE, ".%s%d:\n", PREFIX, NUM) - /* The prefix to add to user-visible assembler symbols. */ #undef USER_LABEL_PREFIX diff --git a/gcc/config/i386/t-cygwin b/gcc/config/i386/t-cygwin index 6fcb8340ddc..b4ea698cbc2 100644 --- a/gcc/config/i386/t-cygwin +++ b/gcc/config/i386/t-cygwin @@ -14,7 +14,9 @@ LIBGCC2_INCLUDES = -I$(srcdir)/../winsup/include \ -I$(srcdir)/../winsup/cygwin/include \ -I$(srcdir)/../winsup/w32api/include -winnt.o: $(srcdir)/config/i386/winnt.c $(RTL_H) $(TREE_H) $(CONFIG_H) $(TM_P_H) +winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) toplev.h $(HASHTAB_H) $(GGC_H) $(CC) -c $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $(srcdir)/config/i386/winnt.c # Don't run fixproto diff --git a/gcc/config/i386/t-interix b/gcc/config/i386/t-interix index 710de8b0881..d5fff6167b7 100644 --- a/gcc/config/i386/t-interix +++ b/gcc/config/i386/t-interix @@ -1,6 +1,7 @@ LIB1ASMSRC = i386/cygwin.asm LIB1ASMFUNCS = _chkstk -winnt.o: $(srcdir)/config/i386/winnt.c $(TM_P_H) +winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + 
$(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \ + $(TM_P_H) toplev.h $(HASHTAB_H) $(GGC_H) $(CC) -c $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $(srcdir)/config/i386/winnt.c - diff --git a/gcc/config/i386/vxi386.h b/gcc/config/i386/vxi386.h deleted file mode 100644 index ee4a74093f8..00000000000 --- a/gcc/config/i386/vxi386.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Definitions of target machine for GNU compiler. VxWorks i386 version. - Copyright (C) 1998, 2002 Free Software Foundation, Inc. - -This file is part of GNU CC. - -GNU CC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -GNU CC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GNU CC; see the file COPYING. If not, write to -the Free Software Foundation, 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. */ - -#undef TARGET_VERSION -#define TARGET_VERSION fprintf (stderr, " (80386, VxWorks BSD syntax)"); - -#define TARGET_OS_CPP_BUILTINS() \ - do \ - { \ - builtin_define ("__vxworks"); \ - builtin_assert ("system=unix"); \ - \ - if (TARGET_386) \ - builtin_define ("CPU=I80386"); \ - else if (TARGET_486) \ - builtin_define ("CPU=I80486"); \ - else if (TARGET_PENTIUM) \ - { \ - builtin_define ("CPU=PENTIUM"); \ - builtin_define ("CPU_VARIANT=PENTIUM"); \ - } \ - else if (TARGET_PENTIUMPRO) \ - { \ - builtin_define ("CPU=PENTIUM"); \ - builtin_define ("CPU_VARIANT=PENTIUMPRO"); \ - } \ - } \ - while (0) - -#define HANDLE_SYSV_PRAGMA 1 - -/* VxWorks does all the library stuff itself. 
*/ - -#undef LIB_SPEC -#define LIB_SPEC "" - -/* VxWorks uses object files, not loadable images. make linker just - combine objects. */ - -#undef LINK_SPEC -#define LINK_SPEC "-r" - -/* VxWorks provides the functionality of crt0.o and friends itself. */ - -#undef STARTFILE_SPEC -#define STARTFILE_SPEC "" - -#undef ENDFILE_SPEC -#define ENDFILE_SPEC "" diff --git a/gcc/config/i386/winnt.c b/gcc/config/i386/winnt.c index 00b3dfd0442..bc2527aaca5 100644 --- a/gcc/config/i386/winnt.c +++ b/gcc/config/i386/winnt.c @@ -22,6 +22,8 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" +#include "coretypes.h" +#include "tm.h" #include "rtl.h" #include "regs.h" #include "hard-reg-set.h" diff --git a/gcc/config/i386/xm-i386-interix.h b/gcc/config/i386/xm-i386-interix.h deleted file mode 100644 index bd010e47bc5..00000000000 --- a/gcc/config/i386/xm-i386-interix.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Configuration for GNU compiler - for an Intel i386 or later processor running Interix. - Copyright (C) 1999 Free Software Foundation, Inc. - Contributed by Donn Terry (donn@interix.com) - Derived from code by Douglas B. Rupp (drupp@cs.washington.edu) - -This file is part of GNU CC. - -GNU CC is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2, or (at your option) -any later version. - -GNU CC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with GNU CC; see the file COPYING. If not, write to -the Free Software Foundation, 59 Temple Place - Suite 330, -Boston, MA 02111-1307, USA. 
*/ - -#define HOST_BITS_PER_WIDEST_INT HOST_BITS_PER_LONGLONG -#ifdef __GNUC__ -# define HOST_WIDEST_INT long long -#else -# define HOST_WIDEST_INT __int64 -#endif -#define HOST_WIDEST_INT_PRINT_DEC "%lld" -#define HOST_WIDEST_INT_PRINT_UNSIGNED "%llu" -#define HOST_WIDEST_INT_PRINT_HEX "0x%llx" |