diff options
-rw-r--r-- | gcc/ChangeLog | 7 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 55 | ||||
-rw-r--r-- | gcc/config/i386/x86-tune.def | 2 |
3 files changed, 63 insertions, 1 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7ff5bd22948..e0b30a373eb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2013-10-01 Wei Mi <wmi@google.com> + + * config/i386/x86-tune.def (DEF_TUNE): Remove + m_CORE_ALL. + * config/i386/i386.md: Add define_peephole2 to + break partial reg stall for cvtss2sd/cvtsd2ss. + 2013-10-01 Joern Rennecke <joern.rennecke@embecosm.com> * config/arc/arc.c (pass_arc_ifcvt::clone): diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 03b38426c4b..7368719d313 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5117,6 +5117,61 @@ emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode)); }) +;; Break partial reg stall for cvtsd2ss. + +(define_peephole2 + [(set (match_operand:SF 0 "register_operand") + (float_truncate:SF + (match_operand:DF 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun) + && SSE_REG_P (operands[0]) + && (!SSE_REG_P (operands[1]) + || REGNO (operands[0]) != REGNO (operands[1]))" + [(set (match_dup 0) + (vec_merge:V4SF + (vec_duplicate:V4SF + (float_truncate:V2SF + (match_dup 1))) + (match_dup 0) + (const_int 1)))] +{ + operands[0] = simplify_gen_subreg (V4SFmode, operands[0], + SFmode, 0); + operands[1] = simplify_gen_subreg (V2DFmode, operands[1], + DFmode, 0); + emit_move_insn (operands[0], CONST0_RTX (V4SFmode)); +}) + +;; Break partial reg stall for cvtss2sd. + +(define_peephole2 + [(set (match_operand:DF 0 "register_operand") + (float_extend:DF + (match_operand:SF 1 "nonimmediate_operand")))] + "TARGET_SSE2 && TARGET_SSE_MATH + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun) + && SSE_REG_P (operands[0]) + && (!SSE_REG_P (operands[1]) + || REGNO (operands[0]) != REGNO (operands[1]))" + [(set (match_dup 0) + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF + (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))) + (match_dup 0) + (const_int 1)))] +{ + operands[0] = simplify_gen_subreg (V2DFmode, operands[0], + DFmode, 0); + operands[1] = simplify_gen_subreg (V4SFmode, operands[1], + SFmode, 0); + emit_move_insn (operands[0], CONST0_RTX (V2DFmode)); +}) + ;; Avoid store forwarding (partial memory) stall penalty ;; by passing DImode value through XMM registers. */ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index c3cf00f53b4..6b0a593ddda 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -346,7 +346,7 @@ DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6) from FP to FP. This form of instructions avoids partial write to the destination. */ DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", - m_CORE_ALL | m_AMDFAM10 | m_GENERIC) + m_AMDFAM10 | m_GENERIC) /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from integer to FP. */ |