diff options
author | uros <uros@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-09-08 11:33:08 +0000 |
---|---|---|
committer | uros <uros@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-09-08 11:33:08 +0000 |
commit | fac183054d5b79876786617ca21984f8cbbd06da (patch) | |
tree | 876b7ca4bc50725d03f1ecf64568f646160c69a5 /gcc/config/i386/sse.md | |
parent | 4ec180c2cf5f1c95a8206063f57badd2aceffb1c (diff) | |
download | gcc-fac183054d5b79876786617ca21984f8cbbd06da.tar.gz |
PR target/33329
PR target/26449
* config/i386/sse.md (mulv4si3): Do not expand sse2 sequence.
(*sse2_mulv4si3): New define_insn_and_split pattern. Split insn in
split1 pass.
(mulv16qi3): Implement as define_insn_and_split pattern instead of
define_expand, to split insn in split1 pass.
(mulv2di3): Ditto.
testsuite/ChangeLog:
PR target/33329
PR target/26449
* gcc.target/i386/pr33329.c: New file.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@128269 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386/sse.md')
-rw-r--r-- | gcc/config/i386/sse.md | 113 |
1 file changed, 65 insertions, 48 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6779e9a805a..02964d7aa6a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2899,11 +2899,15 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) -(define_expand "mulv16qi3" +(define_insn_and_split "mulv16qi3" [(set (match_operand:V16QI 0 "register_operand" "") (mult:V16QI (match_operand:V16QI 1 "register_operand" "") (match_operand:V16QI 2 "register_operand" "")))] - "TARGET_SSE2" + "TARGET_SSE2 + && !(reload_completed || reload_in_progress)" + "#" + "&& 1" + [(const_int 0)] { rtx t[12], op0; int i; @@ -3097,50 +3101,6 @@ { if (TARGET_SSE4_1) ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands); - else - { - rtx t1, t2, t3, t4, t5, t6, thirtytwo; - rtx op0, op1, op2; - - op0 = operands[0]; - op1 = operands[1]; - op2 = operands[2]; - t1 = gen_reg_rtx (V4SImode); - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V4SImode); - t4 = gen_reg_rtx (V4SImode); - t5 = gen_reg_rtx (V4SImode); - t6 = gen_reg_rtx (V4SImode); - thirtytwo = GEN_INT (32); - - /* Multiply elements 2 and 0. */ - emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), - op1, op2)); - - /* Shift both input vectors down one element, so that elements 3 - and 1 are now in the slots for elements 2 and 0. For K8, at - least, this is faster than using a shuffle. */ - emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2), - gen_lowpart (TImode, op1), - thirtytwo)); - emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3), - gen_lowpart (TImode, op2), - thirtytwo)); - /* Multiply elements 3 and 1. */ - emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), - t2, t3)); - - /* Move the results in element 2 down to element 1; we don't care - what goes in elements 2 and 3. */ - emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - - /* Merge the parts back together. 
*/ - emit_insn (gen_sse2_punpckldq (op0, t5, t6)); - DONE; - } }) (define_insn "*sse4_1_mulv4si3" @@ -3153,11 +3113,68 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) -(define_expand "mulv2di3" +(define_insn_and_split "*sse2_mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "") + (mult:V4SI (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")))] + "TARGET_SSE2 && !TARGET_SSE4_1 + && !(reload_completed || reload_in_progress)" + "#" + "&& 1" + [(const_int 0)] +{ + rtx t1, t2, t3, t4, t5, t6, thirtytwo; + rtx op0, op1, op2; + + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + t4 = gen_reg_rtx (V4SImode); + t5 = gen_reg_rtx (V4SImode); + t6 = gen_reg_rtx (V4SImode); + thirtytwo = GEN_INT (32); + + /* Multiply elements 2 and 0. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), + op1, op2)); + + /* Shift both input vectors down one element, so that elements 3 + and 1 are now in the slots for elements 2 and 0. For K8, at + least, this is faster than using a shuffle. */ + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2), + gen_lowpart (TImode, op1), + thirtytwo)); + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3), + gen_lowpart (TImode, op2), + thirtytwo)); + /* Multiply elements 3 and 1. */ + emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), + t2, t3)); + + /* Move the results in element 2 down to element 1; we don't care + what goes in elements 2 and 3. */ + emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + + /* Merge the parts back together. 
*/ + emit_insn (gen_sse2_punpckldq (op0, t5, t6)); + DONE; +}) + +(define_insn_and_split "mulv2di3" [(set (match_operand:V2DI 0 "register_operand" "") (mult:V2DI (match_operand:V2DI 1 "register_operand" "") (match_operand:V2DI 2 "register_operand" "")))] - "TARGET_SSE2" + "TARGET_SSE2 + && !(reload_completed || reload_in_progress)" + "#" + "&& 1" + [(const_int 0)] { rtx t1, t2, t3, t4, t5, t6, thirtytwo; rtx op0, op1, op2; |