summaryrefslogtreecommitdiff
path: root/gcc/config/i386/sse.md
diff options
context:
space:
mode:
authoruros <uros@138bc75d-0d04-0410-961f-82ee72b054a4>2007-09-08 11:33:08 +0000
committeruros <uros@138bc75d-0d04-0410-961f-82ee72b054a4>2007-09-08 11:33:08 +0000
commitfac183054d5b79876786617ca21984f8cbbd06da (patch)
tree876b7ca4bc50725d03f1ecf64568f646160c69a5 /gcc/config/i386/sse.md
parent4ec180c2cf5f1c95a8206063f57badd2aceffb1c (diff)
downloadgcc-fac183054d5b79876786617ca21984f8cbbd06da.tar.gz
PR target/33329
PR target/26449
	* config/i386/sse.md (mulv4si3): Do not expand sse2 sequence.
	(*sse2_mulv4si3): New define_insn_and_split pattern.  Split insn in
	split1 pass.
	(mulv16qi3): Implement as define_insn_and_split pattern instead of
	define_expand, to split insn in split1 pass.
	(mulv2di3): Ditto.

testsuite/ChangeLog:

	PR target/33329
	PR target/26449
	* gcc.target/i386/pr33329.c: New file.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@128269 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386/sse.md')
-rw-r--r--gcc/config/i386/sse.md113
1 file changed, 65 insertions, 48 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6779e9a805a..02964d7aa6a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2899,11 +2899,15 @@
(set_attr "prefix_data16" "1")
(set_attr "mode" "TI")])
-(define_expand "mulv16qi3"
+(define_insn_and_split "mulv16qi3"
[(set (match_operand:V16QI 0 "register_operand" "")
(mult:V16QI (match_operand:V16QI 1 "register_operand" "")
(match_operand:V16QI 2 "register_operand" "")))]
- "TARGET_SSE2"
+ "TARGET_SSE2
+ && !(reload_completed || reload_in_progress)"
+ "#"
+ "&& 1"
+ [(const_int 0)]
{
rtx t[12], op0;
int i;
@@ -3097,50 +3101,6 @@
{
if (TARGET_SSE4_1)
ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);
- else
- {
- rtx t1, t2, t3, t4, t5, t6, thirtytwo;
- rtx op0, op1, op2;
-
- op0 = operands[0];
- op1 = operands[1];
- op2 = operands[2];
- t1 = gen_reg_rtx (V4SImode);
- t2 = gen_reg_rtx (V4SImode);
- t3 = gen_reg_rtx (V4SImode);
- t4 = gen_reg_rtx (V4SImode);
- t5 = gen_reg_rtx (V4SImode);
- t6 = gen_reg_rtx (V4SImode);
- thirtytwo = GEN_INT (32);
-
- /* Multiply elements 2 and 0. */
- emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
- op1, op2));
-
- /* Shift both input vectors down one element, so that elements 3
- and 1 are now in the slots for elements 2 and 0. For K8, at
- least, this is faster than using a shuffle. */
- emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
- gen_lowpart (TImode, op1),
- thirtytwo));
- emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
- gen_lowpart (TImode, op2),
- thirtytwo));
- /* Multiply elements 3 and 1. */
- emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
- t2, t3));
-
- /* Move the results in element 2 down to element 1; we don't care
- what goes in elements 2 and 3. */
- emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
- emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
-
- /* Merge the parts back together. */
- emit_insn (gen_sse2_punpckldq (op0, t5, t6));
- DONE;
- }
})
(define_insn "*sse4_1_mulv4si3"
@@ -3153,11 +3113,68 @@
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
-(define_expand "mulv2di3"
+(define_insn_and_split "*sse2_mulv4si3"
+ [(set (match_operand:V4SI 0 "register_operand" "")
+ (mult:V4SI (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")))]
+ "TARGET_SSE2 && !TARGET_SSE4_1
+ && !(reload_completed || reload_in_progress)"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx t1, t2, t3, t4, t5, t6, thirtytwo;
+ rtx op0, op1, op2;
+
+ op0 = operands[0];
+ op1 = operands[1];
+ op2 = operands[2];
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V4SImode);
+ t4 = gen_reg_rtx (V4SImode);
+ t5 = gen_reg_rtx (V4SImode);
+ t6 = gen_reg_rtx (V4SImode);
+ thirtytwo = GEN_INT (32);
+
+ /* Multiply elements 2 and 0. */
+ emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
+ op1, op2));
+
+ /* Shift both input vectors down one element, so that elements 3
+ and 1 are now in the slots for elements 2 and 0. For K8, at
+ least, this is faster than using a shuffle. */
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
+ gen_lowpart (TImode, op1),
+ thirtytwo));
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
+ gen_lowpart (TImode, op2),
+ thirtytwo));
+ /* Multiply elements 3 and 1. */
+ emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
+ t2, t3));
+
+ /* Move the results in element 2 down to element 1; we don't care
+ what goes in elements 2 and 3. */
+ emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+
+ /* Merge the parts back together. */
+ emit_insn (gen_sse2_punpckldq (op0, t5, t6));
+ DONE;
+})
+
+(define_insn_and_split "mulv2di3"
[(set (match_operand:V2DI 0 "register_operand" "")
(mult:V2DI (match_operand:V2DI 1 "register_operand" "")
(match_operand:V2DI 2 "register_operand" "")))]
- "TARGET_SSE2"
+ "TARGET_SSE2
+ && !(reload_completed || reload_in_progress)"
+ "#"
+ "&& 1"
+ [(const_int 0)]
{
rtx t1, t2, t3, t4, t5, t6, thirtytwo;
rtx op0, op1, op2;