summary | refs | log | tree | commit | diff
path: root/gcc
diff options
context:
space:
mode:
author rth <rth@138bc75d-0d04-0410-961f-82ee72b054a4> 2005-05-17 08:15:41 +0000
committer rth <rth@138bc75d-0d04-0410-961f-82ee72b054a4> 2005-05-17 08:15:41 +0000
commit 4b26818b361c5c3d36e89609785435ab41c28658 (patch)
tree f0184ce23c030c7fb89609b4a44adfbe78828469 /gcc
parent 52a1613211dcae4cd9627441b53e2a039d4a44a4 (diff)
download gcc-4b26818b361c5c3d36e89609785435ab41c28658.tar.gz
* config/i386/sse.md (mulv16qi3, mulv2di3): New.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@99824 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog 4
-rw-r--r-- gcc/config/i386/sse.md 88
-rw-r--r-- gcc/testsuite/gcc.dg/vect/vect-100.c 28
3 files changed, 120 insertions, 0 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index dc64af228ba..82233e9ee9b 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,7 @@
+2005-05-17 Richard Henderson <rth@redhat.com>
+
+ * config/i386/sse.md (mulv16qi3, mulv2di3): New.
+
2005-05-17 Jakub Jelinek <jakub@redhat.com>
PR middle-end/21492
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 76efe5f0586..5ff94ba9814 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2392,6 +2392,50 @@
[(set_attr "type" "sseiadd")
(set_attr "mode" "TI")])
+(define_expand "mulv16qi3"
+ [(set (match_operand:V16QI 0 "register_operand" "")
+ (mult:V16QI (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:V16QI 2 "register_operand" "")))]
+ "TARGET_SSE2"
+{
+ rtx t[12], op0;
+ int i;
+
+ for (i = 0; i < 12; ++i)
+ t[i] = gen_reg_rtx (V16QImode);
+
+ /* The byte multiply is done as two V8HI (word) multiplies followed by
+ byte interleaves that gather the low byte of every word product.
+
+ Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word:
+ the low byte of a product depends only on the low bytes of its
+ factors. Rather than trying to get zero in there, most convenient
+ is to let it be a copy of the low byte, which is what interleaving
+ an operand with itself produces. */
+ emit_insn (gen_sse2_punpckhbw (t[0], operands[1], operands[1]));
+ emit_insn (gen_sse2_punpckhbw (t[1], operands[2], operands[2]));
+ emit_insn (gen_sse2_punpcklbw (t[2], operands[1], operands[1]));
+ emit_insn (gen_sse2_punpcklbw (t[3], operands[2], operands[2]));
+
+ /* Multiply words. The end-of-line annotations here give a picture of what
+ the output of that instruction looks like. Dot means don't care; the
+ letters are the bytes of the result with A being the most significant.
+ t[4] receives the products of the high eight source bytes, t[5] those
+ of the low eight. */
+ emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[4]), /* .A.B.C.D.E.F.G.H */
+ gen_lowpart (V8HImode, t[0]),
+ gen_lowpart (V8HImode, t[1])));
+ emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[5]), /* .I.J.K.L.M.N.O.P */
+ gen_lowpart (V8HImode, t[2]),
+ gen_lowpart (V8HImode, t[3])));
+
+ /* Extract the relevant bytes and merge them back together. Each pair
+ of interleaves doubles the length of the contiguous runs of wanted
+ bytes (1, then 2, 4 and 8; see the annotations), so after three
+ rounds they fill the low halves of t[10] and t[11]. The final
+ interleave of those two low halves yields all sixteen in order. */
+ emit_insn (gen_sse2_punpckhbw (t[6], t[5], t[4])); /* ..AI..BJ..CK..DL */
+ emit_insn (gen_sse2_punpcklbw (t[7], t[5], t[4])); /* ..EM..FN..GO..HP */
+ emit_insn (gen_sse2_punpckhbw (t[8], t[7], t[6])); /* ....AEIM....BFJN */
+ emit_insn (gen_sse2_punpcklbw (t[9], t[7], t[6])); /* ....CGKO....DHLP */
+ emit_insn (gen_sse2_punpckhbw (t[10], t[9], t[8])); /* ........ACEGIKMO */
+ emit_insn (gen_sse2_punpcklbw (t[11], t[9], t[8])); /* ........BDFHJLNP */
+
+ op0 = operands[0];
+ emit_insn (gen_sse2_punpcklbw (op0, t[11], t[10])); /* ABCDEFGHIJKLMNOP */
+ DONE;
+})
+
(define_expand "mulv8hi3"
[(set (match_operand:V8HI 0 "register_operand" "")
(mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "")
@@ -2536,6 +2580,50 @@
DONE;
})
+(define_expand "mulv2di3"
+ [(set (match_operand:V2DI 0 "register_operand" "")
+ (mult:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "")
+ (match_operand:V2DI 2 "nonimmediate_operand" "")))]
+ "TARGET_SSE2"
+{
+ rtx t1, t2, t3, t4, t5, t6, thirtytwo;
+ rtx op0, op1, op2;
+
+ op0 = operands[0];
+ op1 = operands[1];
+ op2 = operands[2];
+ t1 = gen_reg_rtx (V2DImode);
+ t2 = gen_reg_rtx (V2DImode);
+ t3 = gen_reg_rtx (V2DImode);
+ t4 = gen_reg_rtx (V2DImode);
+ t5 = gen_reg_rtx (V2DImode);
+ t6 = gen_reg_rtx (V2DImode);
+ thirtytwo = GEN_INT (32);
+
+ /* Each 64-bit product is assembled from three partial products of the
+ 32-bit halves of the inputs: low*low, plus the two cross products
+ (low*high and high*low) shifted up by 32 bits. The partial products
+ come from sse2_umulv2siv2di3 which, going by its name, is an
+ unsigned V2SI x V2SI -> V2DI widening multiply.
+
+ Multiply low parts. */
+ emit_insn (gen_sse2_umulv2siv2di3 (t1, gen_lowpart (V4SImode, op1),
+ gen_lowpart (V4SImode, op2)));
+
+ /* Shift input vectors right 32 bits so we can multiply high parts:
+ the logical right shift moves the high half of each 64-bit element
+ into its low half.
+ NOTE(review): op1/op2 are only required to be nonimmediate_operand;
+ confirm that lshrv2di3 accepts a memory source. */
+ emit_insn (gen_lshrv2di3 (t2, op1, thirtytwo));
+ emit_insn (gen_lshrv2di3 (t3, op2, thirtytwo));
+
+ /* Multiply high parts by low parts: t4 pairs op1 with the shifted-down
+ op2 (t3), t5 pairs op2 with the shifted-down op1 (t2). */
+ emit_insn (gen_sse2_umulv2siv2di3 (t4, gen_lowpart (V4SImode, op1),
+ gen_lowpart (V4SImode, t3)));
+ emit_insn (gen_sse2_umulv2siv2di3 (t5, gen_lowpart (V4SImode, op2),
+ gen_lowpart (V4SImode, t2)));
+
+ /* Shift them back, i.e. move each cross product left by 32 bits. t4
+ and t5 are used as both source and destination here. */
+ emit_insn (gen_ashlv2di3 (t4, t4, thirtytwo));
+ emit_insn (gen_ashlv2di3 (t5, t5, thirtytwo));
+
+ /* Add the three parts together. The high*high product is never
+ computed: it would only contribute at bit 64 and above, outside the
+ 64-bit result elements. */
+ emit_insn (gen_addv2di3 (t6, t1, t4));
+ emit_insn (gen_addv2di3 (op0, t6, t5));
+ DONE;
+})
+
(define_insn "ashr<mode>3"
[(set (match_operand:SSEMODE24 0 "register_operand" "=x")
(ashiftrt:SSEMODE24
diff --git a/gcc/testsuite/gcc.dg/vect/vect-100.c b/gcc/testsuite/gcc.dg/vect/vect-100.c
new file mode 100644
index 00000000000..3b803fc71b6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-100.c
@@ -0,0 +1,28 @@
+/* Assuming we can vectorize char multiplication, here's an execute test. */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+extern void abort (void);
+void foo()
+{ /* Squares every value 0..255 as unsigned chars; aborts on a wrong result. */
+ static unsigned char A[256], B[256], C[256];
+ int i;
+
+ for (i = 0; i < 256; ++i) /* Fill both inputs with the values 0..255. */
+ A[i] = B[i] = i;
+
+ for (i = 0; i < 256; ++i) /* Loop under test: element-wise char multiply. */
+ C[i] = A[i] * B[i];
+
+ for (i = 0; i < 256; ++i) /* Check against the product truncated to a byte. */
+ if (C[i] != (unsigned char)(i * i))
+ abort ();
+}
+
+int main()
+{
+ check_vect (); /* From tree-vect.h; presumably verifies the target can run vector code -- confirm. */
+ foo(); /* Aborts on failure, so reaching the return means the test passed. */
+ return 0;
+}