summaryrefslogtreecommitdiff
path: root/gcc/config/i386/sse.md
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/i386/sse.md')
-rw-r--r--gcc/config/i386/sse.md575
1 files changed, 549 insertions, 26 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 88822c5fb31..6ee090a6b4a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -53,7 +53,14 @@
(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
;; Mapping of vector modes back to the scalar modes
-(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")])
+(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")
+ (V16QI "QI") (V8HI "HI")
+ (V4SI "SI") (V2DI "DI")])
+
+;; Number of scalar elements in each vector type
+(define_mode_attr ssescalarnum [(V4SF "4") (V2DF "2")
+ (V16QI "16") (V8HI "8")
+ (V4SI "4") (V2DI "2")])
;; Mapping of immediate bits for blend instructions
(define_mode_attr blendbits [(V4SF "15") (V2DF "3")])
@@ -3154,7 +3161,7 @@
;; We don't have a straight 32-bit parallel multiply on SSE5, so fake it with a
;; multiply/add. In general, we expect the define_split to occur before
;; register allocation, so we have to handle the corner case where the target
-;; is used as the base or index register in operands 1/2.
+;; is the same as one of the inputs.
(define_insn_and_split "*sse5_mulv4si3"
[(set (match_operand:V4SI 0 "register_operand" "=&x")
(mult:V4SI (match_operand:V4SI 1 "register_operand" "%x")
@@ -3242,6 +3249,42 @@
rtx t1, t2, t3, t4, t5, t6, thirtytwo;
rtx op0, op1, op2;
+ if (TARGET_SSE5)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H */
+ op0 = operands[0];
+ op1 = gen_lowpart (V4SImode, operands[1]);
+ op2 = gen_lowpart (V4SImode, operands[2]);
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V4SImode);
+ t4 = gen_reg_rtx (V2DImode);
+ t5 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: 0 */
+ emit_move_insn (t2, CONST0_RTX (V4SImode));
+
+ /* t3: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_sse5_pmacsdd (t3, t1, op2, t2));
+
+ /* t4: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_sse5_phadddq (t4, t3));
+
+ /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t5, t4, GEN_INT (32)));
+
+ /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+ emit_insn (gen_sse5_pmacsdql (op0, op1, op2, t5));
+ DONE;
+ }
+
op0 = operands[0];
op1 = operands[1];
op2 = operands[2];
@@ -3357,6 +3400,57 @@
DONE;
})
+(define_expand "vec_widen_smult_hi_v4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx t1, t2;
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse5_mulv2div2di3_high (operands[0], t1, t2));
+ DONE;
+})
+
+(define_expand "vec_widen_smult_lo_v4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx t1, t2;
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse5_mulv2div2di3_low (operands[0], t1, t2));
+ DONE;
+ DONE;
+})
+
(define_expand "vec_widen_umult_hi_v4si"
[(match_operand:V2DI 0 "register_operand" "")
(match_operand:V4SI 1 "register_operand" "")
@@ -3893,6 +3987,12 @@
{
rtx op1, op2, h1, l1, h2, l2, h3, l3;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V16QImode, operands[1]);
op2 = gen_lowpart (V16QImode, operands[2]);
h1 = gen_reg_rtx (V16QImode);
@@ -3928,6 +4028,12 @@
{
rtx op1, op2, h1, l1, h2, l2;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V8HImode, operands[1]);
op2 = gen_lowpart (V8HImode, operands[2]);
h1 = gen_reg_rtx (V8HImode);
@@ -3957,6 +4063,12 @@
{
rtx op1, op2, h1, l1;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V4SImode, operands[1]);
op2 = gen_lowpart (V4SImode, operands[2]);
h1 = gen_reg_rtx (V4SImode);
@@ -7024,6 +7136,87 @@
[(set_attr "type" "ssemuladd")
(set_attr "mode" "TI")])
+(define_insn_and_split "*sse5_pmacsdql_mem"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x")
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_operand:V2DI 3 "memory_operand" "m,m,m")))]
+ "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_dup 0)))])
+
+;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so
+;; fake it with a multiply/add. In general, we expect the define_split to
+;; occur before register allocation, so we have to handle the corner case where
+;; the target is the same as operands 1/2
+(define_insn_and_split "sse5_mulv2div2di3_low"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x")
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+ (parallel [(const_int 1)
+ (const_int 3)])))))]
+ "TARGET_SSE5"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_dup 0)))]
+{
+ operands[3] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "TI")])
+
(define_insn "sse5_pmacsdqh"
[(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
(plus:V2DI
@@ -7047,6 +7240,87 @@
[(set_attr "type" "ssemuladd")
(set_attr "mode" "TI")])
+(define_insn_and_split "*sse5_pmacsdqh_mem"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x")
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_operand:V2DI 3 "memory_operand" "m,m,m")))]
+ "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_dup 0)))])
+
+;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so
+;; fake it with a multiply/add. In general, we expect the define_split to
+;; occur before register allocation, so we have to handle the corner case where
+;; the target is the same as either operands[1] or operands[2]
+(define_insn_and_split "sse5_mulv2div2di3_high"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x")
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+ (parallel [(const_int 0)
+ (const_int 2)])))))]
+ "TARGET_SSE5"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_dup 0)))]
+{
+ operands[3] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "TI")])
+
;; SSE5 parallel integer multiply/add instructions for the intrinisics
(define_insn "sse5_pmacsswd"
[(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
@@ -7190,19 +7464,17 @@
;; SSE5 parallel XMM conditional moves
(define_insn "sse5_pcmov_<mode>"
- [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x,x,x")
+ [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x")
(if_then_else:SSEMODE
- (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x,0,0")
- (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0,C,x")
- (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm,x,C")))]
+ (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x")
+ (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0")
+ (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm")))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
"@
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
- pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
- andps\t{%2, %0|%0, %2}
- andnps\t{%1, %0|%0, %1}"
+ pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
[(set_attr "type" "sse4arg")])
;; SSE5 horizontal add/subtract instructions
@@ -7801,7 +8073,71 @@
(set_attr "mode" "<MODE>")])
;; SSE5 packed rotate instructions
-(define_insn "rotl<mode>3"
+(define_expand "rotl<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "")
+ (rotate:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_SSE5"
+{
+ /* If we were given a scalar, convert it to parallel */
+ if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (<ssescalarnum>);
+ rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != <ssescalarmode>mode)
+ {
+ op2 = gen_reg_rtx (<ssescalarmode>mode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < <ssescalarnum>; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_init<mode> (reg, par));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], reg));
+ DONE;
+ }
+})
+
+(define_expand "rotr<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "")
+ (rotatert:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_SSE5"
+{
+ /* If we were given a scalar, convert it to parallel */
+ if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (<ssescalarnum>);
+ rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != <ssescalarmode>mode)
+ {
+ op2 = gen_reg_rtx (<ssescalarmode>mode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < <ssescalarnum>; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_init<mode> (reg, par));
+ emit_insn (gen_neg<mode>2 (neg, reg));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+ }
+})
+
+(define_insn "sse5_rotl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
(rotate:SSEMODE1248
(match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm")
@@ -7811,26 +8147,106 @@
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
-(define_insn "sse5_rotl<mode>3"
+(define_insn "sse5_rotr<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+ (rotatert:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm")
+ (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
+ "TARGET_SSE5"
+{
+ operands[3] = GEN_INT ((<ssescalarnum> * 8) - INTVAL (operands[2]));
+ return \"prot<ssevecsize>\t{%3, %1, %0|%0, %1, %3}\";
+}
+ [(set_attr "type" "sseishft")
+ (set_attr "mode" "TI")])
+
+(define_expand "vrotr<mode>3"
+ [(match_operand:SSEMODE1248 0 "register_operand" "")
+ (match_operand:SSEMODE1248 1 "register_operand" "")
+ (match_operand:SSEMODE1248 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (reg, operands[2]));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "vrotl<mode>3"
+ [(match_operand:SSEMODE1248 0 "register_operand" "")
+ (match_operand:SSEMODE1248 1 "register_operand" "")
+ (match_operand:SSEMODE1248 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_insn "sse5_vrotl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (rotate:SSEMODE1248
- (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (rotate:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (rotatert:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"prot<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
-;; SSE5 packed shift instructions. Note negative values for the shift amount
-;; convert this into a right shift instead of left shift. For now, model this
-;; with an UNSPEC instead of using ashift/lshift since the rest of the x86 does
-;; not have the concept of negating the shift amount. Also, there is no LSHIFT
+;; SSE5 packed shift instructions.
+;; FIXME: add V2DI back in
+(define_expand "vlshr<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_sse5_lshl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+})
+
+(define_expand "vashr<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_sse5_ashl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+})
+
+(define_expand "vashl<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ emit_insn (gen_sse5_ashl<mode>3 (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
(define_insn "sse5_ashl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (unspec:SSEMODE1248
- [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
- UNSPEC_SSE5_ASHIFT))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (ashift:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (ashiftrt:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"psha<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
@@ -7838,15 +8254,122 @@
(define_insn "sse5_lshl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (unspec:SSEMODE1248
- [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
- UNSPEC_SSE5_LSHIFT))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (ashift:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (lshiftrt:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"pshl<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
+;; SSE2 doesn't have some shift varients, so define versions for SSE5
+(define_expand "ashlv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = operands[2];
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "lshlv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = operands[2];
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+ emit_insn (gen_sse5_lshlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "ashrv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ rtx ele = ((GET_CODE (operands[2]) == CONST_INT)
+ ? GEN_INT (- INTVAL (operands[2]))
+ : operands[2]);
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = ele;
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+
+ if (GET_CODE (operands[2]) != CONST_INT)
+ {
+ rtx neg = gen_reg_rtx (V16QImode);
+ emit_insn (gen_negv16qi2 (neg, reg));
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], neg));
+ }
+ else
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg));
+
+ DONE;
+})
+
+(define_expand "ashrv2di3"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V2DI 1 "register_operand" "")
+ (match_operand:DI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (2);
+ rtx par = gen_rtx_PARALLEL (V2DImode, vs);
+ rtx reg = gen_reg_rtx (V2DImode);
+ rtx ele;
+
+ if (GET_CODE (operands[2]) == CONST_INT)
+ ele = GEN_INT (- INTVAL (operands[2]));
+ else if (GET_MODE (operands[2]) != DImode)
+ {
+ rtx move = gen_reg_rtx (DImode);
+ ele = gen_reg_rtx (DImode);
+ convert_move (move, operands[2], false);
+ emit_insn (gen_negdi2 (ele, move));
+ }
+ else
+ {
+ ele = gen_reg_rtx (DImode);
+ emit_insn (gen_negdi2 (ele, operands[2]));
+ }
+
+ RTVEC_ELT (vs, 0) = ele;
+ RTVEC_ELT (vs, 1) = ele;
+ emit_insn (gen_vec_initv2di (reg, par));
+ emit_insn (gen_sse5_ashlv2di3 (operands[0], operands[1], reg));
+ DONE;
+})
+
;; SSE5 FRCZ support
;; parallel insns
(define_insn "sse5_frcz<mode>2"