diff options
Diffstat (limited to 'lib/Target/X86/X86InstrSSE.td')
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 104 |
1 files changed, 49 insertions, 55 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 451303054f56..955a40ee171e 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1514,13 +1514,12 @@ let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f32 (fpround FR64:$src)), - (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, + (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), @@ -1574,20 +1573,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RR>, - XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f64 (fpextend FR32:$src)), - (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>; + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; @@ -1899,7 +1896,7 @@ let Predicates = [HasAVX, NoVLX] in { (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), (VCVTTPD2DQrm addr:$src)>; } -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX] def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -3095,7 +3092,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, Intrinsic Intr, SDNode OpNode, Domain d, - OpndItins itins, string Suffix> { + OpndItins itins, Predicate target, string Suffix> { let hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3126,21 +3123,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, // vrcpss mem, %xmm0, %xmm0 // TODO: In theory, we could fold the load, and avoid the stall caused by // the partial register store, either in ExecutionDepsFix or with smarter RA. - let Predicates = [UseAVX] in { + let Predicates = [target] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - } - let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } - let Predicates = [HasAVX, OptForSize] in { + let Predicates = [target, OptForSize] in { def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; - } - let Predicates = [UseAVX, OptForSize] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; @@ -3186,7 +3179,7 @@ let Predicates = prds in { /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), @@ -3220,41 +3213,41 @@ let Predicates = [HasAVX] in { } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, SSEPackedSingle, itins, UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, f32mem, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, - SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG, - NotMemoryFoldable; + SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V, + VEX_LIG, VEX_WIG, NotMemoryFoldable; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, f64mem, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, SSEPackedDouble, itins, "SD">, + OpNode, SSEPackedDouble, itins, AVXTarget, "SD">, XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; } // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; -defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -7692,22 +7685,24 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (Int VR128:$src))]>, + [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, - Sched<[WriteCvtF2FLd]>; + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (X86cvtph2ps (bc_v8i16 + (loadv2i64 addr:$src))))]>, + T8PD, VEX, Sched<[WriteCvtF2FLd]>; } -multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in @@ -7717,32 +7712,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { TAPD, VEX; } -let Predicates = [HasF16C] in { - defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; - defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; - defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; - defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; +let Predicates = [HasF16C, NoVLX] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (bitconvert - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), - addr:$dst), - (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. |