Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 305
1 files changed, 240 insertions, 65 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b178ad6c13e7..22b4d7997fa6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -380,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // Special handling for half-precision floating point conversions.
   // If we don't have F16C support, then lower half float conversions
   // into library calls.
-  if (Subtarget.useSoftFloat() ||
-      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
   }
@@ -4998,6 +4997,8 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   switch (Opcode) {
   default:
     return false;
+  case X86ISD::TESTM:
+  case X86ISD::TESTNM:
   case X86ISD::PCMPEQM:
   case X86ISD::PCMPGTM:
   case X86ISD::CMPM:
@@ -6746,6 +6747,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");
 
+  BitVector UndefElements;
+  SDValue Ld = BVOp->getSplatValue(&UndefElements);
+
   // Attempt to use VBROADCASTM
   // From this paterrn:
   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
@@ -6753,17 +6757,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
   //
   // Create (VBROADCASTM v2i1 X)
   if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
-    MVT EltType;
-    unsigned NumElts;
+    MVT EltType = VT.getScalarType();
+    unsigned NumElts = VT.getVectorNumElements();
+    SDValue BOperand;
     SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
-    if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) {
-      SDValue BOperand = ZeroExtended.getOperand(0);
+    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+      if (ZeroExtended)
+        BOperand = ZeroExtended.getOperand(0);
+      else
+        BOperand = Ld.getOperand(0).getOperand(0);
       if (BOperand.getValueType().isVector() &&
           BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
-        if ((EltType == MVT::i64 &&
-             VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q
-            (EltType == MVT::i32 &&
-             VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d
+        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+                                     NumElts == 8)) || // for broadcastmb2q
+            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+                                     NumElts == 16))) { // for broadcastmw2d
           SDValue Brdcst =
               DAG.getNode(X86ISD::VBROADCASTM, dl,
                           MVT::getVectorVT(EltType, NumElts), BOperand);
@@ -6773,9 +6783,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
     }
   }
 
-  BitVector UndefElements;
-  SDValue Ld = BVOp->getSplatValue(&UndefElements);
-
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -7707,6 +7714,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// reasoned to be a permutation of a vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+//               (extract_elt V, (extract_elt I, 1)),
+//                    ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
+// when no native operation available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  // Look for VPERMV and PSHUFB opportunities.
+  MVT VT = V.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v16i8:
+    if (!Subtarget.hasSSE3())
+      return SDValue();
+    break;
+  case MVT::v8f32:
+  case MVT::v8i32:
+    if (!Subtarget.hasAVX2())
+      return SDValue();
+    break;
+  case MVT::v4i64:
+  case MVT::v4f64:
+    if (!Subtarget.hasVLX())
+      return SDValue();
+    break;
+  case MVT::v16f32:
+  case MVT::v8f64:
+  case MVT::v16i32:
+  case MVT::v8i64:
+    if (!Subtarget.hasAVX512())
+      return SDValue();
+    break;
+  case MVT::v32i16:
+    if (!Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+      return SDValue();
+    break;
+  case MVT::v64i8:
+    if (!Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  case MVT::v32i8:
+    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+      return SDValue();
+    break;
+  }
+  SDValue SrcVec, IndicesVec;
+  // Check for a match of the permute source vector and permute index elements.
+  // This is done by checking that the i-th build_vector operand is of the form:
+  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+    SDValue Op = V.getOperand(Idx);
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract encountered in V, set the source vector,
+    // otherwise verify the extract is from the previously defined source
+    // vector.
+    if (!SrcVec)
+      SrcVec = Op.getOperand(0);
+    else if (SrcVec != Op.getOperand(0))
+      return SDValue();
+    SDValue ExtractedIndex = Op->getOperand(1);
+    // Peek through extends.
+    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+      ExtractedIndex = ExtractedIndex.getOperand(0);
+    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // If this is the first extract from the index vector candidate, set the
+    // indices vector, otherwise verify the extract is from the previously
+    // defined indices vector.
+    if (!IndicesVec)
+      IndicesVec = ExtractedIndex.getOperand(0);
+    else if (IndicesVec != ExtractedIndex.getOperand(0))
+      return SDValue();
+
+    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+    if (!PermIdx || PermIdx->getZExtValue() != Idx)
+      return SDValue();
+  }
+  MVT IndicesVT = VT;
+  if (VT.isFloatingPoint())
+    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+                                 VT.getVectorNumElements());
+  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+  return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+                     SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -7922,6 +8034,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
+  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+    return V;
+
   // See if we can use a vector load to get all of the elements.
   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
@@ -10716,10 +10831,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // If we have direct support for blends, we should lower by decomposing into
   // a permute. That will be faster than the domain cross.
@@ -11016,10 +11137,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
-  if (Subtarget.hasSSSE3())
+  if (Subtarget.hasSSSE3()) {
+    if (Subtarget.hasVLX())
+      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Rotate;
+
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
+  }
 
   // Assume that a single SHUFPS is faster than an alternative sequence of
   // multiple instructions (even if the CPU has a domain penalty).
@@ -12372,6 +12499,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
       }
     }
+
+    // Try to use SHUF128 if possible.
+    if (Subtarget.hasVLX()) {
+      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+                            ((WidenedMask[1] % 2) << 1);
+        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                           DAG.getConstant(PermMask, DL, MVT::i8));
+      }
+    }
   }
 
   // Otherwise form a 128-bit permutation. After accounting for undefs,
@@ -13697,10 +13834,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  if (SDValue Shuf128 =
-          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
-    return Shuf128;
-
   if (V2.isUndef()) {
     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
     // can use lower latency instructions that will operate on all four
@@ -13722,6 +13855,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
   }
 
+  if (SDValue Shuf128 =
+          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+    return Shuf128;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
@@ -17333,6 +17470,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   if (Swap)
     std::swap(Op0, Op1);
 
+  // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
+  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+    SDValue A = peekThroughBitcasts(Op0);
+    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+        ISD::isBuildVectorAllZeros(Op1.getNode())) {
+      MVT VT0 = Op0.getSimpleValueType();
+      SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+      SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+                         dl, VT, RHS, LHS);
+    }
+  }
+
   if (Opc)
     return DAG.getNode(Opc, dl, VT, Op0, Op1);
   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
@@ -19838,10 +19989,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         PassThru = Src1;
 
-      SDValue Rnd = Op.getOperand(5);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+                                                  Op.getValueType(), Src1, Src2,
+                                                  Src3, Rnd),
+                                      Mask, PassThru, Subtarget, DAG);
+      }
+
       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                               Op.getValueType(), Src1, Src2,
-                                              Src3, Rnd),
+                                              Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
     case IFMA_OP_MASKZ:
@@ -24786,9 +24946,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FMAXC: return "X86ISD::FMAXC";
   case X86ISD::FMINC: return "X86ISD::FMINC";
   case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
-  case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
   case X86ISD::FRCP: return "X86ISD::FRCP";
-  case X86ISD::FRCPS: return "X86ISD::FRCPS";
   case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
   case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
   case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -24942,10 +25100,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
   case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
   case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+  case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
+  case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
+  case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
+  case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
   case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
   case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
   case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
   case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+  case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
+  case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
+  case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
+  case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
   case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
   case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
   case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
@@ -24966,9 +25132,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SELECT: return "X86ISD::SELECT";
   case X86ISD::SELECTS: return "X86ISD::SELECTS";
   case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+  case X86ISD::RCP14: return "X86ISD::RCP14";
+  case X86ISD::RCP14S: return "X86ISD::RCP14S";
   case X86ISD::RCP28: return "X86ISD::RCP28";
   case X86ISD::RCP28S: return "X86ISD::RCP28S";
   case X86ISD::EXP2: return "X86ISD::EXP2";
+  case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
+  case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
   case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
   case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
   case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
@@ -25006,6 +25176,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
   case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+  case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
   case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
   case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
   case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
@@ -30314,10 +30485,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
-  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
     return NewOp;
 
-  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+  // TODO - Remove this once we can handle the implicit zero-extension of
+  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+  // combineBasicSADPattern.
+  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
@@ -30464,16 +30642,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           const X86Subtarget &Subtarget) {
-  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
 /// If a vector select has an operand that is -1 or 0, try to simplify the
 /// select to a bitwise logic operation.
 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
@@ -30674,26 +30842,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
 
   unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
-  case X86ISD::PALIGNR:
-    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
-    if (!VT.is128BitVector())
-      return false;
-    Opcode = X86ISD::VALIGN;
-    LLVM_FALLTHROUGH;
-  case X86ISD::VALIGN: {
-    if (EltVT != MVT::i32 && EltVT != MVT::i64)
-      return false;
-    uint64_t Imm = Op.getConstantOperandVal(2);
-    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
-    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
-    unsigned EltSize = EltVT.getSizeInBits();
-    // Make sure we can represent the same shift with the new VT.
-    if ((ShiftAmt % EltSize) != 0)
-      return false;
-    Imm = ShiftAmt / EltSize;
-    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
-                                    DAG.getConstant(Imm, DL, MVT::i8));
-  }
   case X86ISD::SHUF128: {
     if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
       return false;
@@ -34441,8 +34589,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 }
 
-/// This function transforms vector truncation of 'extended sign-bits' values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncation of 'extended sign-bits' or
+/// 'extended zero-bits' values.
+/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
@@ -34475,10 +34624,19 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
   unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
-  if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
-    return SDValue();
+  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+  // Use PACKUS if the input has zero-bits that extend all the way to the
+  // packed/truncated value. e.g. masks, zext_in_reg, etc.
+  KnownBits Known;
+  DAG.computeKnownBits(In, Known);
+  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
 
-  return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+  return SDValue();
 }
 
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -34507,7 +34665,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
-  // Try to truncate extended sign bits with PACKSS.
+  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
     return V;
 
@@ -35341,9 +35499,11 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
 
   // Do not convert the passthru input of scalar intrinsics.
   // FIXME: We could allow negations of the lower element only.
-  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
   bool NegB = invertIfNegative(B);
-  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
 
   // Negative multiplication when NegA xor NegB
   bool NegMul = (NegA != NegB);
@@ -35371,6 +35531,20 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
     }
+  } else if (N->getOpcode() == X86ISD::FMADDS1) {
+    switch (NewOpcode) {
+    case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+    }
+  } else if (N->getOpcode() == X86ISD::FMADDS3) {
+    switch (NewOpcode) {
+    case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
+    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
+    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+    }
   } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
     switch (NewOpcode) {
     case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
@@ -36590,10 +36764,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::EXTRACT_VECTOR_ELT:
-    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case X86ISD::PEXTRW:
   case X86ISD::PEXTRB:
-    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
   case ISD::INSERT_SUBVECTOR:
     return combineInsertSubvector(N, DAG, DCI, Subtarget);
   case ISD::EXTRACT_SUBVECTOR:
@@ -36689,6 +36862,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMADD_RND:
   case X86ISD::FMADDS1_RND:
   case X86ISD::FMADDS3_RND:
+  case X86ISD::FMADDS1:
+  case X86ISD::FMADDS3:
   case ISD::FMA: return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
   case ISD::MSCATTER: return combineGatherScatter(N, DAG);
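
Note (illustration only, not part of the patch): the (vpermv I, V) node produced by the new LowerBUILD_VECTORAsVariablePermute maps to the hardware variable permutes (e.g. VPERMD for v8i32; the v16i8 case is emitted as PSHUFB instead). The stand-alone C++ sketch below models only the value a v8i32 variable permute computes; the helper name is ours, and the result is the same as the matched chain of (extract_elt V, (extract_elt I, i)) BUILD_VECTOR operands.

// Reference model only: the per-lane semantics of a v8i32 variable permute.
#include <array>
#include <cstdint>

using v8i32 = std::array<int32_t, 8>;

// Each output lane i selects a source lane by the corresponding index element
// (taken modulo the lane count), matching the per-operand pattern
//   (extract_elt Src, (extract_elt Indices, i))
// that LowerBUILD_VECTORAsVariablePermute recognizes.
static v8i32 vpermd_model(const v8i32 &Indices, const v8i32 &Src) {
  v8i32 Out{};
  for (unsigned i = 0; i != 8; ++i)
    Out[i] = Src[Indices[i] & 7];
  return Out;
}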