-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 29
-rw-r--r--  llvm/test/CodeGen/X86/setcc-lowering.ll | 11
-rw-r--r--  llvm/test/CodeGen/X86/vshift-6.ll       | 18
3 files changed, 38 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 64c91cb62c8a..fb0ec198dd45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46647,6 +46647,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // If this is SSE1 only convert to FOR to avoid scalarization.
   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
@@ -46663,7 +46664,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   SmallVector<APInt, 2> SrcPartials;
   if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
       SrcOps.size() == 1) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
     EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
     SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
@@ -46724,11 +46724,36 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Attempt to recursively combine an OR of shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    // Attempt to recursively combine an OR of shuffles.
     SDValue Op(N, 0);
     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
       return Res;
+
+    // If either operand is a constant mask, then only the elements that aren't
+    // allones are actually demanded by the other operand.
+    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
+      APInt UndefElts;
+      SmallVector<APInt> EltBits;
+      int NumElts = VT.getVectorNumElements();
+      int EltSizeInBits = VT.getScalarSizeInBits();
+      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
+        return false;
+
+      APInt DemandedElts = APInt::getZero(NumElts);
+      for (int I = 0; I != NumElts; ++I)
+        if (!EltBits[I].isAllOnes())
+          DemandedElts.setBit(I);
+
+      APInt KnownUndef, KnownZero;
+      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
+                                            KnownZero, DCI);
+    };
+    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        DCI.AddToWorklist(N);
+      return SDValue(N, 0);
+    }
   }
 
   // We should fold "masked merge" patterns when `andn` is not available.
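In words: combineOr now asks, for OR(N0, N1) where one operand is a constant build vector, which lanes of the other operand can still affect the result. A lane whose mask constant is all-ones always produces all-ones, so SimplifyDemandedVectorElts is free to rewrite or discard the instructions feeding that lane. Below is a minimal standalone C++ sketch of the DemandedElts computation (plain integers instead of APInt; the vector width, mask values, and names are made up for illustration, not taken from the patch):

// Standalone sketch of the DemandedElts computation in SimplifyUndemandedElts.
// Mirrors the APInt loop in the diff above using plain integers.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical v4i32 constant OR mask: lanes 1 and 3 are all-ones, so
  // OR(X, Mask) ignores those lanes of X entirely.
  std::array<uint32_t, 4> Mask = {0x000000FF, 0xFFFFFFFF, 0x0000FFFF,
                                  0xFFFFFFFF};

  // Demand only the lanes whose mask element is not all-ones, exactly as the
  // patch does with EltBits[I].isAllOnes() / DemandedElts.setBit(I).
  unsigned DemandedElts = 0;
  for (unsigned I = 0; I != Mask.size(); ++I)
    if (Mask[I] != 0xFFFFFFFFu)
      DemandedElts |= 1u << I;

  std::printf("DemandedElts = 0x%X\n", DemandedElts); // prints 0x5: lanes 0, 2
  return 0;
}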
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index fae9602eaaaf..f740c5925090 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -9,12 +9,11 @@
 define <8 x i16> @pr25080(<8 x i32> %a) {
 ; AVX-LABEL: pr25080:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index e66d91340164..fab8e9be7d0a 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -30,12 +30,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    psllq $56, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X86-NEXT:    movdqa %xmm2, %xmm1
-; X86-NEXT:    pandn %xmm0, %xmm1
-; X86-NEXT:    por %xmm2, %xmm1
+; X86-NEXT:    movd %eax, %xmm1
+; X86-NEXT:    psllq $56, %xmm1
+; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    psllw $5, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
@@ -65,12 +62,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X64-LABEL: do_not_crash:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movb %r9b, (%rdi)
-; X64-NEXT:    movd %r9d, %xmm0
-; X64-NEXT:    psllq $56, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X64-NEXT:    movdqa %xmm2, %xmm1
-; X64-NEXT:    pandn %xmm0, %xmm1
-; X64-NEXT:    por %xmm2, %xmm1
+; X64-NEXT:    movd %r9d, %xmm1
+; X64-NEXT:    psllq $56, %xmm1
+; X64-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    pcmpeqd %xmm2, %xmm2
 ; X64-NEXT:    psllw $5, %xmm1
 ; X64-NEXT:    pxor %xmm3, %xmm3
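Both test deltas are instances of the same demanded-elements reasoning. In pr25080 the trailing vpor constant evidently covers the lanes produced from the low 128-bit half with all-ones, so that half's AND and compare are dead and the computation narrows to the extracted high half. In do_not_crash the win is algebraic: per byte lane, (~m & x) | m == x | m, so the movdqa/pandn pair feeding the por is redundant and the whole masked merge collapses to a single por against the constant-pool mask. A small standalone C++ check of that identity (illustrative only, not part of the patch):

// Exhaustively verify (~m & x) | m == x | m over all byte values: the
// identity that lets PANDN+POR against a constant mask fold to a single POR.
#include <cstdio>

int main() {
  for (unsigned m = 0; m <= 0xFF; ++m)
    for (unsigned x = 0; x <= 0xFF; ++x)
      if (((~m & x) | m) != (x | m)) {
        std::printf("mismatch: m=%u x=%u\n", m, x);
        return 1;
      }
  std::puts("(~m & x) | m == x | m holds for all byte values");
  return 0;
}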