author     Simon Pilgrim <llvm-dev@redking.me.uk>   2021-01-28 12:11:31 +0000
committer  Tom Stellard <tstellar@redhat.com>       2021-02-03 11:26:33 -0800
commit     52a70a07e93c322ad137bce1a1ff2f1c9fdf6050
tree       46a0084c18cef1e577374f1c9a66d18922d2f68b
parent     c1899cd5102dbdacd006fdb33db075319ccc933f
[X86][AVX] canonicalizeLaneShuffleWithRepeatedOps - don't merge VPERMILPD ops with different low/high masks.
Unlike VPERMILPS, VPERMILPD can have non-repeating masks in each 128-bit subvector; we weren't accounting for this when folding vperm2f128(vpermilpd(x,c),vpermilpd(y,c)) -> vpermilpd(vperm2f128(x,y),c).
I intend to add support for this, but wanted to get a minimal fix in first for merging into 12.xx.
Fixes PR48908
(cherry picked from commit 6663330bc8c84a75ea092272297b557bfc310380)
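For illustration only (not part of the commit): a minimal C sketch of the mask semantics the message describes, assuming a compiler with AVX support (-mavx). VPERMILPD's 4-bit immediate controls each 128-bit lane independently (bits [1:0] select within the low lane, bits [3:2] within the high lane), whereas VPERMILPS's immediate applies the same control to both lanes, so the fold above is only safe when VPERMILPD's two 2-bit fields match.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* _mm256_set_pd takes arguments high-to-low: v = [0, 1 | 2, 3]. */
    __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);

    /* imm = 0b0110: the low-lane field 0b10 keeps [0,1] in place, while
       the high-lane field 0b01 swaps [2,3] -> [3,2]. A repeating
       (VPERMILPS-style) control could never express this. */
    __m256d r = _mm256_permute_pd(v, 0x6);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 1 3 2 */
    return 0;
}

Hoisting such a permute across vperm2f128 would apply the low-lane control to data that originally sat in another vector's high lane, which is why the patch bails out when the fields differ.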
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp               |  9
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll | 40
2 files changed, 30 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0dd20235aa3c..6b816c710f98 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36916,11 +36916,18 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
     return DAG.getBitcast(VT, Res);
   }
+  case X86ISD::VPERMILPI:
+    // TODO: Handle v4f64 permutes with different low/high lane masks.
+    if (SrcVT0 == MVT::v4f64) {
+      uint64_t Mask = Src0.getConstantOperandVal(1);
+      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
+        break;
+    }
+    LLVM_FALLTHROUGH;
   case X86ISD::VSHLI:
   case X86ISD::VSRLI:
   case X86ISD::VSRAI:
   case X86ISD::PSHUFD:
-  case X86ISD::VPERMILPI:
     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
       SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
       SDValue RHS =
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 3da83b25d363..1a1153d0e886 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -442,16 +442,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X86-AVX1-NEXT:    vmovapd %ymm4, (%edx)
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
@@ -513,16 +515,18 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x
 ;
 ; X64-AVX1-LABEL: PR48908:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
-; X64-AVX1-NEXT:    vmovapd %ymm4, (%rdi)
-; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
+; X64-AVX1-NEXT:    vmovapd %ymm3, (%rdi)
+; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3]
+; X64-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X64-AVX1-NEXT:    vmovapd %ymm3, (%rsi)
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X64-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
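As an aside, the added guard reduces to comparing the immediate's two 2-bit lane fields. A hypothetical standalone helper (not taken from the patch, merely restating its check) makes the predicate explicit:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper mirroring the patch's guard: a v4f64 VPERMILPI
   immediate repeats across lanes iff its low and high 2-bit fields
   match; only then may the permute be hoisted across vperm2f128. */
static bool isRepeatedLaneMask(uint64_t Mask) {
    return (Mask & 0x3) == ((Mask >> 2) & 0x3);
}

int main(void) {
    assert(isRepeatedLaneMask(0xA));  /* 0b1010: identity in both lanes */
    assert(isRepeatedLaneMask(0x5));  /* 0b0101: both lanes swapped     */
    assert(!isRepeatedLaneMask(0x6)); /* 0b0110: lanes differ -> bail   */
    return 0;
}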