summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Pranav Kant <prka@google.com> 2023-05-10 00:38:59 +0000
committer: Pranav Kant <prka@google.com> 2023-05-17 17:17:28 +0000
commit: 726785b1594c6b567c5c8ddd59075aee726590c6 (patch)
tree: 48c9eca38451b48b24702d9d81edff763c727665
parent: 782a16db4db5bc9c145fbe27c8c652c0d4cb49d7 (diff)
download: llvm-726785b1594c6b567c5c8ddd59075aee726590c6.tar.gz
[AArch64] Sink operands for faster bitselect vector instructions
Differential Revision: https://reviews.llvm.org/D150237
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 37
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 52
2 files changed, 89 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5dbe20cc5afb..04b27b8019f6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14336,6 +14336,43 @@ bool AArch64TargetLowering::shouldSinkOperands(
return true;
}
+ case Instruction::Or: {
+ // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
+ // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
+ if (Subtarget->hasNEON()) {
+ Instruction *OtherAnd, *IA, *IB;
+ Value *MaskValue;
+ // MainAnd refers to And instruction that has 'Not' as one of its operands
+ if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
+ m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
+ m_Instruction(IA)))))) {
+ if (match(OtherAnd,
+ m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
+ Instruction *MainAnd = I->getOperand(0) == OtherAnd
+ ? cast<Instruction>(I->getOperand(1))
+ : cast<Instruction>(I->getOperand(0));
+
+ // Both Ands should be in same basic block as Or
+ if (I->getParent() != MainAnd->getParent() ||
+ I->getParent() != OtherAnd->getParent())
+ return false;
+
+ // Non-mask operands of both Ands should also be in same basic block
+ if (I->getParent() != IA->getParent() ||
+ I->getParent() != IB->getParent())
+ return false;
+
+ Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
case Instruction::Mul: {
int NumZExts = 0, NumSExts = 0;
for (auto &Op : I->operands()) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index f29ea22ff8db..4cfc879526dd 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -144,3 +144,55 @@ define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
%or = or <16 x i8> %and, %and1
ret <16 x i8> %or
}
+
+define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32> %mask, i32 %scratch) {
+; CHECK-LABEL: test_bit_sink_operand:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: cinc w9, w0, lt
+; CHECK-NEXT: asr w9, w9, #1
+; CHECK-NEXT: .LBB11_1: // %do.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
+; CHECK-NEXT: add x10, sp, #16
+; CHECK-NEXT: bfi x10, x8, #2, #2
+; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: bfi x11, x8, #2, #2
+; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: cmp w8, #5
+; CHECK-NEXT: str q1, [sp, #16]
+; CHECK-NEXT: str w0, [x10]
+; CHECK-NEXT: ldr q1, [sp, #16]
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str w9, [x11]
+; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: b.ne .LBB11_1
+; CHECK-NEXT: // %bb.2: // %do.end
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+
+entry:
+ %0 = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %div = sdiv i32 %scratch, 2
+ br label %do.body
+
+do.body:
+ %dst.addr.0 = phi <4 x i32> [ %dst, %entry ], [ %vecins, %do.body ]
+ %src.addr.0 = phi <4 x i32> [ %src, %entry ], [ %vecins1, %do.body ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+ %vbsl3.i = and <4 x i32> %src.addr.0, %mask
+ %vbsl4.i = and <4 x i32> %dst.addr.0, %0
+ %vbsl5.i = or <4 x i32> %vbsl3.i, %vbsl4.i
+ %vecins = insertelement <4 x i32> %vbsl5.i, i32 %scratch, i32 %i.0
+ %vecins1 = insertelement <4 x i32> %src.addr.0, i32 %div, i32 %i.0
+ %inc = add nuw nsw i32 %i.0, 1
+ %exitcond.not = icmp eq i32 %inc, 5
+ br i1 %exitcond.not, label %do.end, label %do.body
+
+do.end:
+ ret <4 x i32> %vecins
+}