author    Roman Lebedev <lebedev.ri@gmail.com>  2021-10-27 19:13:49 +0300
committer Roman Lebedev <lebedev.ri@gmail.com>  2021-10-27 19:45:55 +0300
commit    156f10c840a07034a6bd638b5912054100365741 (patch)
tree      694118dd41eaefea154051c458c8306b9b2a27e8
parent    35c3f5610c969091359a92f31e5a2745f5d49739 (diff)
[IR] `SCEVExpander::generateOverflowCheck()`: short-circuit `umul_with_overflow`-by-one
It's a no-op: with a multiplier of one, no overflow can ever happen: https://alive2.llvm.org/ce/z/Zw89rZ

While generally I don't like such hacks, we have a very good reason to do this: here we are expanding a run-time correctness check for the vectorization, and said `umul_with_overflow` will not be optimized out before we query the cost of the checks we've generated. That means the cost of the run-time checks would be artificially inflated, and after https://reviews.llvm.org/D109368 that will affect the minimal trip count for which these checks are even evaluated. And if they aren't even evaluated, then the vectorized code certainly won't be run.

We could consider doing this in IRBuilder instead, but then we'd also need to teach `CreateExtractValue()` to look through a chain of `insertvalue`s, and I'm not sure there's precedent for that.

Refs. https://reviews.llvm.org/D109368#3089809
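For illustration, here is a minimal sketch of the net effect on the expanded check when the step is one; %btc and %start are placeholder names, the concrete shapes are in the test diffs below. Before:

  %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %btc)
  %mul.result = extractvalue { i32, i1 } %mul, 0
  %mul.overflow = extractvalue { i32, i1 } %mul, 1
  %add = add i32 %start, %mul.result
  %cmp = icmp slt i32 %add, %start
  %ovf = or i1 %cmp, %mul.overflow
  br i1 %ovf, label %scalar.ph, label %vector.ph

After:

  %add = add i32 %start, %btc
  %cmp = icmp slt i32 %add, %start
  br i1 %cmp, label %scalar.ph, label %vector.ph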
 llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp                         |  19
 llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll |  40
 llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll          |  92
 llvm/test/Transforms/LoopVectorize/pr45259.ll                                 |  30
 llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll          |  58
 llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll      | 124
 6 files changed, 163 insertions(+), 200 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 252eea660dc1..3abd1890a4ab 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2494,13 +2494,22 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
  // Get the backedge taken count and truncate or extended to the AR type.
  Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
-  auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
-                                         Intrinsic::umul_with_overflow, Ty);
  // Compute |Step| * Backedge
-  CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
-  Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
-  Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+  Value *MulV, *OfMul;
+  if (Step->isOne()) {
+    // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
+    // needed, there is never an overflow, so to avoid artificially inflating
+    // the cost of the check, directly emit the optimized IR.
+    MulV = TruncTripCount;
+    OfMul = ConstantInt::getFalse(MulV->getContext());
+  } else {
+    auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+                                           Intrinsic::umul_with_overflow, Ty);
+    CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+    MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+    OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+  }
  // Compute:
  //   Start + |Step| * Backedge < Start
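Note that `OfMul` becomes a constant `i1 false` in the short-circuited path, yet no dead `or` survives in the updated checks below. That works because, as far as I can tell, `IRBuilder::CreateOr` special-cases a null constant RHS ("LHS | 0 -> LHS") and returns the LHS unchanged instead of emitting an instruction. A minimal standalone C++ sketch of that interaction; the `combineOverflowFlag` helper is illustrative, not part of this patch:

  // Sketch, assuming stock llvm::IRBuilder folding behavior. With
  // OfMul == ConstantInt::getFalse(Ctx), CreateOr returns Cmp itself, so the
  // "or i1 ..., %mul.overflow" seen in the old CHECK lines simply disappears.
  #include "llvm/IR/IRBuilder.h"

  llvm::Value *combineOverflowFlag(llvm::IRBuilder<> &B, llvm::Value *Cmp,
                                   llvm::Value *OfMul) {
    return B.CreateOr(Cmp, OfMul); // no instruction emitted when OfMul is false
  }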
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 2b199450c46c..e12411c77a04 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -62,39 +62,35 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP0]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[ADD_US]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
+; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
; CHECK-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
; CHECK-NEXT: store i32 [[TMP22]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
; CHECK-NEXT: store i32 [[TMP23]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3
-; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
index 4c07fce65e76..8cf46b49ed11 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
@@ -645,41 +645,37 @@ define void @sink_dominance(i32* %ptr, i32 %N) {
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP10]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[TMP12]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP12]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP9]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[TMP11]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
@@ -736,43 +732,39 @@ define void @sink_dominance_2(i32* %ptr, i32 %N) {
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP10]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP11]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> [[TMP13]], <i32 99, i32 99, i32 99, i32 99>
-; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[TMP12]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP12]], <4 x i32> [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP9]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP11]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP13:%.*]] = mul <4 x i32> [[TMP12]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[TMP11]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP11]], <4 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll
index 560a073cd146..3575c2129b59 100644
--- a/llvm/test/Transforms/LoopVectorize/pr45259.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll
@@ -24,17 +24,13 @@ define i8 @widget(i8* %arr, i8 %t9) {
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[T1_0_LCSSA1]], -1
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[ARR2]]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i8
-; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP5]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add i8 1, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 1, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i8 1, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 1, [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i8 [[TMP6]], 1
; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP4]], 255
; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
@@ -45,18 +41,18 @@ define i8 @widget(i8* %arr, i8 %t9) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp slt <4 x i8> [[TMP13]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i8>
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP17]], <4 x i8>* [[TMP19]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i8> [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i8> [[TMP12]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP16:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP16]], <4 x i8>* [[TMP18]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
index 10b61cc9f3e7..f9e28b31baab 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll
@@ -7,49 +7,31 @@ define void @test(float* %A, i32 %x) {
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[X:%.*]], 1
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 undef)
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[IDENT_CHECK]], [[TMP4]]
-; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 undef)
-; CHECK-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 1, [[MUL_RESULT2]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 1, [[MUL_RESULT2]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i32 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP6]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW3]]
-; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP12]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[X]]
-; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[X]]
-; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP26]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], [[X]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[X]]
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index fef9b10ba156..e8b4c2983460 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -20,24 +20,20 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP10]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP10]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP10]]
; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -49,20 +45,20 @@ define void @load_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = urem i32 [[TMP12]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP20]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = urem i32 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP15]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP19]], align 4, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -113,24 +109,20 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP10]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP10]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP10]]
; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
@@ -142,20 +134,20 @@ define void @store_clamped_index(i32* %A, i32* %B, i32 %N) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = urem i32 [[TMP12]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP20]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = urem i32 [[TMP11]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP15]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP19]], align 4, !alias.scope !11
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -285,35 +277,31 @@ define void @clamped_index_equal_dependence(i32* %A, i32* %B, i32 %N) {
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i2
-; CHECK-NEXT: [[MUL:%.*]] = call { i2, i1 } @llvm.umul.with.overflow.i2(i2 1, i2 [[TMP1]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i2, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i2, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i2 0, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i2 0, [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i2 [[TMP3]], 0
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i2 [[TMP2]], 0
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP0]], 3
; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = urem i32 [[TMP9]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP14]], <2 x i32>* [[TMP15]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = urem i32 [[TMP8]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]