summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Han Zhu <zhuhan7737@gmail.com> 2021-03-29 23:35:10 -0700
committer Han Zhu <zhuhan7737@gmail.com> 2021-03-29 23:35:35 -0700
commit 2bd4049ceb82b116cc4d78c650c6a5af70e37ed0 (patch)
tree b9503b5bce66ad8c2fed6f5abaf7c54c539385f9
parent cef167f8d467201ec40a7817397c18fc264d0b54 (diff)
download llvm-2bd4049ceb82b116cc4d78c650c6a5af70e37ed0.tar.gz
Revert "[loop-idiom] Hoist loop memcpys to loop preheader"
This reverts commit deb5095833a834e0ef5f784138da53e66febff05. Bad commit message.
-rw-r--r-- llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp 200
-rw-r--r-- llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll 2
-rw-r--r-- llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll 309
-rw-r--r-- llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll 2
4 files changed, 45 insertions, 468 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 9fabbd0441a2..596caf58c55f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -205,13 +205,6 @@ private:
enum class ForMemset { No, Yes };
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
ForMemset For);
-
- template <typename MemInst>
- bool processLoopMemIntrinsic(
- BasicBlock *BB,
- bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
- const SCEV *BECount);
- bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
@@ -221,13 +214,6 @@ private:
const SCEVAddRecExpr *Ev, const SCEV *BECount,
bool NegStride, bool IsLoopMemset = false);
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
- bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
- unsigned StoreSize, MaybeAlign StoreAlign,
- MaybeAlign LoadAlign, Instruction *TheStore,
- Instruction *TheLoad,
- const SCEVAddRecExpr *StoreEv,
- const SCEVAddRecExpr *LoadEv,
- const SCEV *BECount);
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
bool IsLoopMemset = false);
@@ -642,10 +628,22 @@ bool LoopIdiomRecognize::runOnLoopBlock(
for (auto &SI : StoreRefsForMemcpy)
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
- MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
- BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
- MadeChange |= processLoopMemIntrinsic<MemSetInst>(
- BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
return MadeChange;
}
@@ -794,80 +792,6 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
return Changed;
}
-/// processLoopMemIntrinsic - Template function for calling different processor
-/// functions based on mem instrinsic type.
-template <typename MemInst>
-bool LoopIdiomRecognize::processLoopMemIntrinsic(
- BasicBlock *BB,
- bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
- const SCEV *BECount) {
- bool MadeChange = false;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
- // Look for memory instructions, which may be optimized to a larger one.
- if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
- WeakTrackingVH InstPtr(&*I);
- if (!(this->*Processor)(MI, BECount))
- continue;
- MadeChange = true;
-
- // If processing the instruction invalidated our iterator, start over from
- // the top of the block.
- if (!InstPtr)
- I = BB->begin();
- continue;
- }
- }
- return MadeChange;
-}
-
-/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
-bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
- const SCEV *BECount) {
- // We can only handle non-volatile memcpys with a constant size.
- if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
- return false;
-
- // If we're not allowed to hack on memcpy, we fail.
- if (!HasMemcpy || DisableLIRP::Memcpy)
- return false;
-
- Value *Dest = MCI->getDest();
- Value *Source = MCI->getSource();
- if (!Dest || !Source)
- return false;
-
- // See if the load and store pointer expressions are AddRec like {base,+,1} on
- // the current loop, which indicates a strided load and store. If we have
- // something else, it's a random load or store we can't handle.
- const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
- if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return false;
- const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
- if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return false;
-
- // Reject memcpys that are so large that they overflow an unsigned.
- uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
- if ((SizeInBytes >> 32) != 0)
- return false;
-
- // Check if the stride matches the size of the memcpy. If so, then we know
- // that every byte is touched in the loop.
- const SCEVConstant *ConstStride =
- dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
- if (!ConstStride)
- return false;
-
- APInt Stride = ConstStride->getAPInt();
- if (SizeInBytes != Stride && SizeInBytes != -Stride)
- return false;
-
- return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
- MCI->getDestAlign(), MCI->getSourceAlign(),
- MCI, MCI, StoreEv, LoadEv, BECount);
-}
-
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {
@@ -876,7 +800,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
return false;
// If we're not allowed to hack on memset, we fail.
- if (!HasMemset || DisableLIRP::Memset)
+ if (!HasMemset)
return false;
Value *Pointer = MSI->getDest();
@@ -1116,11 +1040,9 @@ bool LoopIdiomRecognize::processLoopStridedStore(
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
NewCall->getDebugLoc(), Preheader)
- << "Transformed loop-strided store in "
- << ore::NV("Function", TheStore->getFunction())
- << " function into a call to "
+ << "Transformed loop-strided store into a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() intrinsic";
+ << "() function";
});
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1146,25 +1068,20 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ bool NegStride = StoreSize == -Stride;
// The store must be feeding a non-volatile load.
LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
const SCEVAddRecExpr *LoadEv =
cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
- Value *LoadPtr = LI->getPointerOperand();
- return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
- SI->getAlign(), LI->getAlign(), SI, LI,
- StoreEv, LoadEv, BECount);
-}
-bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
- Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
- MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
- const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
- const SCEV *BECount) {
// The trip count of the loop and the base pointer of the addrec SCEV is
// guaranteed to be loop invariant, which means that it should dominate the
// header. This allows us to insert code for it in the preheader.
@@ -1176,12 +1093,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
- unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
+ unsigned StrAS = SI->getPointerAddressSpace();
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
- APInt Stride = getStoreStride(StoreEv);
- bool NegStride = StoreSize == -Stride;
-
// Handle negative strided loops.
if (NegStride)
StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
@@ -1205,26 +1119,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Changed = true;
SmallPtrSet<Instruction *, 1> Stores;
- Stores.insert(TheStore);
-
- bool IsMemCpy = isa<MemCpyInst>(TheStore);
- const std::string InstRemark = IsMemCpy ? "memcpy" : "load and store";
-
+ Stores.insert(SI);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
- TheStore)
- << ore::NV("Inst", InstRemark) << " in "
- << ore::NV("Function", TheStore->getFunction())
- << " function will not be hoisted: "
- << ore::NV("Reason", "The loop may access store location");
- });
+ StoreSize, *AA, Stores))
return Changed;
- }
const SCEV *LdStart = LoadEv->getStart();
- unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
+ unsigned LdAS = LI->getPointerAddressSpace();
// Handle negative strided loops.
if (NegStride)
@@ -1235,21 +1136,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
- // If the store is a memcpy instruction, we must check if it will write to
- // the load memory locations. So remove it from the ignored stores.
- if (IsMemCpy)
- Stores.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
- << ore::NV("Inst", InstRemark) << " in "
- << ore::NV("Function", TheStore->getFunction())
- << " function will not be hoisted: "
- << ore::NV("Reason", "The loop may access load location");
- });
+ StoreSize, *AA, Stores))
return Changed;
- }
if (avoidLIRForMultiBlockLoop())
return Changed;
@@ -1266,15 +1155,15 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// Check whether to generate an unordered atomic memcpy:
// If the load or store are atomic, then they must necessarily be unordered
// by previous checks.
- if (!TheStore->isAtomic() && !TheLoad->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
- LoadAlign, NumBytes);
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
+ LI->getAlign(), NumBytes);
else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
- assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
- "Expect unordered load/store to have align.");
- if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
+ const Align StoreAlign = SI->getAlign();
+ const Align LoadAlign = LI->getAlign();
+ if (StoreAlign < StoreSize || LoadAlign < StoreSize)
return Changed;
// If the element.atomic memcpy is not lowered into explicit
@@ -1288,10 +1177,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// Note that unordered atomic loads/stores are *required* by the spec to
// have an alignment but non-atomic loads/stores may not.
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
- NumBytes, StoreSize);
+ StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
+ StoreSize);
}
- NewCall->setDebugLoc(TheStore->getDebugLoc());
+ NewCall->setDebugLoc(SI->getDebugLoc());
if (MSSAU) {
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
@@ -1300,9 +1189,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
}
LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *TheLoad
- << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *TheStore
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
<< "\n");
ORE.emit([&]() {
@@ -1310,16 +1198,14 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() intrinsic from " << ore::NV("Inst", InstRemark)
- << " instruction in " << ore::NV("Function", TheStore->getFunction())
- << " function";
+ << "() function";
});
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
if (MSSAU)
- MSSAU->removeMemoryAccess(TheStore, true);
- deleteDeadInstruction(TheStore);
+ MSSAU->removeMemoryAccess(SI, true);
+ deleteDeadInstruction(SI);
if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemCpy;
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
index 6f817f2b56d8..3578540cc4d2 100644
--- a/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
; Check that everything still works when debuginfo is present, and that it is reasonably propagated.
-; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function
+; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function
define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test6_dest_align(
diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
deleted file mode 100644
index bb0d68ed2042..000000000000
--- a/llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll
+++ /dev/null
@@ -1,309 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-idiom < %s -S | FileCheck %s
-
-%struct.S = type { i32, i32, i8 }
-
-; unsigned copy_noalias(S* __restrict a, S *b, int n) {
-; for (int i = 0; i < n; i++) {
-; a[i] = b[i];
-; }
-; return sizeof(a[0]);
-; }
-
-; Function Attrs: nofree nounwind uwtable mustprogress
-define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
-; CHECK-LABEL: @copy_noalias(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.S* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.S* [[B:%.*]] to i8*
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A1]], i8* align 4 [[B2]], i64 [[TMP1]], i1 false)
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret i32 12
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-;
-entry:
- %cmp7 = icmp sgt i32 %n, 0
- br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 12
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %idxprom = zext i32 %i.08 to i64
- %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
- %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
- %0 = bitcast %struct.S* %arrayidx2 to i8*
- %1 = bitcast %struct.S* %arrayidx to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
- %inc = add nuw nsw i32 %i.08, 1
- %cmp = icmp slt i32 %inc, %n
- br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-; unsigned copy_may_alias(S *a, S *b, int n) {
-; for (int i = 0; i < n; i++) {
-; a[i] = b[i];
-; }
-; return sizeof(a[0]);
-; }
-
-; Function Attrs: nofree nounwind uwtable mustprogress
-define dso_local i32 @copy_may_alias(%struct.S* nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
-; CHECK-LABEL: @copy_may_alias(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret i32 12
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-;
-entry:
- %cmp7 = icmp sgt i32 %n, 0
- br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 12
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %idxprom = zext i32 %i.08 to i64
- %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
- %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
- %0 = bitcast %struct.S* %arrayidx2 to i8*
- %1 = bitcast %struct.S* %arrayidx to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
- %inc = add nuw nsw i32 %i.08, 1
- %cmp = icmp slt i32 %inc, %n
- br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-%struct.R = type <{ i8, i32, i8 }>
-
-; void copy_noalias_read(S* __restrict x, S* __restrict y, int n, int &s) {
-; for (int i = 0; i < n; i++) {
-; x[i] = y[i];
-; s += y[i].b;
-; }
-; }
-
-; Function Attrs: nofree nounwind uwtable mustprogress
-define dso_local void @copy_noalias_read(%struct.R* noalias nocapture %x, %struct.R* noalias nocapture readonly %y, i32 %n, i32* nocapture nonnull align 4 dereferenceable(4) %s) local_unnamed_addr #0 {
-; CHECK-LABEL: @copy_noalias_read(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[X1:%.*]] = bitcast %struct.R* [[X:%.*]] to i8*
-; CHECK-NEXT: [[Y2:%.*]] = bitcast %struct.R* [[Y:%.*]] to i8*
-; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[S_PROMOTED:%.*]] = load i32, i32* [[S:%.*]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 6
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[X1]], i8* align 1 [[Y2]], i64 [[TMP1]], i1 false)
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.for.cond.cleanup_crit_edge:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[S]], align 4
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[ADD13:%.*]] = phi i32 [ [[S_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_012]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_R:%.*]], %struct.R* [[X]], i64 [[IDXPROM]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 0
-; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 1
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD13]], [[TMP4]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]]
-;
-entry:
- %cmp11 = icmp sgt i32 %n, 0
- br i1 %cmp11, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph: ; preds = %entry
- %s.promoted = load i32, i32* %s, align 4
- br label %for.body
-
-for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
- %add.lcssa = phi i32 [ %add, %for.body ]
- store i32 %add.lcssa, i32* %s, align 4
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
- ret void
-
-for.body: ; preds = %for.body.lr.ph, %for.body
- %add13 = phi i32 [ %s.promoted, %for.body.lr.ph ], [ %add, %for.body ]
- %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
- %idxprom = zext i32 %i.012 to i64
- %0 = getelementptr inbounds %struct.R, %struct.R* %x, i64 %idxprom, i32 0
- %1 = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 0
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(6) %0, i8* nonnull align 1 dereferenceable(6) %1, i64 6, i1 false)
- %b = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 1
- %2 = load i32, i32* %b, align 1
- %add = add nsw i32 %add13, %2
- %inc = add nuw nsw i32 %i.012, 1
- %cmp = icmp slt i32 %inc, %n
- br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
-}
-
-%struct.SPacked = type <{ i32, i32, i8 }>
-
-; Function Attrs: nofree nounwind uwtable mustprogress
-define dso_local i32 @copy_noalias_packed(%struct.SPacked* noalias nocapture %a, %struct.SPacked* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
-; CHECK-LABEL: @copy_noalias_packed(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SPacked* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SPacked* [[B:%.*]] to i8*
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 9
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[A1]], i8* align 1 [[B2]], i64 [[TMP1]], i1 false)
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret i32 9
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SPACKED:%.*]], %struct.SPacked* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SPACKED]], %struct.SPacked* [[A]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX2]] to i8*
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX]] to i8*
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-;
-entry:
- %cmp7 = icmp sgt i32 %n, 0
- br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 9
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %idxprom = zext i32 %i.08 to i64
- %arrayidx = getelementptr inbounds %struct.SPacked, %struct.SPacked* %b, i64 %idxprom
- %arrayidx2 = getelementptr inbounds %struct.SPacked, %struct.SPacked* %a, i64 %idxprom
- %0 = bitcast %struct.SPacked* %arrayidx2 to i8*
- %1 = bitcast %struct.SPacked* %arrayidx to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(9) %0, i8* nonnull align 1 dereferenceable(9) %1, i64 9, i1 false)
- %inc = add nuw nsw i32 %i.08, 1
- %cmp = icmp slt i32 %inc, %n
- br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-%struct.SAligned = type { i32, i32, i8, [7 x i8] }
-
-define dso_local i32 @copy_noalias_aligned(%struct.SAligned* noalias nocapture %a, %struct.SAligned* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
-; CHECK-LABEL: @copy_noalias_aligned(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SAligned* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SAligned* [[B:%.*]] to i8*
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A1]], i8* align 16 [[B2]], i64 [[TMP1]], i1 false)
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret i32 16
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED:%.*]], %struct.SAligned* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED]], %struct.SAligned* [[A]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX2]] to i8*
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX]] to i8*
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-;
-entry:
- %cmp7 = icmp sgt i32 %n, 0
- br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret i32 16
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %idxprom = zext i32 %i.08 to i64
- %arrayidx = getelementptr inbounds %struct.SAligned, %struct.SAligned* %b, i64 %idxprom
- %arrayidx2 = getelementptr inbounds %struct.SAligned, %struct.SAligned* %a, i64 %idxprom
- %0 = bitcast %struct.SAligned* %arrayidx2 to i8*
- %1 = bitcast %struct.SAligned* %arrayidx to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) %0, i8* nonnull align 16 dereferenceable(16) %1, i64 16, i1 false)
- %inc = add nuw nsw i32 %i.08, 1
- %cmp = icmp slt i32 %inc, %n
- br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
-}
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
diff --git a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
index b7a866f446c7..06e17fecec6d 100644
--- a/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
; *begin = value;
; }
-; CHECK: remark: <stdin>:4:1: Transformed loop-strided store in _Z15my_basic_memsetPcS_c function into a call to llvm.memset.p0i8.i64() intrinsic
+; CHECK: remark: <stdin>:4:1: Transformed loop-strided store into a call to llvm.memset.p0i8.i64() function
define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) {
; CHECK-LABEL: @_Z15my_basic_memsetPcS_c(