29 files changed, 2884 insertions, 396 deletions
diff --git a/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
new file mode 100644
index 000000000000..d1d854d8f457
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split-or-phi.ll
@@ -0,0 +1,339 @@
+; RUN: opt < %s -callsite-splitting -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+;CHECK-LABEL: @test_eq_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp ne i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_eq_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_eq_eq_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_eq_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_eq_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_ne_ne_untaken
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* null, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* nonnull %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_ne_ne_untaken(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp ne i32* %a, null
+  br i1 %tobool1, label %TBB, label %Tail
+
+TBB:
+  %cmp = icmp ne i32 %v, 1
+  br i1 %cmp, label %End, label %Tail
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_const_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_const_phi(i32* %a, i32* %b, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi
+;CHECK-LABEL: Tail.predBB1.split:
+;CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 1)
+;CHECK-LABEL: Tail.predBB2.split:
+;CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 %v, i32 2)
+;CHECK-LABEL: Tail
+;CHECK: %p = phi i32 [ 1, %Tail.predBB1.split ], [ 2, %Tail.predBB2.split ]
+;CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ]
+;CHECK: ret i32 %[[MERGED]]
+define i32 @test_nonconst_nonconst_phi(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, %v2 
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_nonconst_nonconst_phi_noncost
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_nonconst_nonconst_phi_noncost(i32* %a, i32* %b, i32 %v, i32 %v2) {
+Header:
+  %tobool1 = icmp eq i32* %a, %b
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, %v2 
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[%v,%Header], [%v2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_fisrtnonphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_fisrtnonphi(i32* %a, i32 %v) {
+Header:
+  %tobool1 = icmp eq i32* %a, null
+  br i1 %tobool1, label %Tail, label %TBB
+
+TBB:
+  %cmp = icmp eq i32 %v, 1
+  br i1 %cmp, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  store i32 %v, i32* %a
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_3preds_constphi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_3preds_constphi(i32* %a, i32 %v, i1 %c1, i1 %c2, i1 %c3) {
+Header:
+  br i1 %c1, label %Tail, label %TBB1
+
+TBB1:
+  br i1 %c2, label %Tail, label %TBB2
+
+TBB2:
+  br i1 %c3, label %Tail, label %End
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB1], [3, %TBB2]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+;CHECK-LABEL: @test_indirectbr_phi
+;CHECK-NOT: Tail.predBB1.split:
+;CHECK-NOT: Tail.predBB2.split:
+;CHECK-LABEL: Tail:
+;CHECK: %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+;CHECK: ret i32 %r
+define i32 @test_indirectbr_phi(i8* %address, i32* %a, i32* %b, i32 %v) {
+Header:
+   %indirect.goto.dest = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+   indirectbr i8* %indirect.goto.dest, [label %TBB, label %Tail]
+
+TBB:
+  %indirect.goto.dest2 = select i1 undef, i8* blockaddress(@test_indirectbr_phi, %End), i8* %address
+  indirectbr i8* %indirect.goto.dest2, [label %Tail, label %End]
+
+Tail:
+  %p = phi i32[1,%Header], [2, %TBB]
+  %r = call i32 @callee(i32* %a, i32 %v, i32 %p)
+  ret i32 %r
+
+End:
+  ret i32 %v
+}
+
+define i32 @callee(i32* %a, i32 %v, i32 %p) {
+entry:
+  %c = icmp ne i32* %a, null
+  br i1 %c, label %BB1, label %BB2
+
+BB1:
+  call void @dummy(i32* %a, i32 %p)
+  br label %End
+
+BB2:
+  call void @dummy2(i32 %v, i32 %p)
+  br label %End
+
+End:
+  ret i32 %p
+}
+
+declare void @dummy(i32*, i32)
+declare void @dummy2(i32, i32)
diff --git a/test/Transforms/CallSiteSplitting/callsite-split.ll b/test/Transforms/CallSiteSplitting/callsite-split.ll
new file mode 100644
index 000000000000..419fa738563c
--- /dev/null
+++ b/test/Transforms/CallSiteSplitting/callsite-split.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -callsite-splitting -inline -instcombine -jump-threading -S | FileCheck %s
+; RUN: opt < %s  -passes='function(callsite-splitting),cgscc(inline),function(instcombine,jump-threading)' -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linaro-linux-gnueabi"
+
+%struct.bitmap = type { i32, %struct.bitmap* }
+
+;CHECK-LABEL: @caller
+;CHECK-LABEL: NextCond:
+;CHECK: br {{.*}} label %callee.exit
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @callee(%struct.bitmap* null, %struct.bitmap* null, %struct.bitmap* %b_elt, i1 false)
+;CHECK-LABEL: callee.exit:
+;CHECK: call void @dummy2(%struct.bitmap* %a_elt)
+
+define void @caller(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, null
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, null
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %p = phi i1 [0, %Top], [%c, %NextCond]
+  call void @callee(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %p)
+  br label %End
+
+End:
+  ret void
+}
+
+define void @callee(%struct.bitmap* %dst_elt, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, i1 %c) {
+entry:
+  %tobool = icmp ne %struct.bitmap* %a_elt, null
+  %tobool1 = icmp ne %struct.bitmap* %b_elt, null
+  %or.cond = and i1 %tobool, %tobool1
+  br i1 %or.cond, label %Cond, label %Big
+
+Cond:
+  %cmp = icmp eq %struct.bitmap*  %dst_elt, %a_elt
+  br i1 %cmp, label %Small, label %Big
+
+Small:
+  call void @dummy2(%struct.bitmap* %a_elt)
+  br label %End
+
+Big:
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  call void @dummy1(%struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt, %struct.bitmap* %a_elt)
+  br label %End
+
+End:
+  ret void
+}
+
+declare void @dummy2(%struct.bitmap*)
+declare void @dummy1(%struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*, %struct.bitmap*)
+
+
+;CHECK-LABEL: @caller2
+;CHECK-LABEL: CallSiteBB.predBB1.split:
+;CHECK: call void @dummy4()
+;CHECK-LABEL: CallSiteBB.predBB2.split:
+;CHECK: call void @dummy3()
+;CheCK-LABEL: CallSiteBB:
+;CHECK: %phi.call = phi i1 [ false, %CallSiteBB.predBB1.split ], [ true, %CallSiteBB.predBB2.split ]
+;CHECK: call void @foo(i1 %phi.call)
+define void @caller2(i1 %c, %struct.bitmap* %a_elt, %struct.bitmap* %b_elt, %struct.bitmap* %c_elt) {
+entry:
+  br label %Top
+
+Top:
+  %tobool1 = icmp eq %struct.bitmap* %a_elt, %b_elt
+  br i1 %tobool1, label %CallSiteBB, label %NextCond
+
+NextCond:
+  %cmp = icmp ne %struct.bitmap* %b_elt, %c_elt
+  br i1 %cmp, label %CallSiteBB, label %End
+
+CallSiteBB:
+  %phi = phi i1 [0, %Top],[1, %NextCond]
+  %u = call i1 @callee2(i1 %phi)
+  call void @foo(i1 %u)
+  br label %End
+
+End:
+  ret void
+}
+
+define i1 @callee2(i1 %b) {
+entry:
+  br i1 %b, label %BB1, label %BB2
+
+BB1:
+  call void @dummy3()
+  br label %End
+
+BB2:
+  call void @dummy4()
+  br label %End
+
+End:
+  ret i1 %b
+}
+
+declare void @dummy3()
+declare void @dummy4()
+declare void @foo(i1)
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoInline.ll b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
new file mode 100644
index 000000000000..6c0b83298d23
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoInline.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S -stats -pass-remarks=partial-inlining 2>&1 | FileCheck %s
+
+@stat = external global i32, align 4
+
+define i32 @inline_fail(i32 %count, ...) {
+entry:
+  %vargs = alloca i8*, align 8
+  %vargs1 = bitcast i8** %vargs to i8*
+  call void @llvm.va_start(i8* %vargs1)
+  %stat1 = load i32, i32* @stat, align 4
+  %cmp = icmp slt i32 %stat1, 0
+  br i1 %cmp, label %bb2, label %bb1
+
+bb1:                                              ; preds = %entry
+  %vg1 = add nsw i32 %stat1, 1
+  store i32 %vg1, i32* @stat, align 4
+  %va1 = va_arg i8** %vargs, i32
+  call void @foo(i32 %count, i32 %va1) #2
+  br label %bb2
+
+bb2:                                              ; preds = %bb1, %entry
+  %res = phi i32 [ 1, %bb1 ], [ 0, %entry ]
+  call void @llvm.va_end(i8* %vargs1)
+  ret i32 %res
+}
+
+define i32 @caller(i32 %arg) {
+bb:
+  %res = tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+  ret i32 %res
+}
+
+declare void @foo(i32, i32)
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)
+
+; Check that no remarks have been emitted, inline_fail has not been partial
+; inlined, no code has been extracted and the partial-inlining counter
+; has not been incremented.
+
+; CHECK-NOT: remark
+; CHECK: tail call i32 (i32, ...) @inline_fail(i32 %arg, i32 %arg)
+; CHECK-NOT: inline_fail.1_bb1
+; CHECK-NOT: partial-inlining
diff --git a/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
new file mode 100644
index 000000000000..06a513543c45
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/ARM/sink-addrmode.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -codegenprepare -mtriple=thumbv7m -disable-complex-addr-modes=false -addr-sink-new-select=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+; Select between two geps with different base, same constant offset
+define void @test_select_twogep_base(i32* %ptr1, i32* %ptr2, i32 %value) {
+; CHECK-LABEL: @test_select_twogep_base
+; CHECK-NOT: select i1 %cmp, i32* %gep1, i32* %gep2
+; CHECK: select i1 %cmp, i32* %ptr1, i32* %ptr2
+entry:
+  %cmp = icmp sgt i32 %value, 0
+  %gep1 = getelementptr inbounds i32, i32* %ptr1, i32 1
+  %gep2 = getelementptr inbounds i32, i32* %ptr2, i32 1
+  %select = select i1 %cmp, i32* %gep1, i32* %gep2
+  store i32 %value, i32* %select, align 4
+  ret void
+}
+
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
new file mode 100644
index 000000000000..2bacbdd7f400
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-base.ll
@@ -0,0 +1,475 @@
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=true -addr-sink-new-select=true  %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-YES
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-phis=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Can we sink for different base if there is no phi for base?
+define i32 @test1(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test1
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base?
+define i32 @test2(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test2
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+  %b = phi i64* [%b1, %entry], [%b2, %if.then]
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base but not valid one?
+define i32 @test3(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test3
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+  %b = phi i64* [%b2, %entry], [%b1, %if.then]
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Can we sink for different base if both addresses are in the same block?
+define i32 @test4(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test4
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base?
+; Both addresses are in the same block.
+define i32 @test5(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test5
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+  %b = phi i64* [%b1, %entry], [%b2, %if.then]
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Can we sink for different base if there is phi for base but not valid one?
+; Both addresses are in the same block.
+define i32 @test6(i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test6
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+  %b = phi i64* [%b2, %entry], [%b1, %if.then]
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %v = load i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; case with a loop. No phi node.
+define i32 @test7(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test7
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+  %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+  %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+  %c = phi i32* [%c3, %loop], [%c2, %if.then]
+  %v = load volatile i32, i32* %c, align 4
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %v
+}
+
+; case with a loop. There is phi node.
+define i32 @test8(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test8
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+  %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+  %b3 = phi i64* [%b1, %entry], [%b, %fallthrough]
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK: getelementptr i8, {{.+}} 40
+  %c = phi i32* [%c3, %loop], [%c2, %if.then]
+  %b = phi i64* [%b3, %loop], [%b2, %if.then]
+  %v = load volatile i32, i32* %c, align 4
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %v
+}
+
+; case with a loop. There is phi node but it does not fit.
+define i32 @test9(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test9
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+  %iv = phi i32 [0, %entry], [%iv.inc, %fallthrough]
+  %c3 = phi i32* [%c1, %entry], [%c, %fallthrough]
+  %b3 = phi i64* [%b1, %entry], [%b2, %fallthrough]
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+  %c = phi i32* [%c3, %loop], [%c2, %if.then]
+  %b = phi i64* [%b3, %loop], [%b2, %if.then]
+  %v = load volatile i32, i32* %c, align 4
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %v
+}
+
+; Case through a loop. No phi node.
+define i32 @test10(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test10
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK-YES: sunkaddr
+  %v = load volatile i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Case through a loop. There is a phi.
+define i32 @test11(i32 %N, i1 %cond, i64* %b1, i64* %b2) {
+; CHECK-LABEL: @test11
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK: phi
+; CHECK: phi
+; CHECK: br
+  %c = phi i32* [%c1, %entry], [%c2, %if.then]
+  %b = phi i64* [%b1, %entry], [%b2, %if.then]
+  br label %loop
+
+loop:
+  %iv = phi i32 [0, %fallthrough], [%iv.inc, %loop]
+  %iv.inc = add i32 %iv, 1
+  %cmp = icmp slt i32 %iv.inc, %N
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+; CHECK: sunkaddr
+  %v = load volatile i32, i32* %c, align 4
+  ret i32 %v
+}
+
+; Complex case with address value from previous iteration.
+define i32 @test12(i32 %N, i1 %cond, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test12
+entry:
+  %a1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %c1 = bitcast i64* %a1 to i32*
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: br
+  %iv = phi i32 [0, %entry], [%iv.inc, %backedge]
+  %c3 = phi i32* [%c1, %entry], [%c, %backedge]
+  %b4 = phi i64* [%b1, %entry], [%b5, %backedge]
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %a2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %c2 = bitcast i64* %a2 to i32*
+  br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO: phi
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+  %c = phi i32* [%c3, %loop], [%c2, %if.then]
+  %b6 = phi i64* [%b4, %loop], [%b2, %if.then]
+  %v = load volatile i32, i32* %c, align 4
+  %a4 = getelementptr inbounds i64, i64* %b4, i64 5
+  %c4 = bitcast i64* %a4 to i32*
+  %cmp = icmp slt i32 %iv, 20
+  br i1 %cmp, label %backedge, label %if.then.2
+
+if.then.2:
+  br label %backedge
+
+backedge:
+  %b5 = phi i64* [%b4, %fallthrough], [%b6, %if.then.2]
+  %iv.inc = add i32 %iv, 1
+  %cmp2 = icmp slt i32 %iv.inc, %N
+  br i1 %cmp2, label %loop, label %exit
+
+exit:
+  ret i32 %v
+}
+
+%struct.S = type {i32, i32}
+; Case with index
+define i32 @test13(i1 %cond, %struct.S* %b1, %struct.S* %b2, i64 %Index) {
+; CHECK-LABEL: @test13
+entry:
+  %a1 = getelementptr inbounds %struct.S, %struct.S* %b1, i64 %Index, i32 1
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+  %i2 = mul i64 %Index, 2
+  %a2 = getelementptr inbounds %struct.S, %struct.S* %b2, i64 %Index, i32 1
+  br label %fallthrough
+
+fallthrough:
+; CHECK-YES: sunk_phi
+; CHECK-NO-LABEL: fallthrough:
+; CHECK-NO-NEXT: phi
+; CHECK-NO-NEXT: load
+  %a = phi i32* [%a1, %entry], [%a2, %if.then]
+  %v = load i32, i32* %a, align 4
+  ret i32 %v
+}
+
+; Select of Select case.
+define i64 @test14(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test14
+entry:
+; CHECK-LABEL: entry:
+  %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+  %s1 = select i1 %c1, i64* %g1, i64* %g2
+  %s2 = select i1 %c2, i64* %s1, i64* %g3
+; CHECK: sunkaddr
+  %v = load i64 , i64* %s2, align 8
+  ret i64 %v
+}
+
+; Select of Phi case.
+define i64 @test15(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test15
+entry:
+  %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+  br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+  %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+  %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: select
+; CHECK-NO-NEXT: load
+  %v = load i64 , i64* %s1, align 8
+  ret i64 %v
+}
+
+; Select of Phi case. Phi exists
+define i64 @test16(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test16
+entry:
+  %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+  br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+  %p = phi i64* [%b1, %entry], [%b2, %if.then]
+  %p1 = phi i64* [%g1, %entry], [%g2, %if.then]
+  %s1 = select i1 %c2, i64* %p1, i64* %g3
+; CHECK: sunkaddr
+  %v = load i64 , i64* %s1, align 8
+  ret i64 %v
+}
+
+; Phi of Select case.
+define i64 @test17(i1 %c1, i1 %c2, i64* %b1, i64* %b2, i64* %b3) {
+; CHECK-LABEL: @test17
+entry:
+  %g1 = getelementptr inbounds i64, i64* %b1, i64 5
+  %g2 = getelementptr inbounds i64, i64* %b2, i64 5
+  %g3 = getelementptr inbounds i64, i64* %b3, i64 5
+  %s1 = select i1 %c2, i64* %g1, i64* %g2
+  br i1 %c1, label %if.then, label %fallthrough
+
+if.then:
+  br label %fallthrough
+
+fallthrough:
+; CHECK-LABEL: fallthrough:
+  %p1 = phi i64* [%s1, %entry], [%g3, %if.then]
+; CHECK-YES: sunkaddr
+; CHECK-NO: phi
+; CHECK-NO-NEXT: load
+  %v = load i64 , i64* %p1, align 8
+  ret i64 %v
+}
diff --git a/test/Transforms/ExpandMemCmp/X86/lit.local.cfg b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
new file mode 100644
index 000000000000..e71f3cc4c41e
--- /dev/null
+++ b/test/Transforms/ExpandMemCmp/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index a4f635c956df..1abfb20f3696 100644
--- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ b/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: opt -S -expandmemcmp -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64
 
 declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)
 
@@ -23,30 +23,33 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp3(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; ALL-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; ALL-NEXT:    [[TMP6:%.*]] = icmp eq i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
-; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i16 [[TMP4]], [[TMP5]]
-; ALL-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT:    [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT:    [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT:    [[TMP5:%.*]] = load i16, i16* [[TMP3]]
+; ALL-NEXT:    [[TMP6:%.*]] = load i16, i16* [[TMP4]]
+; ALL-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
+; ALL-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 2
 ; ALL-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP11]]
 ; ALL-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT:    [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; ALL-NEXT:    br label [[ENDBLOCK]]
 ; ALL:       endblock:
-; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; ALL-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3)
@@ -74,30 +77,33 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp5(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
-; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ]
+; ALL-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; ALL-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 4
 ; ALL-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; ALL-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; ALL-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP11]]
 ; ALL-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; ALL-NEXT:    [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; ALL-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; ALL-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; ALL-NEXT:    br label [[ENDBLOCK]]
 ; ALL:       endblock:
-; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; ALL-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5)
@@ -106,36 +112,37 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp6(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; ALL-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; ALL-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; ALL-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
-; ALL-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; ALL-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; ALL-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; ALL-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; ALL-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; ALL-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; ALL-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; ALL-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; ALL-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; ALL-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; ALL-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; ALL-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT:    [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT:    [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 2
+; ALL-NEXT:    [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT:    [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
 ; ALL-NEXT:    [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 2
-; ALL-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; ALL-NEXT:    [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 2
 ; ALL-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; ALL-NEXT:    [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; ALL-NEXT:    [[TMP15:%.*]] = load i16, i16* [[TMP13]]
 ; ALL-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; ALL-NEXT:    [[TMP17]] = zext i16 [[TMP15]] to i32
+; ALL-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
 ; ALL-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; ALL-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[TMP17]], [[TMP18]]
-; ALL-NEXT:    br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; ALL-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
+; ALL-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
+; ALL-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; ALL:       endblock:
-; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; ALL-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6)
@@ -153,34 +160,35 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-LABEL: @cmp8(
-; X32-NEXT:  loadbb:
-; X32-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
-; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
-; X32-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X32-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
-; X32-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
+; X32:       loadbb:
+; X32-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]]
+; X32-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP4]]
+; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
+; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
+; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
+; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 1
+; X32-NEXT:    [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT:    [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
 ; X32-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 1
-; X32-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X32-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 1
 ; X32-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X32-NEXT:    [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X32-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]]
 ; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP15]], [[TMP16]]
-; X32-NEXT:    br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
+; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
+; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
 ; X64-LABEL: @cmp8(
@@ -207,30 +215,33 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
 ; X64-LABEL: @cmp9(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]]
-; X64-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]]
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i8 8
 ; X64-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP10]]
-; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP11]]
 ; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
+; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP15]], [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
@@ -243,36 +254,37 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
 ; X64-LABEL: @cmp10(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP9:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i16, i16* [[TMP9]], i16 4
+; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[Y]] to i16*
 ; X64-NEXT:    [[TMP12:%.*]] = getelementptr i16, i16* [[TMP10]], i16 4
-; X64-NEXT:    [[TMP13:%.*]] = load i16, i16* [[TMP11]]
+; X64-NEXT:    [[TMP13:%.*]] = getelementptr i16, i16* [[TMP11]], i16 4
 ; X64-NEXT:    [[TMP14:%.*]] = load i16, i16* [[TMP12]]
-; X64-NEXT:    [[TMP15:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP13]])
+; X64-NEXT:    [[TMP15:%.*]] = load i16, i16* [[TMP13]]
 ; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = zext i16 [[TMP15]] to i64
+; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
 ; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT:    br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i64
+; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
@@ -294,36 +306,37 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
 ; X64-LABEL: @cmp12(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP9:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[Y]] to i32*
 ; X64-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 2
-; X64-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP11]]
+; X64-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
 ; X64-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP12]]
-; X64-NEXT:    [[TMP15:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X64-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]]
 ; X64-NEXT:    [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = zext i32 [[TMP15]] to i64
+; X64-NEXT:    [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
 ; X64-NEXT:    [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[TMP17]], [[TMP18]]
-; X64-NEXT:    br i1 [[TMP19]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP19]] = zext i32 [[TMP17]] to i64
+; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
+; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
@@ -363,34 +376,35 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
 ; X64-LABEL: @cmp16(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
-; X64-NEXT:    br i1 [[TMP6]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4]], [[LOADBB:%.*]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1]] ]
-; X64-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
-; X64-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 -1, i32 1
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP3]]
+; X64-NEXT:    [[TMP6:%.*]] = load i64, i64* [[TMP4]]
+; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
+; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP9:%.*]] = bitcast i8* [[X]] to i64*
-; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[Y]] to i64*
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 1
+; X64-NEXT:    [[TMP10:%.*]] = bitcast i8* [[X]] to i64*
+; X64-NEXT:    [[TMP11:%.*]] = bitcast i8* [[Y]] to i64*
 ; X64-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[TMP10]], i64 1
-; X64-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP11]]
+; X64-NEXT:    [[TMP13:%.*]] = getelementptr i64, i64* [[TMP11]], i64 1
 ; X64-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP12]]
-; X64-NEXT:    [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP13]]
 ; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP16]]
-; X64-NEXT:    br i1 [[TMP17]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
+; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
+; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP8]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
@@ -417,22 +431,23 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp_eq3(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i16*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i16*
-; ALL-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i16, i16* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i16 [[TMP2]], [[TMP3]]
-; ALL-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16*
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16*
+; ALL-NEXT:    [[TMP3:%.*]] = load i16, i16* [[TMP1]]
+; ALL-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP2]]
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
+; ALL-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 2
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 2
-; ALL-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 2
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 2
 ; ALL-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT:    [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT:    br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; ALL:       endblock:
 ; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -465,22 +480,23 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp_eq5(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 4
-; ALL-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 4
-; ALL-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; ALL-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 4
+; ALL-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 4
 ; ALL-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; ALL-NEXT:    [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; ALL-NEXT:    br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; ALL-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; ALL-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; ALL:       endblock:
 ; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -495,24 +511,25 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; ALL-LABEL: @cmp_eq6(
-; ALL-NEXT:  loadbb:
-; ALL-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; ALL-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; ALL-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; ALL-NEXT:    br label [[LOADBB:%.*]]
 ; ALL:       res_block:
 ; ALL-NEXT:    br label [[ENDBLOCK:%.*]]
+; ALL:       loadbb:
+; ALL-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; ALL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; ALL-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; ALL-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; ALL-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; ALL:       loadbb1:
-; ALL-NEXT:    [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; ALL-NEXT:    [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; ALL-NEXT:    [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 2
+; ALL-NEXT:    [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; ALL-NEXT:    [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
 ; ALL-NEXT:    [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 2
-; ALL-NEXT:    [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; ALL-NEXT:    [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 2
 ; ALL-NEXT:    [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; ALL-NEXT:    [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; ALL-NEXT:    br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; ALL-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; ALL-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; ALL-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; ALL:       endblock:
 ; ALL-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; ALL-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -540,24 +557,25 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 
 define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq8(
-; X32-NEXT:  loadbb:
-; X32-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i32*
-; X32-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i32*
-; X32-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]]
-; X32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
-; X32-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP2]], [[TMP3]]
-; X32-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
+; X32:       loadbb:
+; X32-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32*
+; X32-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32*
+; X32-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]]
+; X32-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]]
+; X32-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X32-NEXT:    [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 1
+; X32-NEXT:    [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X32-NEXT:    [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
 ; X32-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 1
-; X32-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 1
 ; X32-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X32-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X32-NEXT:    br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X32-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X32-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -589,22 +607,23 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
 ; X64-LABEL: @cmp_eq9(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[X]], i8 8
-; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[Y]], i8 8
-; X64-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP5]]
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i8 8
+; X64-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i8 8
 ; X64-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = icmp ne i8 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP7]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
+; X64-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -625,24 +644,25 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
 ; X64-LABEL: @cmp_eq10(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP5:%.*]] = bitcast i8* [[X]] to i16*
-; X64-NEXT:    [[TMP6:%.*]] = bitcast i8* [[Y]] to i16*
-; X64-NEXT:    [[TMP7:%.*]] = getelementptr i16, i16* [[TMP5]], i16 4
+; X64-NEXT:    [[TMP6:%.*]] = bitcast i8* [[X]] to i16*
+; X64-NEXT:    [[TMP7:%.*]] = bitcast i8* [[Y]] to i16*
 ; X64-NEXT:    [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i16 4
-; X64-NEXT:    [[TMP9:%.*]] = load i16, i16* [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i16, i16* [[TMP7]], i16 4
 ; X64-NEXT:    [[TMP10:%.*]] = load i16, i16* [[TMP8]]
-; X64-NEXT:    [[TMP11:%.*]] = icmp ne i16 [[TMP9]], [[TMP10]]
-; X64-NEXT:    br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP9]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
+; X64-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -676,24 +696,25 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
 ; X64-LABEL: @cmp_eq12(
-; X64-NEXT:  loadbb:
-; X64-NEXT:    [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT:    br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT:    [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
+; X64-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP1]]
+; X64-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP2]]
+; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP5:%.*]] = bitcast i8* [[X]] to i32*
-; X64-NEXT:    [[TMP6:%.*]] = bitcast i8* [[Y]] to i32*
-; X64-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 2
+; X64-NEXT:    [[TMP6:%.*]] = bitcast i8* [[X]] to i32*
+; X64-NEXT:    [[TMP7:%.*]] = bitcast i8* [[Y]] to i32*
 ; X64-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 2
-; X64-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i32, i32* [[TMP7]], i32 2
 ; X64-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP8]]
-; X64-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP9]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
+; X64-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
diff --git a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
index 488d4b479bab..0225af903ef4 100644
--- a/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
+++ b/test/Transforms/IRCE/add-metadata-pre-post-loops.ll
@@ -38,7 +38,7 @@ exit:                                             ; preds = %in.bounds, %entry
 define void @single_access_with_preloop(i32 *%arr, i32 *%a_len_ptr, i32 %n, i32 %offset) {
 ; CHECK-LABEL: @single_access_with_preloop(
 ; CHECK-LABEL: in.bounds.preloop
-; CHECK: br i1 %14, label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
+; CHECK: br i1 [[COND:%[^ ]+]], label %loop.preloop, label %preloop.exit.selector, !llvm.loop !8, !irce.loop.clone !7
 ; CHECK-LABEL: in.bounds.postloop
 ; CHECK: br i1 %next.postloop, label %loop.postloop, label %exit.loopexit.loopexit, !llvm.loop !9, !irce.loop.clone !7
  entry:
diff --git a/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
new file mode 100644
index 000000000000..dc6aae8d8aa6
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll
@@ -0,0 +1,71 @@
+; RUN: opt %s -indvars -S -o - | FileCheck %s
+source_filename = "/Data/llvm/test/Transforms/IndVarSimplify/scev-phi-debug-info.ll"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.status = type { i32, i8* }
+
+@status = internal unnamed_addr global [32 x %struct.status] zeroinitializer, align 16, !dbg !0
+
+define void @f0() local_unnamed_addr !dbg !20 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 0, metadata !23, metadata !DIExpression()), !dbg !24
+  br label %for.cond, !dbg !24
+
+for.cond:                                         ; preds = %for.body, %entry
+  ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  ; CHECK: call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !23, metadata !DIExpression()), !dbg !24
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.dbg.value(metadata i32 %i.0, metadata !23, metadata !DIExpression()), !dbg !24
+  %cmp = icmp slt i32 %i.0, 32, !dbg !24
+  br i1 %cmp, label %for.body, label %for.end, !dbg !24
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64, !dbg !24
+  %value = getelementptr inbounds [32 x %struct.status], [32 x %struct.status]* @status, i64 0, i64 %idxprom, i32 0, !dbg !24
+  store i32 42, i32* %value, align 16, !dbg !24
+  tail call void @use(i32 %i.0), !dbg !24
+  %inc = add nsw i32 %i.0, 1, !dbg !24
+  tail call void @llvm.dbg.value(metadata i32 %inc, metadata !23, metadata !DIExpression()), !dbg !24
+  br label %for.cond, !dbg !24
+
+for.end:                                          ; preds = %for.cond
+  ret void, !dbg !24
+}
+
+declare void @use(i32)
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16, !17, !18}
+!llvm.ident = !{!19}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "status", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "x.c", directory: "/home/davide/work/llvm/build-release/bin")
+!4 = !{}
+!5 = !{!0}
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 4096, elements: !14)
+!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "status", file: !3, line: 2, size: 128, elements: !8)
+!8 = !{!9, !11}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !7, file: !3, line: 3, baseType: !10, size: 32)
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !7, file: !3, line: 4, baseType: !12, size: 64, offset: 64)
+!12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64)
+!13 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+!14 = !{!15}
+!15 = !DISubrange(count: 32)
+!16 = !{i32 2, !"Dwarf Version", i32 4}
+!17 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{i32 1, !"wchar_size", i32 4}
+!19 = !{!"clang version 6.0.0 (trunk 316001) (llvm/trunk 316171)"}
+!20 = distinct !DISubprogram(name: "f0", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22)
+!21 = !DISubroutineType(types: !4)
+!22 = !{!23}
+!23 = !DILocalVariable(name: "i", scope: !20, file: !3, line: 8, type: !10)
+!24 = !DILocation(line: 9, scope: !20)
diff --git a/test/Transforms/InstCombine/debuginfo_add.ll b/test/Transforms/InstCombine/debuginfo_add.ll
new file mode 100644
index 000000000000..0d194cc65c7a
--- /dev/null
+++ b/test/Transforms/InstCombine/debuginfo_add.ll
@@ -0,0 +1,108 @@
+; RUN: opt -instcombine %s -o - -S | FileCheck %s
+; typedef struct v *v_t;
+; struct v {
+;   unsigned long long p;
+; };
+;  
+; void f(v_t object, unsigned long long *start) {
+;   unsigned head_size;
+;   unsigned long long orig_start;
+;   unsigned long long offset;
+;   orig_start = *start;
+;   for (offset = orig_start - (unsigned long long)(1 << 12); head_size;
+;        offset -= (unsigned long long)(1 << 12), head_size -= (1 << 12))
+;     use(offset, (object));
+; }
+source_filename = "test.i"
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7s-apple-ios5.0.0"
+
+%struct.vm_object = type { i64 }
+
+; Function Attrs: nounwind ssp
+define void @f(%struct.vm_object* %object, i64* nocapture readonly %start) local_unnamed_addr #0 !dbg !11 {
+entry:
+  tail call void @llvm.dbg.value(metadata %struct.vm_object* %object, metadata !21, metadata !DIExpression()), !dbg !27
+  tail call void @llvm.dbg.value(metadata i64* %start, metadata !22, metadata !DIExpression()), !dbg !28
+  %0 = load i64, i64* %start, align 4, !dbg !29
+  tail call void @llvm.dbg.value(metadata i64 %0, metadata !25, metadata !DIExpression()), !dbg !30
+  %offset.08 = add i64 %0, -4096
+  tail call void @llvm.dbg.value(metadata i64 %offset.08, metadata !26, metadata !DIExpression()), !dbg !31
+  ; CHECK: call void @llvm.dbg.value(metadata i64 %0, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !30
+  tail call void @llvm.dbg.value(metadata i32 undef, metadata !23, metadata !DIExpression()), !dbg !32
+  br i1 undef, label %for.end, label %for.body.lr.ph, !dbg !32
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body, !dbg !32
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %offset.010 = phi i64 [ %offset.08, %for.body.lr.ph ], [ %offset.0, %for.body ]
+  %head_size.09 = phi i32 [ undef, %for.body.lr.ph ], [ %sub2, %for.body ]
+  tail call void @llvm.dbg.value(metadata i32 %head_size.09, metadata !23, metadata !DIExpression()), !dbg !31
+  %call = tail call i32 bitcast (i32 (...)* @use to i32 (i64, %struct.vm_object*)*)(i64 %offset.010, %struct.vm_object* %object) #3, !dbg !34
+  %sub2 = add i32 %head_size.09, -4096, !dbg !37
+  %offset.0 = add i64 %offset.010, -4096
+  tail call void @llvm.dbg.value(metadata i64 %offset.0, metadata !26, metadata !DIExpression()), !dbg !30
+  ; CHECK: call void @llvm.dbg.value(metadata i64 %offset.010, metadata !26, metadata !DIExpression(DW_OP_constu, 4096, DW_OP_minus, DW_OP_stack_value)), !dbg !29
+  tail call void @llvm.dbg.value(metadata i32 %sub2, metadata !23, metadata !DIExpression()), !dbg !31
+  %tobool = icmp eq i32 %sub2, 0, !dbg !32
+  br i1 %tobool, label %for.end, label %for.body, !dbg !32, !llvm.loop !38
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !40
+}
+
+declare i32 @use(...) local_unnamed_addr
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.i", directory: "/Data/radar/31209283")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!5 = !{i32 2, !"Dwarf Version", i32 2}
+!6 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = !{i32 1, !"wchar_size", i32 4}
+!8 = !{i32 1, !"min_enum_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 6.0.0 (trunk 317434) (llvm/trunk 317437)"}
+!11 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20)
+!12 = !DISubroutineType(types: !13)
+!13 = !{null, !14, !19}
+!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "v_t", file: !1, line: 1, baseType: !15)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32)
+!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: !1, line: 2, size: 64, elements: !17)
+!17 = !{!18}
+!18 = !DIDerivedType(tag: DW_TAG_member, name: "p", scope: !16, file: !1, line: 3, baseType: !4, size: 64)
+!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32)
+!20 = !{!21, !22, !23, !25, !26}
+!21 = !DILocalVariable(name: "object", arg: 1, scope: !11, file: !1, line: 6, type: !14)
+!22 = !DILocalVariable(name: "start", arg: 2, scope: !11, file: !1, line: 6, type: !19)
+!23 = !DILocalVariable(name: "head_size", scope: !11, file: !1, line: 7, type: !24)
+!24 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!25 = !DILocalVariable(name: "orig_start", scope: !11, file: !1, line: 8, type: !4)
+!26 = !DILocalVariable(name: "offset", scope: !11, file: !1, line: 9, type: !4)
+!27 = !DILocation(line: 6, column: 20, scope: !11)
+!28 = !DILocation(line: 6, column: 48, scope: !11)
+!29 = !DILocation(line: 8, column: 22, scope: !11)
+!30 = !DILocation(line: 7, column: 12, scope: !11)
+!31 = !DILocation(line: 10, column: 16, scope: !11)
+!32 = !DILocation(line: 11, column: 5, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !11, file: !1, line: 11, column: 5)
+!34 = !DILocation(line: 13, column: 7, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 12, column: 75)
+!36 = distinct !DILexicalBlock(scope: !33, file: !1, line: 11, column: 5)
+!37 = !DILocation(line: 12, column: 61, scope: !36)
+!38 = distinct !{!38, !32, !39}
+!39 = !DILocation(line: 14, column: 3, scope: !33)
+!40 = !DILocation(line: 15, column: 1, scope: !11)
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index cbb3d614db23..ba52023e0dbf 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -1332,3 +1332,263 @@ define i7 @test65(i7 %a, i7 %b) {
   %y = and i7 %x, 1 ; this extracts the lsb which should be 0 because we shifted an even number of bits and all even bits of the shift input are 0.
   ret i7 %y
 }
+
+define i32 @shl_select_add_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = add i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_add_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_add_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = add i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_and_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_and_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_and_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 2147483655
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_and_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_and_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], -1073741821
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = and i32 %x, 2147483655
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_or_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_or_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_or_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_or_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_or_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = or i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @shl_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @shl_select_xor_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 14
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = shl i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @lshr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @lshr_select_xor_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = lshr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_xor_true(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_true(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP2]], i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %1, i32 %x
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
+
+define i32 @ashr_select_xor_false(i32 %x, i1 %cond) {
+; CHECK-LABEL: @ashr_select_xor_false(
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i32 [[TMP1]], i32 [[TMP2]]
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %1 = xor i32 %x, 7
+  %2 = select i1 %cond, i32 %x, i32 %1
+  %3 = ashr i32 %2, 1
+  ret i32 %3
+}
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index 6e9e8d4b7b6f..b28eea0bc2aa 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -392,6 +392,288 @@ lab60:
   indirectbr i8* undef, [label %lab21, label %lab19]
 }
 
-declare void @f(i32*)
+; Check if LICM can sink a sinkable instruction the exit blocks through
+; a non-trivially replacable PHI node.
+;
+; CHECK-LABEL: @test14
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[MUL]], %Out12.split.loop.exit ], [ %[[SUB]], %Out12.split.loop.exit1 ]
+define i32 @test14(i32 %N, i32 %N2, i1 %C) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %sink.sub = sub i32 %sink.mul, %N
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+        %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+        br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.mul,  %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; In this test, splitting predecessors is not really required because the
+; operations of sinkable instructions (sub and mul) are same. In this case, we
+; can sink the same sinkable operations and modify the PHI to pass the operands
+; to the shared operations. As of now, we split predecessors of non-trivially
+; replicalbe PHIs by default in LICM because all incoming edges of a
+; non-trivially replacable PHI in LCSSA is critical.
+;
+; CHECK-LABEL: @test15
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL: Out12.split.loop.exit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop ]
+; CHECK: %[[MUL:.*]] = mul i32 %N, %[[LCSSAPHI]]
+; CHECK: %[[SUB:.*]] = sub i32 %[[MUL]], %N2
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12.split.loop.exit1:
+; CHECK: %[[LCSSAPHI2:.*]] = phi i32 [ %N_addr.0.pn, %Loop ]
+; CHECK: %[[MUL2:.*]] = mul i32 %N, %[[LCSSAPHI2]]
+; CHECK: %[[SUB2:.*]] = sub i32 %[[MUL2]], %N
+; CHECK: br label %Out12
+;
+; CHECK-LABEL: Out12:
+; CHECK: phi i32 [ %[[SUB]], %Out12.split.loop.exit ], [ %[[SUB2]], %Out12.split.loop.exit1 ]
+define i32 @test15(i32 %N, i32 %N2, i1 %C) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %sink.sub = sub i32 %sink.mul, %N
+        %sink.sub2 = sub i32 %sink.mul, %N2
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %C, label %ContLoop, label %Out12
+ContLoop:
+        %tmp.1 = icmp ne i32 %N_addr.0.pn, 1
+        br i1 %tmp.1, label %Loop, label %Out12
+Out12:
+  %tmp = phi i32 [%sink.sub2, %ContLoop], [%sink.sub, %Loop]
+  ret i32 %tmp
+}
+
+; Sink through a non-trivially replacable PHI node which use the same sinkable
+; instruction multiple times.
+;
+; CHECK-LABEL: @test16
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL: Out.split.loop.exit:
+; CHECK: %[[PHI:.*]] = phi i32 [ %l2, %ContLoop ]
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out.split.loop.exit1:
+; CHECK: %[[SINKABLE:.*]] = mul i32 %l2.lcssa, %t.le
+; CHECK: br label %Out
+;
+; CHECK-LABEL: Out:
+; CHECK: %idx = phi i32 [ %[[PHI]], %Out.split.loop.exit ], [ %[[SINKABLE]], %Out.split.loop.exit1 ]
+define i32 @test16(i1 %c, i8** %P, i32* %P2, i64 %V) {
+entry:
+  br label %loop.ph
+loop.ph:
+  br label %Loop
+Loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %next, %ContLoop ]
+  %l2 = call i32 @getv()
+  %t = trunc i64 %iv to i32
+  %sinkable = mul i32 %l2,  %t
+  switch i32 %l2, label %ContLoop [
+    i32 32, label %Out
+    i32 46, label %Out
+    i32 95, label %Out
+  ]
+ContLoop:
+  %next = add nuw i64 %iv, 1
+  %c1 = call i1 @getc()
+  br i1 %c1, label %Loop, label %Out
+Out:
+  %idx = phi i32 [ %l2, %ContLoop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ], [ %sinkable, %Loop ]
+  ret i32 %idx
+}
+
+; Sink a sinkable instruction through multiple non-trivially replacable PHIs in
+; differect exit blocks.
+;
+; CHECK-LABEL: @test17
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+;
+; CHECK-LABEL:OutA.split.loop.exit{{.*}}:
+; CHECK:  %[[OP1:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop1 ]
+; CHECK:  %[[SINKABLE:.*]] = mul i32 %N, %[[OP1]]
+; CHECK:  br label %OutA
+;
+; CHECK-LABEL:OutA:
+; CHECK: phi i32{{.*}}[ %[[SINKABLE]], %OutA.split.loop.exit{{.*}} ]
+;
+; CHECK-LABEL:OutB.split.loop.exit{{.*}}:
+; CHECK:  %[[OP2:.*]] = phi i32 [ %N_addr.0.pn, %ContLoop2 ]
+; CHECK:  %[[SINKABLE2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK:  br label %OutB
+;
+; CHECK-LABEL:OutB:
+; CHECK:  phi i32 {{.*}}[ %[[SINKABLE2]], %OutB.split.loop.exit{{.*}} ]
+define i32 @test17(i32 %N, i32 %N2) {
+Entry:
+        br label %Loop
+Loop:
+        %N_addr.0.pn = phi i32 [ %dec, %ContLoop3 ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %N_addr.0.pn
+        %c0 = call i1 @getc()
+        br i1 %c0 , label %ContLoop1, label %OutA
+ContLoop1:
+        %c1 = call i1 @getc()
+        br i1 %c1, label %ContLoop2, label %OutA
+
+ContLoop2:
+        %c2 = call i1 @getc()
+        br i1 %c2, label %ContLoop3, label %OutB
+ContLoop3:
+        %c3 = call i1 @getc()
+        %dec = add i32 %N_addr.0.pn, -1
+        br i1 %c3, label %Loop, label %OutB
+OutA:
+        %tmp1 = phi i32 [%sink.mul, %ContLoop1], [%N2, %Loop]
+        br label %Out12
+OutB:
+        %tmp2 = phi i32 [%sink.mul, %ContLoop2], [%dec, %ContLoop3]
+        br label %Out12
+Out12:
+  %tmp = phi i32 [%tmp1, %OutA], [%tmp2, %OutB]
+  ret i32 %tmp
+}
+
+
+; Sink a sinkable instruction through both trivially and non-trivially replacable PHIs.
+;
+; CHECK-LABEL: @test18
+; CHECK-LABEL: Loop:
+; CHECK-NOT: mul
+; CHECK-NOT: sub
+;
+; CHECK-LABEL:Out12.split.loop.exit:
+; CHECK:  %[[OP:.*]] = phi i32 [ %iv, %ContLoop ]
+; CHECK:  %[[DEC:.*]] = phi i32 [ %dec, %ContLoop ]
+; CHECK:  %[[SINKMUL:.*]] = mul i32 %N, %[[OP]]
+; CHECK:  %[[SINKSUB:.*]] = sub i32 %[[SINKMUL]], %N2
+; CHECK:  br label %Out12
+;
+; CHECK-LABEL:Out12.split.loop.exit1:
+; CHECK:  %[[OP2:.*]] = phi i32 [ %iv, %Loop ]
+; CHECK:  %[[SINKMUL2:.*]] = mul i32 %N, %[[OP2]]
+; CHECK:  %[[SINKSUB2:.*]] = sub i32 %[[SINKMUL2]], %N2
+; CHECK:  br label %Out12
+;
+; CHECK-LABEL:Out12:
+; CHECK:  %tmp1 = phi i32 [ %[[SINKSUB]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK:  %tmp2 = phi i32 [ %[[DEC]], %Out12.split.loop.exit ], [ %[[SINKSUB2]], %Out12.split.loop.exit1 ]
+; CHECK:  %add = add i32 %tmp1, %tmp2
+define i32 @test18(i32 %N, i32 %N2) {
+Entry:
+        br label %Loop
+Loop:
+        %iv = phi i32 [ %dec, %ContLoop ], [ %N, %Entry ]
+        %sink.mul = mul i32 %N, %iv
+        %sink.sub = sub i32 %sink.mul, %N2
+        %c0 = call i1 @getc()
+        br i1 %c0, label %ContLoop, label %Out12
+ContLoop:
+        %dec = add i32 %iv, -1
+        %c1 = call i1 @getc()
+        br i1 %c1, label %Loop, label %Out12
+Out12:
+  %tmp1 = phi i32 [%sink.sub, %ContLoop], [%sink.sub, %Loop]
+  %tmp2 = phi i32 [%dec, %ContLoop], [%sink.sub, %Loop]
+  %add = add i32 %tmp1, %tmp2
+  ret i32 %add
+}
+
+; Do not sink an instruction through a non-trivially replacable PHI, to avoid
+; assert while splitting predecessors, if the terminator of predecessor is an
+; indirectbr.
+; CHECK-LABEL: @test19
+; CHECK-LABEL: L0:
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+
+define i32 @test19(i1 %cond, i1 %cond2, i8* %address, i32 %v1) nounwind {
+entry:
+  br label %L0
+L0:
+  %indirect.goto.dest = select i1 %cond, i8* blockaddress(@test19, %exit), i8* %address
+  %v2 = call i32 @getv()
+  %sinkable = mul i32 %v1, %v2
+  %sinkable2 = add i32 %v1, %v2
+  indirectbr i8* %indirect.goto.dest, [label %L1, label %exit]
+
+L1:
+  %indirect.goto.dest2 = select i1 %cond2, i8* blockaddress(@test19, %exit), i8* %address
+  indirectbr i8* %indirect.goto.dest2, [label %L0, label %exit]
+
+exit:
+  %r = phi i32 [%sinkable, %L0], [%sinkable2, %L1]
+  ret i32 %r
+}
 
+
+; Do not sink through a non-trivially replacable PHI if splitting predecessors
+; not allowed in SplitBlockPredecessors().
+;
+; CHECK-LABEL: @test20
+; CHECK-LABEL: while.cond
+; CHECK: %sinkable = mul
+; CHECK: %sinkable2 = add
+define void @test20(i32* %s, i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+while.cond:
+  %v = call i32 @getv()
+  %sinkable = mul i32 %v, %v2
+  %sinkable2 = add  i32 %v, %v2
+  br i1 %b, label %try.cont, label %while.body
+while.body:
+  invoke void @may_throw()
+          to label %while.body2 unwind label %catch.dispatch
+while.body2:
+  invoke void @may_throw2()
+          to label %while.cond unwind label %catch.dispatch
+catch.dispatch:
+  %.lcssa1 = phi i32 [ %sinkable, %while.body ], [ %sinkable2, %while.body2 ]
+  %cp = cleanuppad within none []
+  store i32 %.lcssa1, i32* %s
+  cleanupret from %cp unwind to caller
+try.cont:
+  ret void
+}
+
+declare void @may_throw()
+declare void @may_throw2()
+declare i32 @__CxxFrameHandler3(...)
+declare i32 @getv()
+declare i1 @getc()
+declare void @f(i32*)
 declare void @g()
diff --git a/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
new file mode 100644
index 000000000000..3c283dcb6e53
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/merge-tbaa.ll
@@ -0,0 +1,46 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -S < %s | \
+; RUN:     FileCheck %s
+;
+; The GPU Load & Store Vectorizer may merge differently-typed accesses into a
+; single instruction. This test checks that we merge TBAA tags for such
+; accesses correctly.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; struct S {
+;   float f;
+;   int i;
+; };
+%struct.S = type { float, i32 }
+
+; float foo(S *p) {
+;   p->f -= 1;
+;   p->i -= 1;
+;   return p->f;
+; }
+define float @foo(%struct.S* %p) {
+entry:
+; CHECK-LABEL: foo
+; CHECK: load <2 x i32>, {{.*}}, !tbaa [[TAG_char:!.*]]
+; CHECK: store <2 x i32> {{.*}}, !tbaa [[TAG_char]]
+  %f = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 0
+  %0 = load float, float* %f, align 4, !tbaa !2
+  %sub = fadd float %0, -1.000000e+00
+  store float %sub, float* %f, align 4, !tbaa !2
+  %i = getelementptr inbounds %struct.S, %struct.S* %p, i64 0, i32 1
+  %1 = load i32, i32* %i, align 4, !tbaa !8
+  %sub1 = add nsw i32 %1, -1
+  store i32 %sub1, i32* %i, align 4, !tbaa !8
+  ret float %sub
+}
+
+!2 = !{!3, !4, i64 0}
+!3 = !{!"_ZTS1S", !4, i64 0, !7, i64 4}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!"int", !5, i64 0}
+!8 = !{!3, !7, i64 4}
+
+; CHECK-DAG: [[TYPE_char:!.*]] = !{!"omnipotent char", {{.*}}, i64 0}
+; CHECK-FAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
diff --git a/test/Transforms/LoopPredication/widened.ll b/test/Transforms/LoopPredication/widened.ll
new file mode 100644
index 000000000000..33c4e2706133
--- /dev/null
+++ b/test/Transforms/LoopPredication/widened.ll
@@ -0,0 +1,138 @@
+; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @length(i8*)
+
+declare i16 @short_length(i8*)
+; Consider range check of type i16 and i32, while IV is of type i64
+; We can loop predicate this because the IV range is within i16 and within i32.
+define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) {
+; CHECK-LABEL: iv_wider_type_rc_two_narrow_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+  %lengthA = call i32 @length(i8* %arrA)
+  %lengthB = call i16 @short_length(i8* %arrB)
+   br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9)
+  %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc.32 = trunc i64 %iv to i32
+  %iv.trunc.16 = trunc i64 %iv to i16
+  %indexA = add i32 %iv.trunc.32, %offA
+  %indexB = add i16 %iv.trunc.16, %offB
+  %rcA = icmp ult i32 %indexA, %lengthA
+  %rcB = icmp ult i16 %indexB, %lengthB
+  %wide.chk = and i1 %rcA, %rcB
+  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %indexB.ext = zext i16 %indexB to i64
+  %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+  store i8 %eltA, i8* %addrB
+  %iv.next = add nuw nsw i64 %iv, 1
+  %latch.check = icmp ult i64 %iv.next, 16
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+
+; Consider an IV of type long and an array access into int array.
+; IV is of type i64 while the range check operands are of type i32 and i64.
+define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max)
+{
+; CHECK-LABEL: iv_rc_different_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1
+; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1
+; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]]
+; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult  i64 0, %max
+; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]]
+; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+  %lengthA = call i32 @length(i8* %arrA)
+  %lengthB = call i32 @length(i8* %arrB)
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]]
+; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9)
+  %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i32
+  %indexA = add i32 %iv.trunc, %offA
+  %indexB = add i32 %iv.trunc, %offB
+  %rcA = icmp ult i32 %indexA, %lengthA
+  %rcIV = icmp ult i64 %iv, %max
+  %wide.chk = and i1 %rcA, %rcIV
+  %rcB = icmp ult i32 %indexB, %lengthB
+  %wide.chk.final = and i1 %wide.chk, %rcB
+  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %indexB.ext = zext i32 %indexB to i64
+  %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+  %eltB = load i8, i8* %addrB
+  %result = xor i8 %eltA, %eltB
+  store i8 %result, i8* %addrA
+  %iv.next = add nuw nsw i64 %iv, 1
+  %latch.check = icmp ult i64 %iv, 15
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+; cannot narrow the IV to the range type, because we lose information.
+; for (i64 i= 5; i>= 2; i++)
+; this loop wraps around after reaching 2^64.
+define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) {
+; CHECK-LABEL: iv_rc_different_type
+entry:
+  %lengthA = call i32 @length(i8* %arrA)
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: %rcA = icmp ult i32 %indexA, %lengthA
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9)
+  %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ]
+  %iv.trunc.32 = trunc i64 %iv to i32
+  %indexA = add i32 %iv.trunc.32, %offA
+  %rcA = icmp ult i32 %indexA, %lengthA
+  call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %res = add i8 %eltA, 2
+  store i8 %eltA, i8* %addrA
+  %iv.next = add i64 %iv, 1
+  %latch.check = icmp sge i64 %iv.next, 2
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
diff --git a/test/Transforms/LoopVectorize/pr34681.ll b/test/Transforms/LoopVectorize/pr34681.ll
new file mode 100644
index 000000000000..e93265e2ed5c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr34681.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, so if we specialize the loop for the Stride==1 case,
+; this also implies that the loop will iterate no more than a single iteration,
+; as in the following example: 
+;
+;       unsigned int N;
+;       int tmp = 0;
+;       for(unsigned int k=0;k<N;k++) {
+;         tmp+=(int)B[k*N+j];
+;       }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+;   %ident.check = icmp ne i32 %N, 1
+;   %0 = or i1 false, %ident.check
+;   br i1 %0, label %scalar.ph, label %vector.ph
+; Instead the loop is vectorized with an unknown stride.
+
+; CHECK-LABEL: @foo1
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i32 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+
+define i32 @foo1(i32 %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j)  {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+  %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul i32 %k.09, %N
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+  %0 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %add1 = add nsw i32 %tmp.010, %conv
+  %inc = add nuw i32 %k.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add1.lcssa = phi i32 [ %add1, %for.body ]
+  br label %for.end
+
+for.end: 
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
+
+
+; Check the same, but also where the Stride and the loop iteration count
+; are not of the same data type. 
+;
+;       unsigned short N;
+;       int tmp = 0;
+;       for(unsigned int k=0;k<N;k++) {
+;         tmp+=(int)B[k*N+j];
+;       }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i16 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+
+
+; CHECK-LABEL: @foo2
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i16 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+define i32 @foo2(i16 zeroext %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+  %conv = zext i16 %N to i32
+  %cmp11 = icmp eq i16 %N, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+  %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = mul nuw i32 %k.012, %conv
+  %add = add i32 %mul, %j
+  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+  %0 = load i16, i16* %arrayidx, align 2
+  %conv3 = sext i16 %0 to i32
+  %add4 = add nsw i32 %tmp.013, %conv3
+  %inc = add nuw nsw i32 %k.012, 1
+  %exitcond = icmp eq i32 %inc, %conv
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  %add4.lcssa = phi i32 [ %add4, %for.body ]
+  br label %for.end
+
+for.end:
+  %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+  ret i32 %tmp.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll
index a9d319e5a2dd..774b6f268599 100644
--- a/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -65,7 +65,8 @@ for.end:
 define void @fn1(double* noalias %x, double* noalias %c, double %a) {
 entry:
   %conv = fptosi double %a to i32
-  %cmp8 = icmp sgt i32 %conv, 0
+  %conv2 = add i32 %conv, 4
+  %cmp8 = icmp sgt i32 %conv2, 0
   br i1 %cmp8, label %for.body.preheader, label %for.end
 
 for.body.preheader:
@@ -82,7 +83,7 @@ for.body:
   store double %1, double* %arrayidx3, align 8
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %conv
+  %exitcond = icmp eq i32 %lftr.wideiv, %conv2
   br i1 %exitcond, label %for.end.loopexit, label %for.body
 
 for.end.loopexit:
diff --git a/test/Transforms/LowerTypeTests/blockaddress.ll b/test/Transforms/LowerTypeTests/blockaddress.ll
new file mode 100644
index 000000000000..ecc4814cfd58
--- /dev/null
+++ b/test/Transforms/LowerTypeTests/blockaddress.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S %s -lowertypetests | FileCheck %s
+
+
+; CHECK: define internal i8* @f2.cfi() !type !0 {
+; CHECK-NEXT:  br label %b
+; CHECK: b:
+; CHECK-NEXT:  ret i8* blockaddress(@f2.cfi, %b)
+; CHECK-NEXT: }
+
+target triple = "x86_64-unknown-linux"
+
+define void @f1() {
+entry:
+  %0 = call i1 @llvm.type.test(i8* bitcast (i8* ()* @f2 to i8*), metadata !"_ZTSFvP3bioE")
+  ret void
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+
+define i8* @f2() !type !5 {
+  br label %b
+
+b:
+  ret i8* blockaddress(@f2, %b)
+}
+
+!5 = !{i64 0, !"_ZTSFvP3bioE"}
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 6cb9b26fb574..b9eb552dd662 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -7,6 +7,7 @@
 ; SUMMARY-NEXT:    - Linkage:             0
 ; SUMMARY-NEXT:      NotEligibleToImport: false
 ; SUMMARY-NEXT:      Live:                true
+; SUMMARY-NEXT:      Local:               false
 ; SUMMARY-NEXT:      TypeTests: [ 123 ]
 ; SUMMARY-NEXT: TypeIdMap:
 ; SUMMARY-NEXT:   typeid1:
diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
new file mode 100644
index 000000000000..9b0210d9a309
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext
@@ -0,0 +1,29 @@
+:ir
+_Z11irreducibleii
+# Func Hash:
+64451410787
+# Num Counters:
+6
+# Counter Values:
+1000
+950
+100
+373
+1
+0
+
+_Z11irreduciblePh
+# Func Hash:
+104649601521
+# Num Counters:
+9
+# Counter Values:
+100
+300
+99
+300
+201
+1
+1
+0
+0
diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll
new file mode 100644
index 000000000000..37f6e206ee92
--- /dev/null
+++ b/test/Transforms/PGOProfile/irreducible.ll
@@ -0,0 +1,184 @@
+; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+
+; GEN: $__llvm_profile_raw_version = comdat any
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 {
+entry:
+  %cmp24 = icmp sgt i32 %iter_outer, 0
+  br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge
+
+entry.for.cond.cleanup_crit_edge:                 ; preds = %entry
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %entry.for.cond.cleanup_crit_edge, %for.end
+  %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+  ret i32 %sum.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.end
+  %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+  %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+  %rem23 = and i32 %k.026, 1
+  %cmp1 = icmp eq i32 %rem23, 0
+  br i1 %cmp1, label %entry8, label %for.cond2
+
+for.cond2:                                        ; preds = %for.body, %if.end9
+  %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+  %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+  %cmp3 = icmp slt i32 %i.0, %iter_inner
+  br i1 %cmp3, label %for.body4, label %for.end
+; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]]
+
+for.body4:                                        ; preds = %for.cond2
+  %rem5 = srem i32 %k.026, 3
+  %cmp6 = icmp eq i32 %rem5, 0
+  br i1 %cmp6, label %entry8, label %if.end9
+
+entry8:                                           ; preds = %for.body4, %for.body
+  %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+  %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+  %add = add nsw i32 %sum.2, 4
+  br label %if.end9
+; USE: br label %if.end9,
+; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]]
+
+if.end9:                                          ; preds = %entry8, %for.body4
+  %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+  %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+  %add10 = add nsw i32 %sum.3, 1
+  %inc = add nsw i32 %i.2, 1
+  br label %for.cond2
+; USE: br label %for.cond2,
+; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]]
+
+for.end:                                          ; preds = %for.cond2
+  %inc12 = add nuw nsw i32 %k.026, 1
+  %exitcond = icmp eq i32 %inc12, %iter_outer
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) {
+entry:
+  store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+  store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+  %0 = load i32, i32* @tracing, align 4
+  %tobool = icmp eq i32 %0, 0
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %sw.default, %entry
+  %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+  %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+  %1 = load i8, i8* %p.addr.0, align 1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+  %2 = load i8, i8* %incdec.ptr, align 1
+  %conv3 = zext i8 %2 to i32
+  br label %dispatch_op
+
+dispatch_op:                                      ; preds = %sw.bb6, %for.cond1
+  %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+  %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+  %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+  %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+  switch i8 %op.0, label %sw.default [
+    i8 0, label %sw.bb
+    i8 1, label %dispatch_op.sw.bb6_crit_edge
+    i8 2, label %sw.bb15
+  ]
+
+dispatch_op.sw.bb6_crit_edge:                     ; preds = %dispatch_op
+  br label %sw.bb6
+
+sw.bb:                                            ; preds = %indirectgoto, %dispatch_op
+  %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+  %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+  %add.neg = sub i32 -5, %oparg.1
+  %sub = add i32 %add.neg, %sum.2
+  br label %exit
+
+TARGET_1:                                         ; preds = %indirectgoto
+  %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+  %3 = load i8, i8* %p.addr.5, align 1
+  %conv5 = zext i8 %3 to i32
+  br label %sw.bb6
+
+sw.bb6:                                           ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1
+  %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+  %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+  %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+  %mul = mul nsw i32 %oparg.2, 7
+  %add7 = add nsw i32 %sum.3, %mul
+  %rem46 = and i32 %add7, 1
+  %cmp8 = icmp eq i32 %rem46, 0
+  br i1 %cmp8, label %dispatch_op, label %if.then
+; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]]
+
+if.then:                                          ; preds = %sw.bb6
+  %mul9 = mul nsw i32 %add7, 9
+  br label %indirectgoto
+
+TARGET_2:                                         ; preds = %indirectgoto
+  %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+  %4 = load i8, i8* %p.addr.5, align 1
+  %conv14 = zext i8 %4 to i32
+  br label %sw.bb15
+
+sw.bb15:                                          ; preds = %TARGET_2, %dispatch_op
+  %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+  %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+  %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+  %add16 = add nsw i32 %oparg.3, 3
+  %add17 = add nsw i32 %add16, %sum.4
+  br i1 %tobool, label %if.then18, label %exit
+; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
+
+if.then18:                                        ; preds = %sw.bb15
+  %idx.ext = sext i32 %oparg.3 to i64
+  %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+  %mul19 = mul nsw i32 %add17, 17
+  br label %indirectgoto
+
+unknown_op:                                       ; preds = %indirectgoto
+  %sub24 = add nsw i32 %sum.7, -4
+  br label %sw.default
+
+sw.default:                                       ; preds = %unknown_op, %dispatch_op
+  %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+  %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+  %add25 = add nsw i32 %sum.5, 11
+  br label %for.cond1
+
+exit:                                             ; preds = %sw.bb15, %sw.bb
+  %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+  ret i32 %sum.6
+
+indirectgoto:                                     ; preds = %if.then18, %if.then
+  %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+  %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+  %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+  %5 = load i8, i8* %add.ptr.pn, align 1
+  %idxprom21 = zext i8 %5 to i64
+  %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+  %6 = load i8*, i8** %arrayidx22, align 8
+  indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
+; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
+}
+
+; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
+; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
+; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
+; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
+; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
+; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}
diff --git a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
index c1c074e75a70..1751854d448d 100644
--- a/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
+++ b/test/Transforms/PGOProfile/thinlto_samplepgo_icp2.ll
@@ -22,7 +22,7 @@
 ; RUN: llvm-nm %t3.2 | FileCheck %s --check-prefix=NM
 ; NM: _ZL3barv
 ; RUN: llvm-dis < %t3.2.2.internalize.bc | FileCheck %s --check-prefix=INTERNALIZE
-; INTERNALIZE: define void @_ZL3barv
+; INTERNALIZE: define dso_local void @_ZL3barv
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
index 105afa9def5c..ebc15865a67d 100644
--- a/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
+++ b/test/Transforms/RewriteStatepointsForGC/drop-invalid-metadata.ll
@@ -75,6 +75,54 @@ define void @test_dereferenceable(i32 addrspace(1)* addrspace(1)* %p, i32 %x, i3
   ret void
 }
 
+; invariant.start allows us to sink the load past the baz statepoint call into taken block, which is
+; incorrect. remove the invariant.start and RAUW undef.
+define void @test_inv_start(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+  %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+  %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+  %v2 = load i32, i32 addrspace(1)* %v1
+  call void @baz(i32 %x)
+  br i1 %cond, label %taken, label %untaken
+
+taken:
+  store i32 %v2, i32 addrspace(1)* %q, align 16
+  call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+  ret void
+
+; CHECK-LABEL: untaken:
+; CHECK: gc.statepoint
+untaken:
+  %foo = call i32 @escaping.invariant.start({}* %invst)
+  call void @dummy(i32 %foo)
+  ret void
+}
+
+; invariant.start is removed and the uses are undef'ed.
+define void @test_inv_start2(i1 %cond, i32 addrspace(1)* addrspace(1)* %p, i32 %x, i32 addrspace(1)* %q) gc "statepoint-example" {
+; CHECK-LABEL: test_inv_start2
+; CHECK-NOT: invariant.start
+; CHECK: gc.statepoint
+  %v1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
+  %invst = call {}* @llvm.invariant.start.p1i32(i64 1, i32 addrspace(1)* %v1)
+  %v2 = load i32, i32 addrspace(1)* %v1
+  call void @baz(i32 %x)
+  br i1 %cond, label %taken, label %untaken
+
+taken:
+  store i32 %v2, i32 addrspace(1)* %q, align 16
+  call void @llvm.invariant.end.p1i32({}* %invst, i64 4, i32 addrspace(1)* %v1)
+  ret void
+
+untaken:
+  ret void
+}
+declare {}* @llvm.invariant.start.p1i32(i64, i32 addrspace(1)*  nocapture) nounwind readonly
+declare void @llvm.invariant.end.p1i32({}*, i64, i32 addrspace(1)* nocapture) nounwind
+declare i32 @escaping.invariant.start({}*) nounwind
+declare void @dummy(i32)
 declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...)
 
 ; Function Attrs: nounwind readonly
diff --git a/test/Transforms/SLPVectorizer/X86/call.ll b/test/Transforms/SLPVectorizer/X86/call.ll
index 03b1e837a0ca..8397d348483c 100644
--- a/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/test/Transforms/SLPVectorizer/X86/call.ll
@@ -11,133 +11,158 @@ declare double @sqrt(double)
 declare i64 @round(i64)
 
 
-; CHECK: sin_libm
-; CHECK: call <2 x double> @llvm.sin.v2f64
-; CHECK: ret void
-define void @sin_libm(double* %a, double* %b, double* %c) {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %call = tail call double @sin(double %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %call5 = tail call double @sin(double %mul5) nounwind readnone
-  store double %call, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %call5, double* %arrayidx5, align 8
+define void @sin_libm(double* %a, double* %b) {
+; CHECK-LABEL: @sin_libm(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %sin1 = tail call double @sin(double %a0) nounwind readnone
+  %sin2 = tail call double @sin(double %a1) nounwind readnone
+  store double %sin1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %sin2, double* %idx2, align 8
   ret void
 }
 
-; CHECK: cos_libm
-; CHECK: call <2 x double> @llvm.cos.v2f64
-; CHECK: ret void
-define void @cos_libm(double* %a, double* %b, double* %c) {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %call = tail call double @cos(double %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %call5 = tail call double @cos(double %mul5) nounwind readnone
-  store double %call, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %call5, double* %arrayidx5, align 8
+define void @cos_libm(double* %a, double* %b) {
+; CHECK-LABEL: @cos_libm(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %cos1 = tail call double @cos(double %a0) nounwind readnone
+  %cos2 = tail call double @cos(double %a1) nounwind readnone
+  store double %cos1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %cos2, double* %idx2, align 8
   ret void
 }
 
-; CHECK: pow_libm
-; CHECK: call <2 x double> @llvm.pow.v2f64
-; CHECK: ret void
-define void @pow_libm(double* %a, double* %b, double* %c) {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %call = tail call double @pow(double %mul,double %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %call5 = tail call double @pow(double %mul5,double %mul5) nounwind readnone
-  store double %call, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %call5, double* %arrayidx5, align 8
+define void @pow_libm(double* %a, double* %b) {
+; CHECK-LABEL: @pow_libm(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %pow1 = tail call double @pow(double %a0, double %a0) nounwind readnone
+  %pow2 = tail call double @pow(double %a1, double %a1) nounwind readnone
+  store double %pow1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %pow2, double* %idx2, align 8
   ret void
 }
 
-
-; CHECK: exp2_libm
-; CHECK: call <2 x double> @llvm.exp2.v2f64
-; CHECK: ret void
-define void @exp2_libm(double* %a, double* %b, double* %c) {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %call = tail call double @exp2(double %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %call5 = tail call double @exp2(double %mul5) nounwind readnone
-  store double %call, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %call5, double* %arrayidx5, align 8
+define void @exp_libm(double* %a, double* %b) {
+; CHECK-LABEL: @exp_libm(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %exp1 = tail call double @exp2(double %a0) nounwind readnone
+  %exp2 = tail call double @exp2(double %a1) nounwind readnone
+  store double %exp1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %exp2, double* %idx2, align 8
   ret void
 }
 
-
-; CHECK: sqrt_libm
-; CHECK: call nnan <2 x double> @llvm.sqrt.v2f64
-; CHECK: ret void
-define void @sqrt_libm(double* %a, double* %b, double* %c) {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %call = tail call nnan double @sqrt(double %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %call5 = tail call nnan double @sqrt(double %mul5) nounwind readnone
-  store double %call, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %call5, double* %arrayidx5, align 8
+; No fast-math-flags are required to convert sqrt library calls to an intrinsic. 
+; We just need to know that errno is not set (readnone).
+
+define void @sqrt_libm_no_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_no_errno(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* %a to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* %b to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %sqrt1 = tail call double @sqrt(double %a0) nounwind readnone
+  %sqrt2 = tail call double @sqrt(double %a1) nounwind readnone
+  store double %sqrt1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %sqrt2, double* %idx2, align 8
   ret void
 }
 
+; The sqrt intrinsic does not set errno, but a non-constant sqrt call might, so this can't vectorize.
+; The nnan on the call does not matter because there's no guarantee in the C standard that a negative
+; input would result in a nan output ("On a domain error, the function returns an 
+; implementation-defined value.")
+
+define void @sqrt_libm_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_errno(
+; CHECK-NEXT:    [[A0:%.*]] = load double, double* %a, align 8
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* %a, i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDX1]], align 8
+; CHECK-NEXT:    [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #2
+; CHECK-NEXT:    [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #2
+; CHECK-NEXT:    store double [[SQRT1]], double* %b, align 8
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* %b, i64 1
+; CHECK-NEXT:    store double [[SQRT2]], double* [[IDX2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load double, double* %a, align 8
+  %idx1 = getelementptr inbounds double, double* %a, i64 1
+  %a1 = load double, double* %idx1, align 8
+  %sqrt1 = tail call nnan double @sqrt(double %a0) nounwind
+  %sqrt2 = tail call nnan double @sqrt(double %a1) nounwind
+  store double %sqrt1, double* %b, align 8
+  %idx2 = getelementptr inbounds double, double* %b, i64 1
+  store double %sqrt2, double* %idx2, align 8
+  ret void
+}
 
 ; Negative test case
-; CHECK: round_custom
-; CHECK-NOT: load <4 x i64>
-; CHECK: ret void
-define void @round_custom(i64* %a, i64* %b, i64* %c) {
-entry:
-  %i0 = load i64, i64* %a, align 8
-  %i1 = load i64, i64* %b, align 8
-  %mul = mul i64 %i0, %i1
-  %call = tail call i64 @round(i64 %mul) nounwind readnone
-  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
-  %i3 = load i64, i64* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1
-  %i4 = load i64, i64* %arrayidx4, align 8
-  %mul5 = mul i64 %i3, %i4
-  %call5 = tail call i64 @round(i64 %mul5) nounwind readnone
-  store i64 %call, i64* %c, align 8
-  %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1
-  store i64 %call5, i64* %arrayidx5, align 8
+define void @round_custom(i64* %a, i64* %b) {
+; CHECK-LABEL: @round_custom(
+; CHECK-NEXT:    [[A0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds i64, i64* %a, i64 1
+; CHECK-NEXT:    [[A1:%.*]] = load i64, i64* [[IDX1]], align 8
+; CHECK-NEXT:    [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #3
+; CHECK-NEXT:    [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #3
+; CHECK-NEXT:    store i64 [[ROUND1]], i64* %b, align 8
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds i64, i64* %b, i64 1
+; CHECK-NEXT:    store i64 [[ROUND2]], i64* [[IDX2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a0 = load i64, i64* %a, align 8
+  %idx1 = getelementptr inbounds i64, i64* %a, i64 1
+  %a1 = load i64, i64* %idx1, align 8
+  %round1 = tail call i64 @round(i64 %a0) nounwind readnone
+  %round2 = tail call i64 @round(i64 %a1) nounwind readnone
+  store i64 %round1, i64* %b, align 8
+  %idx2 = getelementptr inbounds i64, i64* %b, i64 1
+  store i64 %round2, i64* %idx2, align 8
   ret void
 }
 
diff --git a/test/Transforms/SLPVectorizer/X86/cast.ll b/test/Transforms/SLPVectorizer/X86/cast.ll
index 5d7118753e92..2f9f84948eaf 100644
--- a/test/Transforms/SLPVectorizer/X86/cast.ll
+++ b/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -14,10 +14,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {
 ; CHECK-LABEL: @test_sext_4i8_to_4i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* %B to <4 x i8>*
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
@@ -46,10 +46,10 @@ entry:
 define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {
 ; CHECK-LABEL: @test_zext_4i16_to_4i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* %A to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
@@ -76,30 +76,21 @@ entry:
 }
 
 define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
-; SSE-LABEL: @test_sext_4i16_to_4i64(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = bitcast i16* %B to <2 x i16>*
-; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
-; SSE-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
-; SSE-NEXT:    [[TMP3:%.*]] = bitcast i64* %A to <2 x i64>*
-; SSE-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
-; SSE-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* %B, i64 2
-; SSE-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* %A, i64 2
-; SSE-NEXT:    [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
-; SSE-NEXT:    [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
-; SSE-NEXT:    [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
-; SSE-NEXT:    ret i64 undef
-;
-; AVX-LABEL: @test_sext_4i16_to_4i64(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = bitcast i16* %B to <4 x i16>*
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
-; AVX-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
-; AVX-NEXT:    [[TMP3:%.*]] = bitcast i64* %A to <4 x i64>*
-; AVX-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
-; AVX-NEXT:    ret i64 undef
+; CHECK-LABEL: @test_sext_4i16_to_4i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
+; CHECK-NEXT:    ret i64 undef
 ;
 entry:
   %0 = load i16, i16* %B, align 1
diff --git a/test/Transforms/SLPVectorizer/X86/load-merge.ll b/test/Transforms/SLPVectorizer/X86/load-merge.ll
new file mode 100644
index 000000000000..df990be073b1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+
+;unsigned load_le32(unsigned char *data) {
+;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
+;    return le32;
+;}
+
+define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
+; CHECK-LABEL: @_Z9load_le32Ph(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
+; CHECK-NEXT:    [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
+; CHECK-NEXT:    [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT:    ret i32 [[OR11]]
+;
+entry:
+  %0 = load i8, i8* %data, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
+  %1 = load i8, i8* %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i32
+  %shl3 = shl nuw nsw i32 %conv2, 8
+  %or = or i32 %shl3, %conv
+  %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
+  %2 = load i8, i8* %arrayidx4, align 1
+  %conv5 = zext i8 %2 to i32
+  %shl6 = shl nuw nsw i32 %conv5, 16
+  %or7 = or i32 %or, %shl6
+  %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
+  %3 = load i8, i8* %arrayidx8, align 1
+  %conv9 = zext i8 %3 to i32
+  %shl10 = shl nuw i32 %conv9, 24
+  %or11 = or i32 %or7, %shl10
+  ret i32 %or11
+}
diff --git a/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
new file mode 100644
index 000000000000..79fb782db8f5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+;  long a = p3[0] = 5;
+;  p1 += p2;
+;  p4[3] += p1[a];
+;  p3[0] >>= 5;
+;  p3[1] >>= 5;
+;  p3[2] >>= 5;
+;  p3[3] >>= 5;
+;  p1 += p2;
+;  p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[P3]], align 8
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[TMP2]], 5
+; CHECK-NEXT:    store i64 [[SHR]], i64* [[P3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <2 x i64> [[TMP4]], <i64 5, i64 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX4]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT:    store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT:    [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[SHR]], 5
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i64 5, i64* %p3, align 8
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %0 = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %0, %1
+  store float %add, float* %arrayidx2, align 4
+  %2 = load i64, i64* %p3, align 8
+  %shr = lshr i64 %2, 5
+  store i64 %shr, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %3 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %3, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %4 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %4, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %5 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %5, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %and = and i64 %shr, 5
+  %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+  %6 = load float, float* %arrayidx13, align 4
+  %7 = load float, float* %p4, align 4
+  %add15 = fadd float %6, %7
+  store float %add15, float* %p4, align 4
+  ret void
+}
diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll
index 61a1bc519966..0c00639e6c00 100644
--- a/test/Transforms/SampleProfile/indirect-call.ll
+++ b/test/Transforms/SampleProfile/indirect-call.ll
@@ -182,7 +182,7 @@ define void @test_direct() !dbg !22 {
 ; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398}
 ; CHECK: ![[BR1]] = !{!"branch_weights", i32 4000, i32 4000}
 ; CHECK: ![[BR2]] = !{!"branch_weights", i32 3000, i32 1000}
-; CHECK: ![[VP]] = !{!"VP", i32 0, i64 1000, i64 -6391416044382067764, i64 1000}
+; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000}
 !6 = distinct !DISubprogram(name: "test_inline", scope: !1, file: !1, line: 6, unit: !0)
 !7 = !DILocation(line: 7, scope: !6)
 !8 = distinct !DISubprogram(name: "test_inline_strip", scope: !1, file: !1, line: 8, unit: !0)
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
index a2b940380016..a2ca63d0a2df 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores-2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 | FileCheck %s
+; RUN: opt -S < %s -simplifycfg -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=1 | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv7--linux-gnueabihf"
diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll
index 052a34948343..927ee16b370c 100644
--- a/test/Transforms/WholeProgramDevirt/import-indir.ll
+++ b/test/Transforms/WholeProgramDevirt/import-indir.ll
@@ -7,6 +7,7 @@
 ; SUMMARY-NEXT:    - Linkage:             0
 ; SUMMARY-NEXT:      NotEligibleToImport: false
 ; SUMMARY-NEXT:      Live:                true
+; SUMMARY-NEXT:      Local:               false
 ; SUMMARY-NEXT:      TypeTestAssumeVCalls:
 ; SUMMARY-NEXT:        - GUID:            123
 ; SUMMARY-NEXT:          Offset:          0