author     Matt Arsenault <Matthew.Arsenault@amd.com>   2022-09-26 23:07:49 -0400
committer  Tom Stellard <tstellar@redhat.com>           2022-11-01 20:11:40 -0700
commit     5c68a1cb123161b54b72ce90e7975d95a8eaf2a4 (patch)
tree       b1013f8b89a70291c382a80263eb7c06a34cb657
parent     80a9fc840b1b0c5bdd6509578283af3b02782d48 (diff)
download   llvm-5c68a1cb123161b54b72ce90e7975d95a8eaf2a4.tar.gz
AMDGPU: Make various vector undefs legal (llvmorg-15.0.4)
Surprisingly these were getting legalized to something zero initialized.

This fixes an infinite loop when combining some vector types. Also fixes
zero initializing some undef values.

SimplifyDemandedVectorElts / SimplifyDemandedBits are not checking for
the legality of the output undefs they are replacing unused operations
with. This resulted in turning vectors into undefs that were later
re-legalized back into zero vectors.

(cherry picked from commit 7a84624079a2656c684bed6100708544500c5a32)
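The change itself is two lines: in the SITargetLowering constructor, the
switch that decides which operations remain Legal for the wide vector types
gains a case for ISD::UNDEF, so undef nodes stop being expanded into
zero-initialized BUILD_VECTORs. A condensed sketch of the surrounding
pattern, reconstructed from the hunk context below (the exact type list and
the full set of cases are abbreviated):

    // Ops listed in the switch are left Legal for these wide vector
    // types; everything else is expanded.
    for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:            // the fix: leave undef Legal
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
          break;                    // these ops stay Legal
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }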
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                      2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/commute-shifts.ll                    16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll    4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll               10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll           78
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll                 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/select-undef.ll                     219
-rw-r--r--  llvm/test/CodeGen/AMDGPU/skip-if-dead.ll                     113
-rw-r--r--  llvm/test/CodeGen/AMDGPU/v1024.ll                              2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll            144
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wqm.ll                              162
11 files changed, 306 insertions, 458 deletions
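For context on the combiner loop: TargetLowering::SimplifyDemandedVectorElts
folds a node none of whose elements are demanded to undef, roughly

    // Sketch of the demanded-elements fold, not part of this patch:
    if (DemandedElts.isZero())
      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));

Before this change, vector legalization rewrote that UNDEF for types like
v6f32 into a zero-initialized BUILD_VECTOR; the combiner then saw once more
that no element was used and folded the zeros back to undef, ping-ponging
forever. Marking ISD::UNDEF legal breaks the cycle, and the new
inf_loop_undef_vector test in select-undef.ll below pins down the
previously looping case.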
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7d139adc63b..f6b7d1ffc6d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
+ case ISD::UNDEF:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
+ case ISD::UNDEF:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 8df85ba872bf..3697946cb5c3 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -5,14 +5,6 @@
define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-LABEL: main:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: s_mov_b32 s1, s0
-; SI-NEXT: s_mov_b32 s2, s0
-; SI-NEXT: s_mov_b32 s3, s0
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s0
-; SI-NEXT: s_mov_b32 s6, s0
-; SI-NEXT: s_mov_b32 s7, s0
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
;
; VI-LABEL: main:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_mov_b32 s1, s0
-; VI-NEXT: s_mov_b32 s2, s0
-; VI-NEXT: s_mov_b32 s3, s0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s0
-; VI-NEXT: s_mov_b32 s6, s0
-; VI-NEXT: s_mov_b32 s7, s0
; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_and_b32_e32 v0, 7, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 29fc098899ee..5d985850446c 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -213,7 +213,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
- %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
+ %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
ret void
}
@@ -266,7 +266,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
- %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
+ %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index 8af7575f03d0..0b629efffbb3 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -4,16 +4,8 @@
define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GCN-LABEL: _amdgpu_ps_main:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s0
-; GCN-NEXT: s_mov_b32 s6, s0
-; GCN-NEXT: s_mov_b32 s7, s0
; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 6456c87a31fb..cbfd8ec5cb16 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -100,14 +100,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: s_branch .LBB0_4
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB0_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -244,14 +237,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
; GFX9-NEXT: s_cbranch_execz .LBB1_3
; GFX9-NEXT: s_branch .LBB1_4
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB1_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -386,14 +372,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: s_branch .LBB2_4
; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB2_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -567,22 +546,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
; GFX9-NEXT: s_cbranch_execz .LBB3_3
; GFX9-NEXT: s_branch .LBB3_4
; GFX9-NEXT: .LBB3_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB3_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -759,22 +723,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: s_branch .LBB4_4
; GFX9-NEXT: .LBB4_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB4_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -949,22 +898,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
; GFX9-NEXT: s_cbranch_execz .LBB5_3
; GFX9-NEXT: s_branch .LBB5_4
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB5_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index cc4ece6c7059..f742d2c0bda4 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -374,18 +374,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s12, 0
-; GCN-NEXT: s_mov_b32 s4, s12
-; GCN-NEXT: s_mov_b32 s5, s12
-; GCN-NEXT: s_mov_b32 s6, s12
-; GCN-NEXT: s_mov_b32 s7, s12
-; GCN-NEXT: s_mov_b32 s8, s12
-; GCN-NEXT: s_mov_b32 s9, s12
-; GCN-NEXT: s_mov_b32 s10, s12
-; GCN-NEXT: s_mov_b32 s11, s12
-; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 6597d6784e0c..f02cd3fc5e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
@@ -43,3 +43,220 @@ define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
}
declare float @llvm.amdgcn.rcp.f32(float)
+
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v6f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef
+ %add = fadd <6 x float> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <6 x float> %add, <6 x float> addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}undef_v6i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef
+ %add = add <6 x i32> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef
+ ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v5f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef
+ %add = fadd <5 x float> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <5 x float> %add, <5 x float> addrspace(3)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}undef_v5i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef
+ %add = add <5 x i32> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef
+ ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v3f64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr
+ %add = fadd <3 x double> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}undef_v3i64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr
+ %add = add <3 x i64> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr
+ ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v4f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr
+ %add = fadd <4 x half> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}undef_v4i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr
+ %add = add <4 x i16> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr
+ ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v2f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr
+ %add = fadd <2 x half> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}undef_v2i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) {
+entry:
+ br label %loop
+
+loop:
+ %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
+ %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr
+ %add = add <2 x i16> %load, %phi
+ br i1 %cond, label %loop, label %ret
+
+ret:
+ store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr
+ ret void
+}
+
+; We were expanding undef vectors into zero vectors. Optimizations
+; would then see we used no elements of the vector, and reform the
+; undef vector resulting in a combiner loop.
+; GCN-LABEL: {{^}}inf_loop_undef_vector:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mad_u64_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_add3_u32
+; GCN-NEXT: global_store_dwordx2
+define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
+ %i = insertelement <6 x float> %arg, float %arg1, i64 2
+ %i3 = bitcast <6 x float> %i to <3 x i64>
+ %i4 = extractelement <3 x i64> %i3, i64 0
+ %i5 = extractelement <3 x i64> %i3, i64 1
+ %i6 = mul i64 %i5, %arg2
+ %i7 = add i64 %i6, %i4
+ store volatile i64 %i7, i64 addrspace(1)* undef, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index ada6c1da04e2..7080c84f7b50 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1397,28 +1397,20 @@ bb7: ; preds = %bb4
define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
; SI-LABEL: if_after_kill_block:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
-; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
-; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB13_3
; SI-NEXT: ; %bb.1: ; %bb3
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
; SI-NEXT: s_cbranch_scc0 .LBB13_6
; SI-NEXT: ; %bb.2: ; %bb3
; SI-NEXT: s_andn2_b64 exec, exec, vcc
; SI-NEXT: .LBB13_3: ; %bb4
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_mov_b32 s1, s0
-; SI-NEXT: s_mov_b32 s2, s0
-; SI-NEXT: s_mov_b32 s3, s0
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s0
-; SI-NEXT: s_mov_b32 s6, s0
-; SI-NEXT: s_mov_b32 s7, s0
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1439,28 +1431,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX10-WAVE64-LABEL: if_after_kill_block:
; GFX10-WAVE64: ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec
; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0
-; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3
; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3
; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6
; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3
; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4
-; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s4, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s5, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s6, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s7, s0
+; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1479,28 +1463,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX10-WAVE32-LABEL: if_after_kill_block:
; GFX10-WAVE32: ; %bb.0: ; %bb
-; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0
-; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3
; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3
; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0
-; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo
+; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6
; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3
; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4
-; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s4, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s5, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s6, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s7, s0
+; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
@@ -1519,29 +1495,22 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX11-LABEL: if_after_kill_block:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-NEXT: s_mov_b64 s[0:1], exec
; GFX11-NEXT: s_wqm_b64 exec, exec
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b64 s[4:5], exec
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1
-; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX11-NEXT: s_cbranch_execz .LBB13_3
; GFX11-NEXT: ; %bb.1: ; %bb3
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
+; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB13_6
; GFX11-NEXT: ; %bb.2: ; %bb3
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: .LBB13_3: ; %bb4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
; GFX11-NEXT: s_mov_b64 s[0:1], exec
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1584,19 +1553,11 @@ bb9: ; preds = %bb4
define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; SI-LABEL: cbranch_kill:
; SI: ; %bb.0: ; %.entry
-; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: v_mov_b32_e32 v2, v1
; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: s_mov_b32 s5, s4
-; SI-NEXT: s_mov_b32 s6, s4
-; SI-NEXT: s_mov_b32 s7, s4
-; SI-NEXT: s_mov_b32 s8, s4
-; SI-NEXT: s_mov_b32 s9, s4
-; SI-NEXT: s_mov_b32 s10, s4
-; SI-NEXT: s_mov_b32 s11, s4
-; SI-NEXT: image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da
+; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
@@ -1627,16 +1588,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX10-WAVE64-LABEL: cbranch_kill:
; GFX10-WAVE64: ; %bb.0: ; %.entry
; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-WAVE64-NEXT: s_mov_b32 s4, 0
; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec
-; GFX10-WAVE64-NEXT: s_mov_b32 s5, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s6, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s7, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s8, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s9, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s10, s4
-; GFX10-WAVE64-NEXT: s_mov_b32 s11, s4
-; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1
; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc
@@ -1667,16 +1620,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX10-WAVE32-LABEL: cbranch_kill:
; GFX10-WAVE32: ; %bb.0: ; %.entry
; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-WAVE32-NEXT: s_mov_b32 s4, 0
; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
-; GFX10-WAVE32-NEXT: s_mov_b32 s5, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s6, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s7, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s8, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s9, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s10, s4
-; GFX10-WAVE32-NEXT: s_mov_b32 s11, s4
-; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1
; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
@@ -1707,16 +1652,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX11-LABEL: cbranch_kill:
; GFX11: ; %bb.0: ; %.entry
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_mov_b64 s[0:1], exec
-; GFX11-NEXT: s_mov_b32 s5, s4
-; GFX11-NEXT: s_mov_b32 s6, s4
-; GFX11-NEXT: s_mov_b32 s7, s4
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s4
-; GFX11-NEXT: s_mov_b32 s10, s4
-; GFX11-NEXT: s_mov_b32 s11, s4
-; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll
index a5e0454a3634..1326ba437f94 100644
--- a/llvm/test/CodeGen/AMDGPU/v1024.ll
+++ b/llvm/test/CodeGen/AMDGPU/v1024.ll
@@ -10,6 +10,7 @@ define amdgpu_kernel void @test_v1024() {
entry:
%alloca = alloca <32 x i32>, align 16, addrspace(5)
%cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)*
+ call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false)
br i1 undef, label %if.then.i.i, label %if.else.i
if.then.i.i: ; preds = %entry
@@ -24,6 +25,7 @@ if.then.i62.i: ; preds = %if.else.i, %if.then
ret void
}
+declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture readonly, i8, i32, i1 immarg)
declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 5164b072a6dd..ed0de729dafd 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,7 +14,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: v_mov_b32_e32 v36, v16
@@ -22,13 +21,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_mov_b32 s6, s4
-; GFX9-NEXT: s_mov_b32 s7, s4
-; GFX9-NEXT: s_mov_b32 s8, s4
-; GFX9-NEXT: s_mov_b32 s9, s4
-; GFX9-NEXT: s_mov_b32 s10, s4
-; GFX9-NEXT: s_mov_b32 s11, s4
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -82,16 +74,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_mov_b32 s5, s4
-; GFX10-NEXT: s_mov_b32 s6, s4
-; GFX10-NEXT: s_mov_b32 s7, s4
-; GFX10-NEXT: s_mov_b32 s8, s4
-; GFX10-NEXT: s_mov_b32 s9, s4
-; GFX10-NEXT: s_mov_b32 s10, s4
-; GFX10-NEXT: s_mov_b32 s11, s4
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -145,16 +129,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GFX11-NEXT: v_mov_b32_e32 v32, v12
-; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8
@@ -225,65 +201,41 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: v_writelane_b32 v40, s33, 10
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: v_writelane_b32 v40, s36, 2
-; GFX9-NEXT: v_writelane_b32 v40, s37, 3
-; GFX9-NEXT: v_writelane_b32 v40, s38, 4
-; GFX9-NEXT: v_writelane_b32 v40, s39, 5
-; GFX9-NEXT: v_writelane_b32 v40, s40, 6
-; GFX9-NEXT: v_writelane_b32 v40, s41, 7
+; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: v_writelane_b32 v40, s42, 8
-; GFX9-NEXT: s_mov_b32 s36, 0
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v40, s43, 9
; GFX9-NEXT: v_mov_b32_e32 v45, v16
; GFX9-NEXT: v_mov_b32_e32 v44, v15
; GFX9-NEXT: v_mov_b32_e32 v43, v14
; GFX9-NEXT: v_mov_b32_e32 v42, v13
; GFX9-NEXT: v_mov_b32_e32 v41, v12
-; GFX9-NEXT: s_mov_b32 s37, s36
-; GFX9-NEXT: s_mov_b32 s38, s36
-; GFX9-NEXT: s_mov_b32 s39, s36
-; GFX9-NEXT: s_mov_b32 s40, s36
-; GFX9-NEXT: s_mov_b32 s41, s36
-; GFX9-NEXT: s_mov_b32 s42, s36
-; GFX9-NEXT: s_mov_b32 s43, s36
-; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: v_readlane_b32 s43, v40, 9
-; GFX9-NEXT: v_readlane_b32 s42, v40, 8
-; GFX9-NEXT: v_readlane_b32 s41, v40, 7
-; GFX9-NEXT: v_readlane_b32 s40, v40, 6
-; GFX9-NEXT: v_readlane_b32 s39, v40, 5
-; GFX9-NEXT: v_readlane_b32 s38, v40, 4
-; GFX9-NEXT: v_readlane_b32 s37, v40, 3
-; GFX9-NEXT: v_readlane_b32 s36, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-NEXT: v_readlane_b32 s33, v40, 10
+; GFX9-NEXT: v_readlane_b32 s33, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
@@ -298,66 +250,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: v_writelane_b32 v40, s33, 10
+; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_addk_i32 s32, 0x400
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v41, v16
; GFX10-NEXT: v_mov_b32_e32 v42, v15
; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_mov_b32_e32 v44, v13
; GFX10-NEXT: v_mov_b32_e32 v45, v12
-; GFX10-NEXT: v_writelane_b32 v40, s36, 2
-; GFX10-NEXT: s_mov_b32 s36, 0
-; GFX10-NEXT: v_writelane_b32 v40, s37, 3
-; GFX10-NEXT: s_mov_b32 s37, s36
-; GFX10-NEXT: v_writelane_b32 v40, s38, 4
-; GFX10-NEXT: s_mov_b32 s38, s36
-; GFX10-NEXT: v_writelane_b32 v40, s39, 5
-; GFX10-NEXT: s_mov_b32 s39, s36
-; GFX10-NEXT: v_writelane_b32 v40, s40, 6
-; GFX10-NEXT: s_mov_b32 s40, s36
-; GFX10-NEXT: v_writelane_b32 v40, s41, 7
-; GFX10-NEXT: s_mov_b32 s41, s36
-; GFX10-NEXT: v_writelane_b32 v40, s42, 8
-; GFX10-NEXT: s_mov_b32 s42, s36
-; GFX10-NEXT: v_writelane_b32 v40, s43, 9
-; GFX10-NEXT: s_mov_b32 s43, s36
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
-; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_clause 0x4
; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
-; GFX10-NEXT: v_readlane_b32 s43, v40, 9
-; GFX10-NEXT: v_readlane_b32 s42, v40, 8
-; GFX10-NEXT: v_readlane_b32 s41, v40, 7
-; GFX10-NEXT: v_readlane_b32 s40, v40, 6
-; GFX10-NEXT: v_readlane_b32 s39, v40, 5
-; GFX10-NEXT: v_readlane_b32 s38, v40, 4
-; GFX10-NEXT: v_readlane_b32 s37, v40, 3
-; GFX10-NEXT: v_readlane_b32 s36, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
-; GFX10-NEXT: v_readlane_b32 s33, v40, 10
+; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -372,7 +300,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_writelane_b32 v40, s33, 10
+; GFX11-NEXT: v_writelane_b32 v40, s33, 2
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16
@@ -380,56 +308,32 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8
; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4
; GFX11-NEXT: scratch_store_b32 off, v45, s33
+; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_add_i32 s32, s32, 32
+; GFX11-NEXT: s_getpc_b64 s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: v_mov_b32_e32 v45, v12
-; GFX11-NEXT: v_writelane_b32 v40, s36, 2
-; GFX11-NEXT: s_mov_b32 s36, 0
-; GFX11-NEXT: v_writelane_b32 v40, s37, 3
-; GFX11-NEXT: s_mov_b32 s37, s36
-; GFX11-NEXT: v_writelane_b32 v40, s38, 4
-; GFX11-NEXT: s_mov_b32 s38, s36
-; GFX11-NEXT: v_writelane_b32 v40, s39, 5
-; GFX11-NEXT: s_mov_b32 s39, s36
-; GFX11-NEXT: v_writelane_b32 v40, s40, 6
-; GFX11-NEXT: s_mov_b32 s40, s36
-; GFX11-NEXT: v_writelane_b32 v40, s41, 7
-; GFX11-NEXT: s_mov_b32 s41, s36
-; GFX11-NEXT: v_writelane_b32 v40, s42, 8
-; GFX11-NEXT: s_mov_b32 s42, s36
-; GFX11-NEXT: v_writelane_b32 v40, s43, 9
-; GFX11-NEXT: s_mov_b32 s43, s36
-; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_load_b32 v45, off, s33
; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4
; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8
; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16
-; GFX11-NEXT: v_readlane_b32 s43, v40, 9
-; GFX11-NEXT: v_readlane_b32 s42, v40, 8
-; GFX11-NEXT: v_readlane_b32 s41, v40, 7
-; GFX11-NEXT: v_readlane_b32 s40, v40, 6
-; GFX11-NEXT: v_readlane_b32 s39, v40, 5
-; GFX11-NEXT: v_readlane_b32 s38, v40, 4
-; GFX11-NEXT: v_readlane_b32 s37, v40, 3
-; GFX11-NEXT: v_readlane_b32 s36, v40, 2
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
-; GFX11-NEXT: v_readlane_b32 s33, v40, 10
+; GFX11-NEXT: v_readlane_b32 s33, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index dc85462631d4..16c30174657a 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1833,87 +1833,54 @@ main_body:
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-LABEL: test_loop_vcc:
; GFX9-W64: ; %bb.0: ; %entry
-; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec
+; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9]
-; GFX9-W64-NEXT: s_mov_b32 s0, 0
-; GFX9-W64-NEXT: s_mov_b32 s1, s0
-; GFX9-W64-NEXT: s_mov_b32 s2, s0
-; GFX9-W64-NEXT: s_mov_b32 s3, s0
-; GFX9-W64-NEXT: s_mov_b32 s4, s0
-; GFX9-W64-NEXT: s_mov_b32 s5, s0
-; GFX9-W64-NEXT: s_mov_b32 s6, s0
-; GFX9-W64-NEXT: s_mov_b32 s7, s0
+; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-W64-NEXT: s_mov_b32 s10, 0x40e00000
+; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
; GFX9-W64-NEXT: s_branch .LBB31_2
; GFX9-W64-NEXT: .LBB31_1: ; %body
; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1
-; GFX9-W64-NEXT: s_mov_b32 s1, s0
-; GFX9-W64-NEXT: s_mov_b32 s2, s0
-; GFX9-W64-NEXT: s_mov_b32 s3, s0
-; GFX9-W64-NEXT: s_mov_b32 s4, s0
-; GFX9-W64-NEXT: s_mov_b32 s5, s0
-; GFX9-W64-NEXT: s_mov_b32 s6, s0
-; GFX9-W64-NEXT: s_mov_b32 s7, s0
; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
-; GFX9-W64-NEXT: s_mov_b64 s[2:3], 0
; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4
; GFX9-W64-NEXT: .LBB31_2: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s10, v8
+; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1
; GFX9-W64-NEXT: ; %bb.3:
-; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1
; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-W64-NEXT: ; implicit-def: $vgpr8
; GFX9-W64-NEXT: .LBB31_4: ; %break
-; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_loop_vcc:
; GFX10-W32: ; %bb.0: ; %entry
-; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-W32-NEXT: s_mov_b32 s0, 0
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_branch .LBB31_2
; GFX10-W32-NEXT: .p2align 6
; GFX10-W32-NEXT: .LBB31_1: ; %body
; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
-; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_mov_b32 s1, 0
+; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4
; GFX10-W32-NEXT: .LBB31_2: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1925,11 +1892,10 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1
; GFX10-W32-NEXT: ; %bb.3:
-; GFX10-W32-NEXT: s_mov_b32 s1, -1
; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-W32-NEXT: ; implicit-def: $vgpr8
; GFX10-W32-NEXT: .LBB31_4: ; %break
-; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
@@ -1999,14 +1965,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
-; GFX9-W64-NEXT: s_mov_b32 s0, 0
-; GFX9-W64-NEXT: s_mov_b32 s1, s0
-; GFX9-W64-NEXT: s_mov_b32 s2, s0
-; GFX9-W64-NEXT: s_mov_b32 s3, s0
-; GFX9-W64-NEXT: s_mov_b32 s4, s0
-; GFX9-W64-NEXT: s_mov_b32 s5, s0
-; GFX9-W64-NEXT: s_mov_b32 s6, s0
-; GFX9-W64-NEXT: s_mov_b32 s7, s0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
@@ -2035,14 +1993,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
-; GFX10-W32-NEXT: s_mov_b32 s0, 0
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
@@ -2079,18 +2029,10 @@ entry:
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
; GFX9-W64-LABEL: test_nonvoid_return:
; GFX9-W64: ; %bb.0:
-; GFX9-W64-NEXT: s_mov_b32 s0, 0
-; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec
-; GFX9-W64-NEXT: s_mov_b32 s1, s0
-; GFX9-W64-NEXT: s_mov_b32 s2, s0
-; GFX9-W64-NEXT: s_mov_b32 s3, s0
-; GFX9-W64-NEXT: s_mov_b32 s4, s0
-; GFX9-W64-NEXT: s_mov_b32 s5, s0
-; GFX9-W64-NEXT: s_mov_b32 s6, s0
-; GFX9-W64-NEXT: s_mov_b32 s7, s0
+; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
-; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
@@ -2098,18 +2040,10 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
;
; GFX10-W32-LABEL: test_nonvoid_return:
; GFX10-W32: ; %bb.0:
-; GFX10-W32-NEXT: s_mov_b32 s0, 0
-; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
+; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
@@ -2128,20 +2062,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
; GFX9-W64: ; %bb.0: ; %entry
-; GFX9-W64-NEXT: s_mov_b32 s4, 0
-; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-W64-NEXT: s_mov_b32 s5, s4
-; GFX9-W64-NEXT: s_mov_b32 s6, s4
-; GFX9-W64-NEXT: s_mov_b32 s7, s4
-; GFX9-W64-NEXT: s_mov_b32 s8, s4
-; GFX9-W64-NEXT: s_mov_b32 s9, s4
-; GFX9-W64-NEXT: s_mov_b32 s10, s4
-; GFX9-W64-NEXT: s_mov_b32 s11, s4
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
-; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1
-; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
+; GFX9-W64-NEXT: s_and_b64 exec, exec, exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2
; GFX9-W64-NEXT: ; %bb.1: ; %else
@@ -2155,20 +2080,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) noun
;
; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
; GFX10-W32: ; %bb.0: ; %entry
-; GFX10-W32-NEXT: s_mov_b32 s4, 0
-; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s5, s4
-; GFX10-W32-NEXT: s_mov_b32 s6, s4
-; GFX10-W32-NEXT: s_mov_b32 s7, s4
-; GFX10-W32-NEXT: s_mov_b32 s8, s4
-; GFX10-W32-NEXT: s_mov_b32 s9, s4
-; GFX10-W32-NEXT: s_mov_b32 s10, s4
-; GFX10-W32-NEXT: s_mov_b32 s11, s4
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2
; GFX10-W32-NEXT: ; %bb.1: ; %else
@@ -2215,33 +2131,17 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2
; GFX9-W64-NEXT: ; %bb.1: ; %else
-; GFX9-W64-NEXT: s_mov_b32 s4, 0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-W64-NEXT: s_mov_b32 s5, s4
-; GFX9-W64-NEXT: s_mov_b32 s6, s4
-; GFX9-W64-NEXT: s_mov_b32 s7, s4
-; GFX9-W64-NEXT: s_mov_b32 s8, s4
-; GFX9-W64-NEXT: s_mov_b32 s9, s4
-; GFX9-W64-NEXT: s_mov_b32 s10, s4
-; GFX9-W64-NEXT: s_mov_b32 s11, s4
-; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3
; GFX9-W64-NEXT: s_branch .LBB35_4
; GFX9-W64-NEXT: .LBB35_2:
; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX9-W64-NEXT: .LBB35_3: ; %if
-; GFX9-W64-NEXT: s_mov_b32 s4, 0
-; GFX9-W64-NEXT: s_mov_b32 s5, s4
-; GFX9-W64-NEXT: s_mov_b32 s6, s4
-; GFX9-W64-NEXT: s_mov_b32 s7, s4
-; GFX9-W64-NEXT: s_mov_b32 s8, s4
-; GFX9-W64-NEXT: s_mov_b32 s9, s4
-; GFX9-W64-NEXT: s_mov_b32 s10, s4
-; GFX9-W64-NEXT: s_mov_b32 s11, s4
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: .LBB35_4: ; %end
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0
@@ -2252,21 +2152,13 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
; GFX10-W32-LABEL: test_scc:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2
; GFX10-W32-NEXT: ; %bb.1: ; %else
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-W32-NEXT: s_mov_b32 s0, 0
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3
; GFX10-W32-NEXT: s_branch .LBB35_4
@@ -2275,17 +2167,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
; GFX10-W32-NEXT: .LBB35_3: ; %if
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_mov_b32 s0, 0
-; GFX10-W32-NEXT: s_mov_b32 s1, s0
-; GFX10-W32-NEXT: s_mov_b32 s2, s0
-; GFX10-W32-NEXT: s_mov_b32 s3, s0
-; GFX10-W32-NEXT: s_mov_b32 s4, s0
-; GFX10-W32-NEXT: s_mov_b32 s5, s0
-; GFX10-W32-NEXT: s_mov_b32 s6, s0
-; GFX10-W32-NEXT: s_mov_b32 s7, s0
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: .LBB35_4: ; %end
-; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0
; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)