summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jay Foad <jay.foad@amd.com> 2021-08-05 14:32:25 +0100
committer Jay Foad <jay.foad@amd.com> 2021-08-05 15:57:40 +0100
commit 2b63933115f75f4b06c317a77fdef0e4e5656976 (patch)
tree 2cb14003480e85848dc1b6b5eb7fd57c70335e3f
parent e6c364a62456409f22470e163ffd7f68772d4198 (diff)
download llvm-2b63933115f75f4b06c317a77fdef0e4e5656976.tar.gz
[AMDGPU][SDag] Better lowering for 32-bit ctlz/cttz
Differential Revision: https://reviews.llvm.org/D107566
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp 12
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/ctlz.ll 197
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cttz.ll 176
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll 25
-rw-r--r-- llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll 224
6 files changed, 273 insertions, 369 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5e30e1c1341b..e8a46e050974 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2386,8 +2386,16 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
if (Src.getValueType() == MVT::i32) {
- assert(ZeroUndef);
- return DAG.getNode(NewOpc, SL, MVT::i32, Src);
+ // (ctlz src) -> (umin (ffbh src), 32)
+ // (cttz src) -> (umin (ffbl src), 32)
+ // (ctlz_zero_undef src) -> (ffbh src)
+ // (cttz_zero_undef src) -> (ffbl src)
+ SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
+ if (!ZeroUndef) {
+ const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+ }
+ return NewOpr;
}
SDValue Lo, Hi;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c3d9ea4381c2..eba19106d4f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -465,11 +465,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- if (Subtarget->hasFFBH())
+ if (Subtarget->hasFFBH()) {
+ setOperationAction(ISD::CTLZ, MVT::i32, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
- if (Subtarget->hasFFBL())
+ if (Subtarget->hasFFBL()) {
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index c51e12500ff5..1b1fd1b5a630 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_flbit_i32_b32 s5, s4
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_min_u32 s4, s2, 32
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -41,9 +40,8 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s1, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_cselect_b32 s0, s1, 32
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: s_min_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -68,8 +66,7 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_flbit_i32_b32 s0, s4
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s0, s0, 32
+; GFX10-NEXT: s_min_u32 s0, s0, 32
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
@@ -98,17 +95,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -126,9 +122,8 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -157,14 +152,13 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i32:
@@ -203,12 +197,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v2, v1
-; SI-NEXT: v_ffbh_u32_e32 v3, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v1, 32, v1
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -226,12 +218,10 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
-; VI-NEXT: v_ffbh_u32_e32 v3, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v1, 32, v1
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -263,17 +253,15 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_ffbh_u32_e32 v3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_v2i32:
@@ -315,18 +303,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v4, v3
-; SI-NEXT: v_ffbh_u32_e32 v5, v2
-; SI-NEXT: v_ffbh_u32_e32 v6, v1
-; SI-NEXT: v_ffbh_u32_e32 v7, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
+; SI-NEXT: v_ffbh_u32_e32 v3, v3
+; SI-NEXT: v_ffbh_u32_e32 v2, v2
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v3, 32, v3
+; SI-NEXT: v_min_u32_e32 v2, 32, v2
+; SI-NEXT: v_min_u32_e32 v1, 32, v1
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -344,18 +328,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v4, v3
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
-; VI-NEXT: v_ffbh_u32_e32 v5, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
-; VI-NEXT: v_ffbh_u32_e32 v6, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
-; VI-NEXT: v_ffbh_u32_e32 v7, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
+; VI-NEXT: v_ffbh_u32_e32 v3, v3
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v3, 32, v3
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
+; VI-NEXT: v_min_u32_e32 v1, 32, v1
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -397,18 +377,14 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_ffbh_u32_e32 v6, v2
-; GFX10-NEXT: v_ffbh_u32_e32 v7, v1
-; GFX10-NEXT: v_ffbh_u32_e32 v8, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
+; GFX10-NEXT: v_ffbh_u32_e32 v3, v3
+; GFX10-NEXT: v_ffbh_u32_e32 v2, v2
+; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -455,9 +431,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -474,9 +449,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
; VI-NEXT: v_add_u16_e32 v0, -8, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -520,9 +494,8 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
+; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
@@ -1152,9 +1125,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1174,9 +1146,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1211,13 +1182,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@@ -1263,9 +1233,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1285,9 +1254,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1322,13 +1290,12 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@@ -1493,11 +1460,11 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, 32, v1, s[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_mov_b32_e32 v1, 0xffff
-; VI-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[0:1]
+; VI-NEXT: v_min_u32_e32 v1, 32, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0xffff
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1538,9 +1505,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v1, vcc_lo
+; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 7172260f36eb..54f7e238b230 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -22,15 +22,14 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; SI-LABEL: s_cttz_i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_ff1_i32_b32 s5, s4
+; SI-NEXT: s_ff1_i32_b32 s2, s2
+; SI-NEXT: s_min_u32 s4, s2, 32
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -41,9 +40,8 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s1, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_cselect_b32 s0, s1, 32
+; VI-NEXT: s_ff1_i32_b32 s0, s0
+; VI-NEXT: s_min_u32 s0, s0, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -68,8 +66,7 @@ define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ff1_i32_b32 s0, s4
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s0, s0, 32
+; GFX10-NEXT: s_min_u32 s0, s0, 32
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
@@ -98,17 +95,16 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbl_b32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -126,9 +122,8 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -157,14 +152,13 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32:
@@ -203,12 +197,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbl_b32_e32 v2, v1
-; SI-NEXT: v_ffbl_b32_e32 v3, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
+; SI-NEXT: v_ffbl_b32_e32 v1, v1
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v1, 32, v1
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -226,12 +218,10 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v2, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
-; VI-NEXT: v_ffbl_b32_e32 v3, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
+; VI-NEXT: v_ffbl_b32_e32 v1, v1
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v1, 32, v1
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -263,17 +253,15 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_ffbl_b32_e32 v3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_v2i32:
@@ -315,18 +303,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbl_b32_e32 v4, v3
-; SI-NEXT: v_ffbl_b32_e32 v5, v2
-; SI-NEXT: v_ffbl_b32_e32 v6, v1
-; SI-NEXT: v_ffbl_b32_e32 v7, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
+; SI-NEXT: v_ffbl_b32_e32 v3, v3
+; SI-NEXT: v_ffbl_b32_e32 v2, v2
+; SI-NEXT: v_ffbl_b32_e32 v1, v1
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v3, 32, v3
+; SI-NEXT: v_min_u32_e32 v2, 32, v2
+; SI-NEXT: v_min_u32_e32 v1, 32, v1
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -344,18 +328,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v4, v3
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc
-; VI-NEXT: v_ffbl_b32_e32 v5, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v5, vcc
-; VI-NEXT: v_ffbl_b32_e32 v6, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
-; VI-NEXT: v_ffbl_b32_e32 v7, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
+; VI-NEXT: v_ffbl_b32_e32 v3, v3
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_ffbl_b32_e32 v1, v1
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v3, 32, v3
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
+; VI-NEXT: v_min_u32_e32 v1, 32, v1
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -397,18 +377,14 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbl_b32_e32 v5, v3
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_ffbl_b32_e32 v6, v2
-; GFX10-NEXT: v_ffbl_b32_e32 v7, v1
-; GFX10-NEXT: v_ffbl_b32_e32 v8, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v3, 32, v5, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v6, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v7, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v8, vcc_lo
+; GFX10-NEXT: v_ffbl_b32_e32 v3, v3
+; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT: v_ffbl_b32_e32 v1, v1
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
+; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1141,9 +1117,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbl_b32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1163,9 +1138,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1200,13 +1174,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@@ -1252,9 +1225,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbl_b32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1274,9 +1246,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1311,13 +1282,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_ffbl_b32_e32 v1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_ffbl_b32_e32 v0, v0
+; GFX10-NEXT: v_min_u32_e32 v0, 32, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
@@ -1482,9 +1452,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT: v_ffbl_b32_e32 v3, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -1526,10 +1495,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT: v_ffbl_b32_e32 v3, v2
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
+; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index d13edccf27c2..36e4773f53e1 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -782,9 +782,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_ffbl_b32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -820,9 +819,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v2, 32, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -1365,9 +1363,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_ffbl_b32_e32 v1, v0
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; SI-NEXT: v_ffbl_b32_e32 v0, v0
+; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1405,9 +1402,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v1, v0
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
+; VI-NEXT: v_ffbl_b32_e32 v0, v0
+; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1605,9 +1601,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT: v_ffbl_b32_e32 v3, v2
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
+; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 7abd17ab49f0..59ef31d06d49 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -9,21 +9,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_flbit_i32_b32 s8, s3
; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_flbit_i32_b32 s0, s3
+; GFX6-NEXT: s_min_u32 s8, s0, 32
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
-; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT: s_sub_i32 s0, 32, s8
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
@@ -33,8 +31,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
-; GFX8-NEXT: s_cselect_b32 s6, s4, 32
+; GFX8-NEXT: s_min_u32 s6, s4, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -67,8 +64,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
+; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -93,8 +89,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v4, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v4, 32, v4, vcc
+; GFX8-NEXT: v_min_u32_e32 v4, 32, v4
; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[1:2]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -122,21 +117,19 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_flbit_i32_b32 s8, s3
; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s3, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v2
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_flbit_i32_b32 s0, s3
+; GFX6-NEXT: s_min_u32 s8, s0, 32
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX6-NEXT: v_or_b32_e32 v0, s1, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
-; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT: s_sub_i32 s0, 32, s8
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
@@ -145,8 +138,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
-; GFX8-NEXT: s_cselect_b32 s6, s4, 32
+; GFX8-NEXT: s_min_u32 s6, s4, 32
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -178,8 +170,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v0, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
+; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -203,8 +194,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v5, 32, v0, vcc
+; GFX8-NEXT: v_min_u32_e32 v5, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[1:2]
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
@@ -236,56 +226,50 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)*
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s9
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT: s_min_u32 s8, s8, 32
+; GFX6-NEXT: s_min_u32 s9, s9, 32
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
+; GFX6-NEXT: s_sub_i32 s10, 32, s8
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
+; GFX6-NEXT: s_sub_i32 s11, 32, s9
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v1
-; GFX6-NEXT: v_ldexp_f32_e32 v1, v0, v5
-; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v4
+; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s10
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s11
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_cmp_lg_u32 s7, 0
-; GFX8-NEXT: s_cselect_b32 s9, s2, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT: s_sub_i32 s9, 32, s9
+; GFX8-NEXT: s_flbit_i32_b32 s6, s3
+; GFX8-NEXT: s_min_u32 s8, s6, 32
+; GFX8-NEXT: s_flbit_i32_b32 s7, s1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX8-NEXT: s_min_u32 s9, s7, 32
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
-; GFX8-NEXT: s_flbit_i32_b32 s8, s5
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
-; GFX8-NEXT: s_cselect_b32 s6, s8, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v1
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s9
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result = uitofp <2 x i64> %in to <2 x float>
@@ -314,14 +298,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
+; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
@@ -374,16 +354,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)*
; GFX8-NEXT: v_ffbh_u32_e32 v12, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v11, v6
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v11, 32, v11, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v13, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX8-NEXT: v_min_u32_e32 v11, 32, v11
+; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6]
@@ -433,26 +409,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_flbit_i32_b32 s8, s7
; GFX6-NEXT: s_flbit_i32_b32 s9, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s7, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 32, v0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s9
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s5, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v4, 32, v0, vcc
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 32, v2
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 32, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX6-NEXT: s_min_u32 s8, s8, 32
+; GFX6-NEXT: s_min_u32 s9, s9, 32
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
+; GFX6-NEXT: s_sub_i32 s10, 32, s8
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
+; GFX6-NEXT: s_sub_i32 s11, 32, s9
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX6-NEXT: v_or_b32_e32 v0, s7, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s5, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v4
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10
+; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s11
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -466,24 +438,22 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)*
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_cmp_lg_u32 s7, 0
-; GFX8-NEXT: s_cselect_b32 s9, s2, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT: s_sub_i32 s9, 32, s9
+; GFX8-NEXT: s_flbit_i32_b32 s3, s5
+; GFX8-NEXT: s_min_u32 s8, s2, 32
+; GFX8-NEXT: s_min_u32 s9, s3, 32
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0
-; GFX8-NEXT: s_flbit_i32_b32 s8, s5
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
-; GFX8-NEXT: s_cselect_b32 s6, s8, 32
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s9
+; GFX8-NEXT: s_sub_i32 s8, 32, s8
+; GFX8-NEXT: s_sub_i32 s2, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s8
; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -518,14 +488,10 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_ffbh_u32_e32 v12, v8
; GFX6-NEXT: v_ffbh_u32_e32 v13, v6
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 32, v9, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX6-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
+; GFX6-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX6-NEXT: v_min_u32_e32 v9, 32, v9
+; GFX6-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX6-NEXT: v_min_u32_e32 v13, 32, v13
; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0
; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0
; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9
@@ -584,16 +550,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)*
; GFX8-NEXT: v_ffbh_u32_e32 v13, v4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v8
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v12, v6
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v13, 32, v13, vcc
; GFX8-NEXT: v_ffbh_u32_e32 v14, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v14, 32, v14, vcc
+; GFX8-NEXT: v_min_u32_e32 v0, 32, v0
+; GFX8-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX8-NEXT: v_min_u32_e32 v13, 32, v13
+; GFX8-NEXT: v_min_u32_e32 v14, 32, v14
; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8]
; GFX8-NEXT: v_sub_u32_e32 v15, vcc, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]