Diffstat (limited to 'test/CodeGen')
187 files changed, 17050 insertions, 5340 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 25c0e78a7b20..4a4c3c58072c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -167,3 +167,70 @@ end: %vec = load <2 x i16*>, <2 x i16*>* undef br label %block } + +; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s96) = G_INSERT %vreg2, %vreg0, 0; (in function: nonpow2_insertvalue_narrowing +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg2<def>(s96) = G_IMPLICIT_DEF; (in function: nonpow2_insertvalue_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_insertvalue_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_insertvalue_narrowing: +%struct96 = type { float, float, float } +define void @nonpow2_insertvalue_narrowing(float %a) { + %dummy = insertvalue %struct96 undef, float %a, 0 + ret void +} + +; FALLBACK-WITH-REPORT-ERR remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_ADD %vreg2, %vreg2; (in function: nonpow2_add_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_add_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_add_narrowing: +define void @nonpow2_add_narrowing() { + %a = add i128 undef, undef + %b = trunc i128 %a to i96 + %dummy = add i96 %b, %b + ret void +} + +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg3<def>(s96) = G_OR %vreg2, %vreg2; (in function: nonpow2_or_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_or_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_or_narrowing: +define void @nonpow2_or_narrowing() { + %a = add i128 undef, undef + %b = trunc i128 %a to i96 + %dummy = or i96 %b, %b + ret void +} + +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_LOAD %vreg1; mem:LD12[undef](align=16) (in function: nonpow2_load_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_load_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_load_narrowing: +define void @nonpow2_load_narrowing() { + %dummy = load i96, i96* undef + ret void +} + +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %vreg3, %vreg0; mem:ST12[%c](align=16) (in function: nonpow2_store_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing: +define void @nonpow2_store_narrowing(i96* %c) { + %a = add i128 undef, undef + %b = trunc i128 %a to i96 + store i96 %b, i96* %c + ret void +} + +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s96) = G_CONSTANT 0; (in function: nonpow2_constant_narrowing +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_constant_narrowing +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_constant_narrowing: +define void @nonpow2_constant_narrowing() { + store i96 0, i96* undef + ret void +} + +; Currently can't handle vector lengths that aren't an exact multiple of +; natively supported vector lengths. Test that the fall-back works for those. 
+; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(<7 x s64>) = G_ADD %vreg0, %vreg0; (in function: nonpow2_vector_add_fewerelements +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(<7 x s64>) = G_IMPLICIT_DEF; (in function: nonpow2_vector_add_fewerelements +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_vector_add_fewerelements +; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_vector_add_fewerelements: +define void @nonpow2_vector_add_fewerelements() { + %dummy = add <7 x i64> undef, undef + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 4042047dfc24..cc158a29c3e1 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -92,6 +92,10 @@ store double %vres, double* %addr ret void } + + define void @fp16Ext32() { ret void } + define void @fp16Ext64() { ret void } + define void @fp32Ext64() { ret void } ... --- @@ -742,3 +746,103 @@ body: | RET_ReallyLR ... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext32 +name: fp16Ext32 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s32) = G_FPEXT %3 +# CHECK-NEXT: %s0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s32) = G_FPEXT %0(s16) + %s0 = COPY %2(s32) + RET_ReallyLR implicit %s0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. +# CHECK-LABEL: name: fp16Ext64 +name: fp16Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 3, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %1:gpr(s32) = COPY %w0 +# CHECK-NEXT: %0:gpr(s16) = G_TRUNC %1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3:fpr(s16) = COPY %0 +# CHECK-NEXT: %2:fpr(s64) = G_FPEXT %3 +# CHECK-NEXT: %d0 = COPY %2 +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.1: + liveins: %w0 + + %1(s32) = COPY %w0 + %0(s16) = G_TRUNC %1(s32) + %2(s64) = G_FPEXT %0(s16) + %d0 = COPY %2(s64) + RET_ReallyLR implicit %d0 + +... + +--- +# Make sure we map FPEXT on FPR register bank. 
+# CHECK-LABEL: name: fp32Ext64 +name: fp32Ext64 +alignment: 2 +legalized: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: fpr, preferred-register: '' } +# CHECK-NEXT: - { id: 2, class: fpr, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +# CHECK: %0:gpr(s32) = COPY %w0 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %2:fpr(s32) = COPY %0 +# CHECK-NEXT: %1:fpr(s64) = G_FPEXT %2 +# CHECK-NEXT: %d0 = COPY %1 +# CHECK-NEXT: RET_ReallyLR +body: | + bb.1: + liveins: %w0 + + %0(s32) = COPY %w0 + %1(s64) = G_FPEXT %0(s32) + %d0 = COPY %1(s64) + RET_ReallyLR implicit %d0 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir index fa6727da1bb1..20449c53a592 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir @@ -8,6 +8,10 @@ entry: ret void } + define void @test_scalar_add_big_nonpow2() { + entry: + ret void + } define void @test_scalar_add_small() { entry: ret void @@ -16,6 +20,10 @@ entry: ret void } + define void @test_vector_add_nonpow2() { + entry: + ret void + } ... --- @@ -58,6 +66,49 @@ body: | ... --- +name: test_scalar_add_big_nonpow2 +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } + - { id: 8, class: _ } + - { id: 9, class: _ } +body: | + bb.0.entry: + liveins: %x0, %x1, %x2, %x3 + ; CHECK-LABEL: name: test_scalar_add_big_nonpow2 + ; CHECK-NOT: G_MERGE_VALUES + ; CHECK-NOT: G_UNMERGE_VALUES + ; CHECK-DAG: [[CARRY0_32:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-DAG: [[CARRY0:%[0-9]+]]:_(s1) = G_TRUNC [[CARRY0_32]] + ; CHECK: [[RES_LO:%[0-9]+]]:_(s64), [[CARRY1:%[0-9]+]]:_(s1) = G_UADDE %0, %1, [[CARRY0]] + ; CHECK: [[RES_MI:%[0-9]+]]:_(s64), [[CARRY2:%[0-9]+]]:_(s1) = G_UADDE %1, %2, [[CARRY1]] + ; CHECK: [[RES_HI:%[0-9]+]]:_(s64), {{%.*}}(s1) = G_UADDE %2, %3, [[CARRY2]] + ; CHECK-NOT: G_MERGE_VALUES + ; CHECK-NOT: G_UNMERGE_VALUES + ; CHECK: %x0 = COPY [[RES_LO]] + ; CHECK: %x1 = COPY [[RES_MI]] + ; CHECK: %x2 = COPY [[RES_HI]] + + %0(s64) = COPY %x0 + %1(s64) = COPY %x1 + %2(s64) = COPY %x2 + %3(s64) = COPY %x3 + %4(s192) = G_MERGE_VALUES %0, %1, %2 + %5(s192) = G_MERGE_VALUES %1, %2, %3 + %6(s192) = G_ADD %4, %5 + %7(s64), %8(s64), %9(s64) = G_UNMERGE_VALUES %6 + %x0 = COPY %7 + %x1 = COPY %8 + %x2 = COPY %9 +... + +--- name: test_scalar_add_small registers: - { id: 0, class: _ } @@ -124,3 +175,43 @@ body: | %q0 = COPY %7 %q1 = COPY %8 ... 
+--- +name: test_vector_add_nonpow2 +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } + - { id: 8, class: _ } + - { id: 9, class: _ } +body: | + bb.0.entry: + liveins: %q0, %q1, %q2, %q3 + ; CHECK-LABEL: name: test_vector_add_nonpow2 + ; CHECK-NOT: G_EXTRACT + ; CHECK-NOT: G_SEQUENCE + ; CHECK: [[RES_LO:%[0-9]+]]:_(<2 x s64>) = G_ADD %0, %1 + ; CHECK: [[RES_MI:%[0-9]+]]:_(<2 x s64>) = G_ADD %1, %2 + ; CHECK: [[RES_HI:%[0-9]+]]:_(<2 x s64>) = G_ADD %2, %3 + ; CHECK-NOT: G_EXTRACT + ; CHECK-NOT: G_SEQUENCE + ; CHECK: %q0 = COPY [[RES_LO]] + ; CHECK: %q1 = COPY [[RES_MI]] + ; CHECK: %q2 = COPY [[RES_HI]] + + %0(<2 x s64>) = COPY %q0 + %1(<2 x s64>) = COPY %q1 + %2(<2 x s64>) = COPY %q2 + %3(<2 x s64>) = COPY %q3 + %4(<6 x s64>) = G_MERGE_VALUES %0, %1, %2 + %5(<6 x s64>) = G_MERGE_VALUES %1, %2, %3 + %6(<6 x s64>) = G_ADD %4, %5 + %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>) = G_UNMERGE_VALUES %6 + %q0 = COPY %7 + %q1 = COPY %8 + %q2 = COPY %9 +... diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir index 7432b6761b73..405e6b546633 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir @@ -9,6 +9,7 @@ define void @test_inserts_4() { ret void } define void @test_inserts_5() { ret void } define void @test_inserts_6() { ret void } + define void @test_inserts_nonpow2() { ret void } ... --- @@ -141,3 +142,21 @@ body: | %4:_(s128) = G_INSERT %3, %2, 32 RET_ReallyLR ... + +--- +name: test_inserts_nonpow2 +body: | + bb.0: + liveins: %x0, %x1, %x2 + + + ; CHECK-LABEL: name: test_inserts_nonpow2 + ; CHECK: %5:_(s192) = G_MERGE_VALUES %3(s64), %1(s64), %2(s64) + %0:_(s64) = COPY %x0 + %1:_(s64) = COPY %x1 + %2:_(s64) = COPY %x2 + %3:_(s64) = COPY %x3 + %4:_(s192) = G_MERGE_VALUES %0, %1, %2 + %5:_(s192) = G_INSERT %4, %3, 0 + RET_ReallyLR +... 
diff --git a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir index c7b7ec9b6fe6..33b483511065 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir @@ -15,11 +15,11 @@ body: | %1:gpr(s64) = G_IMPLICIT_DEF ; CHECK: body: - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15 + ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 ; CHECK: %2:gpr64 = BFMXri %1, [[TMP]], 0, 31 %2:gpr(s64) = G_INSERT %1, %0, 0 - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, 15 + ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 ; CHECK: %3:gpr64 = BFMXri %1, [[TMP]], 51, 31 %3:gpr(s64) = G_INSERT %1, %0, 13 diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir index 2c2e475a87a8..bd75c4e661ea 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir @@ -33,7 +33,7 @@ body: | ; CHECK-LABEL: name: anyext_s64_from_s32 ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %w0 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], 15 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 ; CHECK: [[COPY1:%[0-9]+]]:gpr64all = COPY [[SUBREG_TO_REG]] ; CHECK: %x0 = COPY [[COPY1]] %0(s32) = COPY %w0 @@ -80,7 +80,7 @@ body: | ; CHECK-LABEL: name: zext_s64_from_s32 ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[SUBREG_TO_REG]], 0, 31 ; CHECK: %x0 = COPY [[UBFMXri]] %0(s32) = COPY %w0 @@ -177,7 +177,7 @@ body: | ; CHECK-LABEL: name: sext_s64_from_s32 ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], 15 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 ; CHECK: [[SBFMXri:%[0-9]+]]:gpr64 = SBFMXri [[SUBREG_TO_REG]], 0, 31 ; CHECK: %x0 = COPY [[SBFMXri]] %0(s32) = COPY %w0 diff --git a/test/CodeGen/AArch64/dwarf-cfi.ll b/test/CodeGen/AArch64/dwarf-cfi.ll new file mode 100644 index 000000000000..a75bcd19c69c --- /dev/null +++ b/test/CodeGen/AArch64/dwarf-cfi.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple aarch64-windows-gnu -filetype=asm -o - %s | FileCheck %s + +define void @_Z1gv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + invoke void @_Z1fv() + to label %try.cont unwind label %lpad + +lpad: + %0 = landingpad { i8*, i32 } + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: + ret void +} + +declare void @_Z1fv() + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; CHECK-LABEL: _Z1gv: +; CHECK: .cfi_startproc +; CHECK: .cfi_personality 0, __gxx_personality_v0 +; CHECK: .cfi_lsda 0, .Lexception0 +; CHECK: str x30, [sp, #-16]! 
+; CHECK: .cfi_def_cfa_offset 16 +; CHECK: .cfi_offset w30, -16 +; CHECK: ldr x30, [sp], #16 +; CHECK: .cfi_endproc diff --git a/test/CodeGen/AArch64/recp-fastmath.ll b/test/CodeGen/AArch64/recp-fastmath.ll index 38e0fb360e49..4776931cf062 100644 --- a/test/CodeGen/AArch64/recp-fastmath.ll +++ b/test/CodeGen/AArch64/recp-fastmath.ll @@ -18,6 +18,8 @@ define float @frecp1(float %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:s[0-7]]] ; CHECK-NEXT: frecps {{s[0-7](, s[0-7])?}}, [[R]] +; CHECK: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frecps {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} } define <2 x float> @f2recp0(<2 x float> %x) #0 { @@ -38,6 +40,8 @@ define <2 x float> @f2recp1(<2 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2s]] ; CHECK-NEXT: frecps {{v[0-7]\.2s(, v[0-7].2s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frecps {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} } define <4 x float> @f4recp0(<4 x float> %x) #0 { @@ -58,6 +62,8 @@ define <4 x float> @f4recp1(<4 x float> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] ; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define <8 x float> @f8recp0(<8 x float> %x) #0 { @@ -77,10 +83,12 @@ define <8 x float> @f8recp1(<8 x float> %x) #1 { ; CHECK-LABEL: f8recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.4s]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.4s]] -; CHECK-NEXT: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.4s]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.4s(, v[0-7].4s)?}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frecps {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} } define double @drecp0(double %x) #0 { @@ -101,6 +109,9 @@ define double @drecp1(double %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:d[0-7]]] ; CHECK-NEXT: frecps {{d[0-7](, d[0-7])?}}, [[R]] +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frecps {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} } define <2 x double> @d2recp0(<2 x double> %x) #0 { @@ -121,6 +132,9 @@ define <2 x double> @d2recp1(<2 x double> %x) #1 { ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] ; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } define <4 x double> @d4recp0(<4 x double> %x) #0 { @@ -140,10 +154,14 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 { ; CHECK-LABEL: d4recp1: ; CHECK-NEXT: BB#0 -; CHECK-NEXT: frecpe [[RA:v[0-7]\.2d]] -; CHECK-NEXT: frecpe [[RB:v[0-7]\.2d]] -; CHECK-NEXT: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RA]] -; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[RB]] +; CHECK-NEXT: frecpe [[R:v[0-7]\.2d]] +; CHECK: frecps {{v[0-7]\.2d(, v[0-7].2d)?}}, [[R]] +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: 
frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} } attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AArch64/sqrt-fastmath.ll b/test/CodeGen/AArch64/sqrt-fastmath.ll index 079562c05819..4dd0516faf0c 100644 --- a/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -22,7 +22,9 @@ define float @fsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] -; CHECK: fcmp s0, #0 +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK: fcmp {{s[0-7]}}, #0 } define <2 x float> @f2sqrt(<2 x float> %a) #0 { @@ -38,7 +40,9 @@ define <2 x float> @f2sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4sqrt(<4 x float> %a) #0 { @@ -54,7 +58,9 @@ define <4 x float> @f4sqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8sqrt(<8 x float> %a) #0 { @@ -69,9 +75,16 @@ define <8 x float> @f8sqrt(<8 x float> %a) #0 { ; CHECK-LABEL: f8sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] -; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.4s, v[0-1]\.4s}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.4s]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.4s]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @dsqrt(double %a) #0 { @@ -87,7 +100,10 @@ define double @dsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] -; CHECK: fcmp d0, #0 +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: fcmp {{d[0-7]}}, #0 } define <2 x double> @d2sqrt(<2 x double> %a) #0 { @@ -103,7 +119,10 @@ define <2 x double> @d2sqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, 
{{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4sqrt(<4 x double> %a) #0 { @@ -118,9 +137,19 @@ define <4 x double> @d4sqrt(<4 x double> %a) #0 { ; CHECK-LABEL: d4sqrt: ; CHECK-NEXT: BB#0 ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] -; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] -; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK: fcmeq {{v[0-7]\.2d, v[0-1]\.2d}}, #0 +; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 +; CHECK: frsqrte [[RC:v[0-7]\.2d]] +; CHECK-NEXT: fmul [[RD:v[0-7]\.2d]], [[RC]], [[RC]] +; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RD]] +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define float @frsqrt(float %a) #0 { @@ -137,6 +166,8 @@ define float @frsqrt(float %a) #0 { ; CHECK-NEXT: frsqrte [[RA:s[0-7]]] ; CHECK-NEXT: fmul [[RB:s[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{s[0-7](, s[0-7])?}}, [[RB]] +; CHECK: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} +; CHECK-NOT: frsqrts {{s[0-7]}}, {{s[0-7]}}, {{s[0-7]}} ; CHECK-NOT: fcmp {{s[0-7]}}, #0 } @@ -154,7 +185,9 @@ define <2 x float> @f2rsqrt(<2 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2s(, v[0-7]\.2s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2s, v0\.2s}}, #0 +; CHECK: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: frsqrts {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, {{v[0-7]\.2s}} +; CHECK-NOT: fcmeq {{v[0-7]\.2s}}, {{v[0-7]\.2s}}, #0 } define <4 x float> @f4rsqrt(<4 x float> %a) #0 { @@ -171,7 +204,9 @@ define <4 x float> @f4rsqrt(<4 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define <8 x float> @f8rsqrt(<8 x float> %a) #0 { @@ -189,7 +224,11 @@ define <8 x float> @f8rsqrt(<8 x float> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.4s]] ; CHECK: fmul [[RB:v[0-7]\.4s]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.4s(, v[0-7]\.4s)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.4s, v0\.4s}}, #0 +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: frsqrts {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, {{v[0-7]\.4s}} +; CHECK-NOT: fcmeq {{v[0-7]\.4s}}, {{v[0-7]\.4s}}, #0 } define double @drsqrt(double %a) #0 { @@ -206,6 +245,9 @@ define double @drsqrt(double %a) #0 { ; CHECK-NEXT: frsqrte [[RA:d[0-7]]] ; CHECK-NEXT: fmul [[RB:d[0-7]]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{d[0-7](, d[0-7])?}}, [[RB]] +; CHECK: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} +; CHECK: frsqrts {{d[0-7]}}, 
{{d[0-7]}}, {{d[0-7]}} +; CHECK-NOT: frsqrts {{d[0-7]}}, {{d[0-7]}}, {{d[0-7]}} ; CHECK-NOT: fcmp d0, #0 } @@ -223,7 +265,10 @@ define <2 x double> @d2rsqrt(<2 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK-NEXT: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK-NEXT: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } define <4 x double> @d4rsqrt(<4 x double> %a) #0 { @@ -241,7 +286,13 @@ define <4 x double> @d4rsqrt(<4 x double> %a) #0 { ; CHECK-NEXT: frsqrte [[RA:v[0-7]\.2d]] ; CHECK: fmul [[RB:v[0-7]\.2d]], [[RA]], [[RA]] ; CHECK: frsqrts {{v[0-7]\.2d(, v[0-7]\.2d)?}}, [[RB]] -; CHECK-NOT: fcmeq {{v[0-7]\.2d, v0\.2d}}, #0 +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: frsqrts {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}} +; CHECK-NOT: fcmeq {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, #0 } attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir index 4c05383615a6..70e2b5e4ae2b 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -44,28 +44,28 @@ regBankSelected: true # Max immediate for CI # SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292 # SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 3 -# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1 # SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0 # SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0 # SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] # SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1 # SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1 # SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] -# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1 # SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0 # Immediate overflow for CI # GCN: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0 # GCN: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 4 -# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1 # GCN-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0 # GCN-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0 # GCN: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] # GCN-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1 # GCN-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1 # GCN: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] -# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# GCN: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1 # GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 # Max 32-bit byte offset @@ -76,14 +76,14 @@ regBankSelected: true # Overflow 32-bit byte offset # SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0 # SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 1 -# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2 +# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1 # SIVI-DAG: [[K_SUB0:%[0-9]+]]:sgpr_32 = COPY [[K]].sub0 # SIVI-DAG: [[PTR_LO:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub0 # SIVI: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]] # SIVI-DAG: [[K_SUB1:%[0-9]+]]:sgpr_32 = COPY [[K]].sub1 # SIVI-DAG: [[PTR_HI:%[0-9]+]]:sgpr_32 = COPY [[PTR]].sub1 # SIVI: [[ADD_PTR_HI:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]] -# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2 +# SIVI: [[ADD_PTR:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[ADD_PTR_LO]], %subreg.sub0, [[ADD_PTR_HI]], %subreg.sub1 # SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0 diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir index b2f5e816b263..12460d25f3b2 100644 --- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -6,7 +6,7 @@ # CHECK: S_NOP 0, implicit-def %0 # CHECK: S_NOP 0, implicit-def %1 # CHECK: S_NOP 0, implicit-def dead %2 -# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}} +# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub3 # CHECK: S_NOP 0, implicit %3.sub0 # CHECK: S_NOP 0, implicit %3.sub1 # CHECK: S_NOP 0, implicit undef %3.sub2 @@ -42,9 +42,9 @@ body: | # Check defined lanes transfer; Includes checking for some special cases like # undef operands or IMPLICIT_DEF definitions. 
# CHECK-LABEL: name: test1 -# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, {{[0-9]+}}, %sgpr0, {{[0-9]+}} -# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, {{[0-9]+}} -# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, {{[0-9]+}} +# CHECK: %0:sreg_128 = REG_SEQUENCE %sgpr0, %subreg.sub0, %sgpr0, %subreg.sub2 +# CHECK: %1:sreg_128 = INSERT_SUBREG %0, %sgpr1, %subreg.sub3 +# CHECK: %2:sreg_64 = INSERT_SUBREG %0.sub2_sub3, %sgpr42, %subreg.sub0 # CHECK: S_NOP 0, implicit %1.sub0 # CHECK: S_NOP 0, implicit undef %1.sub1 # CHECK: S_NOP 0, implicit %1.sub2 @@ -53,24 +53,24 @@ body: | # CHECK: S_NOP 0, implicit undef %2.sub1 # CHECK: %3:sreg_32_xm0 = IMPLICIT_DEF -# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, {{[0-9]+}} +# CHECK: %4:sreg_128 = INSERT_SUBREG %0, undef %3, %subreg.sub0 # CHECK: S_NOP 0, implicit undef %4.sub0 # CHECK: S_NOP 0, implicit undef %4.sub1 # CHECK: S_NOP 0, implicit %4.sub2 # CHECK: S_NOP 0, implicit undef %4.sub3 -# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, {{[0-9]+}} -# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}} -# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, {{[0-9]+}} +# CHECK: %5:sreg_64 = EXTRACT_SUBREG %0, %subreg.sub0_sub1 +# CHECK: %6:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub0 +# CHECK: %7:sreg_32_xm0 = EXTRACT_SUBREG %5, %subreg.sub1 # CHECK: S_NOP 0, implicit %5 # CHECK: S_NOP 0, implicit %6 # CHECK: S_NOP 0, implicit undef %7 # CHECK: %8:sreg_64 = IMPLICIT_DEF -# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, {{[0-9]+}} +# CHECK: %9:sreg_32_xm0 = EXTRACT_SUBREG undef %8, %subreg.sub1 # CHECK: S_NOP 0, implicit undef %9 -# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, {{[0-9]+}} +# CHECK: %10:sreg_128 = EXTRACT_SUBREG undef %0, %subreg.sub2_sub3 # CHECK: S_NOP 0, implicit undef %10 name: test1 registers: @@ -125,29 +125,29 @@ body: | # CHECK: S_NOP 0, implicit-def dead %0 # CHECK: S_NOP 0, implicit-def %1 # CHECK: S_NOP 0, implicit-def %2 -# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, {{[0-9]+}}, %1, {{[0-9]+}}, %2, {{[0-9]+}} +# CHECK: %3:sreg_128 = REG_SEQUENCE undef %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2_sub3 # CHECK: S_NOP 0, implicit %3.sub1 # CHECK: S_NOP 0, implicit %3.sub3 # CHECK: S_NOP 0, implicit-def %4 # CHECK: S_NOP 0, implicit-def dead %5 -# CHECK: %6:sreg_64 = REG_SEQUENCE %4, {{[0-9]+}}, undef %5, {{[0-9]+}} +# CHECK: %6:sreg_64 = REG_SEQUENCE %4, %subreg.sub0, undef %5, %subreg.sub1 # CHECK: S_NOP 0, implicit %6 # CHECK: S_NOP 0, implicit-def dead %7 # CHECK: S_NOP 0, implicit-def %8 -# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, {{[0-9]+}} +# CHECK: %9:sreg_128 = INSERT_SUBREG undef %7, %8, %subreg.sub2_sub3 # CHECK: S_NOP 0, implicit %9.sub2 # CHECK: S_NOP 0, implicit-def %10 # CHECK: S_NOP 0, implicit-def dead %11 -# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, {{[0-9]+}} +# CHECK: %12:sreg_128 = INSERT_SUBREG %10, undef %11, %subreg.sub0_sub1 # CHECK: S_NOP 0, implicit %12.sub3 # CHECK: S_NOP 0, implicit-def %13 # CHECK: S_NOP 0, implicit-def dead %14 -# CHECK: %15:sreg_128 = REG_SEQUENCE %13, {{[0-9]+}}, undef %14, {{[0-9]+}} -# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, {{[0-9]+}} +# CHECK: %15:sreg_128 = REG_SEQUENCE %13, %subreg.sub0_sub1, undef %14, %subreg.sub2_sub3 +# CHECK: %16:sreg_64 = EXTRACT_SUBREG %15, %subreg.sub0_sub1 # CHECK: S_NOP 0, implicit %16.sub1 name: test2 @@ -245,7 +245,7 @@ body: | # used. 
# CHECK-LABEL: name: test5 # CHECK: S_NOP 0, implicit-def %0 -# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, {{[0-9]+}}, %0, {{[0-9]+}} +# CHECK: %1:sreg_64 = REG_SEQUENCE undef %0, %subreg.sub0, %0, %subreg.sub1 # CHECK: S_NOP 0, implicit %1.sub1 name: test5 tracksRegLiveness: true @@ -265,7 +265,7 @@ body: | # CHECK: S_NOP 0, implicit-def %0 # CHECK: S_NOP 0, implicit-def dead %1 # CHECK: S_NOP 0, implicit-def dead %2 -# CHECK: %3:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, undef %1, {{[0-9]+}}, undef %2, {{[0-9]+}} +# CHECK: %3:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, undef %1, %subreg.sub1, undef %2, %subreg.sub2 # CHECK: bb.1: # CHECK: %4:sreg_128 = PHI %3, %bb.0, %5, %bb.1 @@ -315,12 +315,12 @@ body: | # CHECK: S_NOP 0, implicit-def %1 # CHECK: S_NOP 0, implicit-def dead %2 # CHECK: S_NOP 0, implicit-def %3 -# CHECK: %4:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}}, %1, {{[0-9]+}}, undef %2, {{[0-9]+}}, %3, {{[0-9]+}} +# CHECK: %4:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, undef %2, %subreg.sub2, %3, %subreg.sub3 # CHECK: bb.1: # CHECK: %5:sreg_128 = PHI %4, %bb.0, %6, %bb.1 -# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, {{[0-9]+}}, %5.sub3, {{[0-9]+}}, undef %5.sub2, {{[0-9]+}}, %5.sub0, {{[0-9]+}} +# CHECK: %6:sreg_128 = REG_SEQUENCE %5.sub1, %subreg.sub0, %5.sub3, %subreg.sub1, undef %5.sub2, %subreg.sub2, %5.sub0, %subreg.sub3 # CHECK: bb.2: # CHECK: S_NOP 0, implicit %6.sub3 @@ -361,12 +361,12 @@ body: | # CHECK-LABEL: name: loop2 # CHECK: bb.0: # CHECK: S_NOP 0, implicit-def %0 -# CHECK: %1:sreg_128 = REG_SEQUENCE %0, {{[0-9]+}} +# CHECK: %1:sreg_128 = REG_SEQUENCE %0, %subreg.sub0 # CHECK: bb.1: # CHECK: %2:sreg_128 = PHI %1, %bb.0, %3, %bb.1 -# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, {{[0-9]+}}, undef %2.sub1, {{[0-9]+}}, %2.sub0, {{[0-9]+}}, %2.sub2, {{[0-9]+}} +# CHECK: %3:sreg_128 = REG_SEQUENCE %2.sub3, %subreg.sub0, undef %2.sub1, %subreg.sub1, %2.sub0, %subreg.sub2, %2.sub2, %subreg.sub3 # CHECK: bb.2: # CHECK: S_NOP 0, implicit %2.sub0 diff --git a/test/CodeGen/AMDGPU/mad_64_32.ll b/test/CodeGen/AMDGPU/mad_64_32.ll new file mode 100644 index 000000000000..b4d9d9281013 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad_64_32.ll @@ -0,0 +1,168 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s + +; GCN-LABEL: {{^}}mad_i64_i32_sextops: +; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] + +; SI: v_mul_lo_i32 +; SI: v_mul_hi_i32 +; SI: v_add_i32 +; SI: v_addc_u32 +define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = sext i32 %arg0 to i64 + %sext1 = sext i32 %arg1 to i64 + %mul = mul i64 %sext0, %sext1 + %mad = add i64 %mul, %arg2 + ret i64 %mad +} + +; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute: +; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] + +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_i32 +; SI: v_add_i32 +; SI: v_addc_u32 +define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = sext i32 %arg0 to i64 + %sext1 = sext i32 %arg1 to i64 + %mul = mul i64 %sext0, %sext1 + %mad = add i64 %arg2, %mul + ret i64 %mad +} + +; GCN-LABEL: {{^}}mad_u64_u32_zextops: +; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] + +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI: v_add_i32 +; SI: v_addc_u32 +define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = zext i32 %arg0 to i64 + %sext1 = zext i32 %arg1 to i64 + %mul = mul i64 %sext0, 
%sext1 + %mad = add i64 %mul, %arg2 + ret i64 %mad +} + +; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute: +; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] + +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI: v_add_i32 +; SI: v_addc_u32 +define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { + %sext0 = zext i32 %arg0 to i64 + %sext1 = zext i32 %arg1 to i64 + %mul = mul i64 %sext0, %sext1 + %mad = add i64 %arg2, %mul + ret i64 %mad +} + + + + + + +; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128: +; CI: v_mad_u64_u32 +; CI: v_mad_u64_u32 +; CI: v_mad_u64_u32 +; CI: v_mad_i64_i32 + +; SI-NOT: v_mad_ +define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { + %sext0 = sext i32 %arg0 to i128 + %sext1 = sext i32 %arg1 to i128 + %mul = mul i128 %sext0, %sext1 + %mad = add i128 %mul, %arg2 + ret i128 %mad +} + +; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63: +; CI: v_lshl_b64 +; CI: v_ashr +; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] + +; SI-NOT: v_mad_u64_u32 +define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { + %sext0 = sext i32 %arg0 to i63 + %sext1 = sext i32 %arg1 to i63 + %mul = mul i63 %sext0, %sext1 + %mad = add i63 %mul, %arg2 + ret i63 %mad +} + +; GCN-LABEL: {{^}}mad_i64_i32_sextops_i31_i63: +; CI: v_lshl_b64 +; CI: v_ashr_i64 +; CI: v_bfe_i32 v1, v1, 0, 31 +; CI: v_bfe_i32 v0, v0, 0, 31 +; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] +define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { + %sext0 = sext i31 %arg0 to i63 + %sext1 = sext i31 %arg1 to i63 + %mul = mul i63 %sext0, %sext1 + %mad = add i63 %mul, %arg2 + ret i63 %mad +} + +; GCN-LABEL: {{^}}mad_u64_u32_bitops: +; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5] +define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { + %trunc.lhs = and i64 %arg0, 4294967295 + %trunc.rhs = and i64 %arg1, 4294967295 + %mul = mul i64 %trunc.lhs, %trunc.rhs + %add = add i64 %mul, %arg2 + ret i64 %add +} + +; GCN-LABEL: {{^}}mad_u64_u32_bitops_lhs_mask_small: +; GCN-NOT: v_mad_ +define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 { + %trunc.lhs = and i64 %arg0, 8589934591 + %trunc.rhs = and i64 %arg1, 4294967295 + %mul = mul i64 %trunc.lhs, %trunc.rhs + %add = add i64 %mul, %arg2 + ret i64 %add +} + +; GCN-LABEL: {{^}}mad_u64_u32_bitops_rhs_mask_small: +; GCN-NOT: v_mad_ +define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 { + %trunc.lhs = and i64 %arg0, 4294967295 + %trunc.rhs = and i64 %arg1, 8589934591 + %mul = mul i64 %trunc.lhs, %trunc.rhs + %add = add i64 %mul, %arg2 + ret i64 %add +} + +; GCN-LABEL: {{^}}mad_i64_i32_bitops: +; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5] +; SI-NOT: v_mad_ +define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { + %shl.lhs = shl i64 %arg0, 32 + %trunc.lhs = ashr i64 %shl.lhs, 32 + %shl.rhs = shl i64 %arg1, 32 + %trunc.rhs = ashr i64 %shl.rhs, 32 + %mul = mul i64 %trunc.lhs, %trunc.rhs + %add = add i64 %mul, %arg2 + ret i64 %add +} + +; Example from bug report +; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops: +; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1] +; SI-NOT: v_mad_u64_u32 +define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { + %tmp4 = lshr i64 %arg0, 32 + %tmp5 = and i64 %arg0, 4294967295 + %mul = mul nuw i64 %tmp4, %tmp5 + %mad = add i64 %mul, %arg0 + ret i64 %mad +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/mul.ll 
b/test/CodeGen/AMDGPU/mul.ll index a0290789175d..555c65a6ffe9 100644 --- a/test/CodeGen/AMDGPU/mul.ll +++ b/test/CodeGen/AMDGPU/mul.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s ; mul24 and mad24 are affected @@ -8,8 +8,8 @@ ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -26,10 +26,10 @@ define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -41,10 +41,10 @@ define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a } ; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_mul_i32 -; SI: buffer_store_dword +; GCN: s_load_dword +; GCN: s_load_dword +; GCN: s_mul_i32 +; GCN: buffer_store_dword define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %mul = mul i64 %b, %a %trunc = trunc i64 %mul to i32 @@ -53,10 +53,10 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a } ; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: v_mul_lo_i32 -; SI: buffer_store_dword +; GCN: s_load_dword +; GCN: s_load_dword +; GCN: v_mul_lo_i32 +; GCN: buffer_store_dword define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -71,8 +71,8 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad ; FUNC-LABEL: {{^}}mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_i32 +; GCN-DAG: s_mul_i32 +; GCN-DAG: v_mul_hi_i32 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { entry: %0 = sext i32 %in to i64 @@ -84,9 +84,9 @@ entry: ; FUNC-LABEL: {{^}}v_mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_i32 -; SI: s_endpgm +; GCN-DAG: v_mul_lo_i32 +; GCN-DAG: v_mul_hi_i32 +; GCN: s_endpgm define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 @@ -96,9 +96,9 @@ define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace( } ; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: -; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 -; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 -; SI: s_endpgm +; GCN-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 +; GCN-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 +; GCN: s_endpgm define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 @@ -108,12 +108,12 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a } ; FUNC-LABEL: {{^}}s_mul_i32: -; SI: s_load_dword [[SRC0:s[0-9]+]], -; SI: s_load_dword [[SRC1:s[0-9]+]], -; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm +; GCN: s_load_dword [[SRC0:s[0-9]+]], +; GCN: s_load_dword [[SRC1:s[0-9]+]], +; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; GCN: 
buffer_store_dword [[VRESULT]], +; GCN: s_endpgm define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, align 4 @@ -121,7 +121,7 @@ define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nou } ; FUNC-LABEL: {{^}}v_mul_i32: -; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in @@ -146,7 +146,7 @@ define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nou } ; FUNC-LABEL: {{^}}v_mul_i64: -; SI: v_mul_lo_i32 +; GCN: v_mul_lo_i32 define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -156,7 +156,7 @@ define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % } ; FUNC-LABEL: {{^}}mul32_in_branch: -; SI: s_mul_i32 +; GCN: s_mul_i32 define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp eq i32 %a, 0 @@ -177,9 +177,9 @@ endif: } ; FUNC-LABEL: {{^}}mul64_in_branch: -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 -; SI: s_endpgm +; GCN-DAG: s_mul_i32 +; GCN-DAG: v_mul_hi_u32 +; GCN: s_endpgm define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 @@ -201,29 +201,41 @@ endif: ; FIXME: Load dwordx4 ; FUNC-LABEL: {{^}}s_mul_i128: -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 ; SI: s_mul_i32 ; SI: v_mul_hi_u32 ; SI: s_mul_i32 + ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 ; SI-DAG: v_mul_hi_u32 ; SI-DAG: s_mul_i32 ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 + ; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: s_mul_i32 ; SI: s_mul_i32 -; SI: buffer_store_dwordx4 + +; VI: s_mul_i32 +; VI: v_mul_hi_u32 +; VI: v_mad_u64_u32 +; VI: s_mul_i32 +; VI: v_mul_hi_u32 +; VI: v_mad_u64_u32 +; VI: v_mad_u64_u32 + + +; GCN: buffer_store_dwordx4 define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out @@ -231,18 +243,19 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) } ; FUNC-LABEL: {{^}}v_mul_i128: -; SI: {{buffer|flat}}_load_dwordx4 -; SI: {{buffer|flat}}_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 +; GCN: {{buffer|flat}}_load_dwordx4 + +; GCN-DAG: v_mul_lo_i32 +; GCN-DAG: v_mul_hi_u32 +; GCN-DAG: v_mul_hi_u32 +; GCN-DAG: v_mul_lo_i32 +; GCN-DAG: v_mul_hi_u32 +; GCN-DAG: v_mul_hi_u32 +; GCN-DAG: v_mul_lo_i32 +; GCN-DAG: v_mul_lo_i32 +; GCN-DAG: v_add_i32_e32 -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: v_mul_hi_u32 -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_lo_i32 -; SI: v_add_i32_e32 ; SI-DAG: v_mul_hi_u32 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_u32 @@ -252,7 +265,11 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_lo_i32 -; SI: 
{{buffer|flat}}_store_dwordx4 +; VI-DAG: v_mad_u64_u32 +; VI: v_mad_u64_u32 +; VI: v_mad_u64_u32 + +; GCN: {{buffer|flat}}_store_dwordx4 define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 6c6590a154a0..9702d18d9059 100644 --- a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -5,19 +5,19 @@ # GCN-LABEL: {{^}}name: const_to_sgpr{{$}} # GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 # GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 -# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1 # GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec # GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}} # GCN: %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 # GCN-NEXT: %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 -# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1 # GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec # GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec # GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}} -# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2 +# GCN: %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1 # GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec --- | @@ -109,7 +109,7 @@ body: | %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 %6 = COPY %7 %9 = S_MOV_B32 0 - %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1 %0 = COPY %10 %11 = COPY %10.sub0 %12 = COPY %10.sub1 @@ -117,10 +117,10 @@ body: | %14 = COPY %8.sub1 %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc - %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1 %18 = S_MOV_B32 0 %19 = S_MOV_B32 1048576 - %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1 %22 = COPY killed %20 %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec @@ -133,7 +133,7 @@ body: | %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc %25 = S_MOV_B32 61440 %26 = S_MOV_B32 0 - %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1 %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit %exec %30 = COPY %24 @@ -208,7 +208,7 @@ body: | %9 = S_LOAD_DWORDX2_IMM %3, 13, 0 %6 = COPY %7 %10 = S_MOV_B32 0 - %11 = REG_SEQUENCE %2, 1, killed %10, 2 + %11 = REG_SEQUENCE %2, %subreg.sub0, killed %10, %subreg.sub1 %0 = COPY %11 %12 = COPY %11.sub0 %13 = COPY %11.sub1 @@ -216,15 +216,15 @@ body: | %15 = COPY %8.sub1 %16 = S_ADD_U32 %12, killed %14, implicit-def %scc %17 = 
S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc - %18 = REG_SEQUENCE killed %16, 1, killed %17, 2 + %18 = REG_SEQUENCE killed %16, %subreg.sub0, killed %17, %subreg.sub1 %19 = COPY %9.sub0 %20 = COPY %9.sub1 %21 = S_ADD_U32 %12, killed %19, implicit-def %scc %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc - %23 = REG_SEQUENCE killed %21, 1, killed %22, 2 + %23 = REG_SEQUENCE killed %21, %subreg.sub0, killed %22, %subreg.sub1 %24 = S_MOV_B32 0 %25 = S_MOV_B32 1048576 - %26 = REG_SEQUENCE killed %25, 1, killed %24, 2 + %26 = REG_SEQUENCE killed %25, %subreg.sub0, killed %24, %subreg.sub1 %28 = COPY %26 %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec @@ -239,7 +239,7 @@ body: | %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc %34 = S_MOV_B32 61440 %35 = S_MOV_B32 0 - %36 = REG_SEQUENCE killed %35, 1, killed %34, 2 + %36 = REG_SEQUENCE killed %35, %subreg.sub0, killed %34, %subreg.sub1 %37 = REG_SEQUENCE %6, 17, killed %36, 18 %38 = V_MOV_B32_e32 0, implicit %exec %39 = COPY %33 @@ -304,7 +304,7 @@ body: | %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 %6 = COPY %7 %9 = S_MOV_B32 0 - %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %10 = REG_SEQUENCE %2, %subreg.sub0, killed %9, %subreg.sub1 %0 = COPY %10 %11 = COPY %10.sub0 %12 = COPY %10.sub1 @@ -312,10 +312,10 @@ body: | %14 = COPY %8.sub1 %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc - %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %17 = REG_SEQUENCE killed %15, %subreg.sub0, killed %16, %subreg.sub1 %18 = S_MOV_B32 12 %19 = S_MOV_B32 1048576 - %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1 %22 = COPY killed %20.sub1 %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec @@ -328,7 +328,7 @@ body: | %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc %25 = S_MOV_B32 61440 %26 = S_MOV_B32 0 - %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %27 = REG_SEQUENCE killed %26, %subreg.sub0, killed %25, %subreg.sub1 %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit %exec %30 = COPY %24 diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll index 866cd16ec3b5..65e728174291 100644 --- a/test/CodeGen/AMDGPU/private-memory-r600.ll +++ b/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: opt -S -mtriple=r600-unknown-unknown-amdgiz -mcpu=redwood -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s +target datalayout = "A5" declare i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -18,19 +19,19 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [5 x i32], align 4, addrspace(5) %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 
0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 + store i32 4, i32 addrspace(5)* %arrayidx1, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 + store i32 5, i32 addrspace(5)* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 + %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 + %3 = load i32, i32 addrspace(5)* %arrayidx12 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 store i32 %3, i32 addrspace(1)* %arrayidx13 ret void @@ -49,20 +50,20 @@ entry: define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: - %a = alloca %struct.point - %b = alloca %struct.point - %a.x.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0 - %a.y.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 1 - %b.x.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0 - %b.y.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 1 - store i32 0, i32* %a.x.ptr - store i32 1, i32* %a.y.ptr - store i32 2, i32* %b.x.ptr - store i32 3, i32* %b.y.ptr - %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point* %b, i32 0, i32 0 - %a.indirect = load i32, i32* %a.indirect.ptr - %b.indirect = load i32, i32* %b.indirect.ptr + %a = alloca %struct.point, addrspace(5) + %b = alloca %struct.point, addrspace(5) + %a.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 + %a.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1 + %b.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0 + %b.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %a.x.ptr + store i32 1, i32 addrspace(5)* %a.y.ptr + store i32 2, i32 addrspace(5)* %b.x.ptr + store i32 3, i32 addrspace(5)* %b.y.ptr + %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0 + %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0 + %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr + %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr %0 = add i32 %a.indirect, %b.indirect store i32 %0, i32 addrspace(1)* %out ret void @@ -77,32 +78,32 @@ entry: define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: - %prv_array_const = alloca [2 x i32] - %prv_array = alloca [2 x i32] + %prv_array_const = alloca [2 x i32], addrspace(5) + %prv_array = alloca [2 x i32], addrspace(5) %a = load i32, i32 addrspace(1)* 
%in %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - store i32 %a, i32* %a_dst_ptr - %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 - store i32 %b, i32* %b_dst_ptr + %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0 + store i32 %a, i32 addrspace(5)* %a_dst_ptr + %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1 + store i32 %b, i32 addrspace(5)* %b_dst_ptr br label %for.body for.body: %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - %x = load i32, i32* %x_ptr - %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %y = load i32, i32* %y_ptr + %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0 + %x = load i32, i32 addrspace(5)* %x_ptr + %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 + %y = load i32, i32 addrspace(5)* %y_ptr %xy = add i32 %x, %y - store i32 %xy, i32* %y_ptr + store i32 %xy, i32 addrspace(5)* %y_ptr %count = add i32 %inc, 1 %done = icmp eq i32 %count, 4095 br i1 %done, label %for.end, label %for.body for.end: - %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %value = load i32, i32* %value_ptr + %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0 + %value = load i32, i32 addrspace(5)* %value_ptr store i32 %value, i32 addrspace(1)* %out ret void } @@ -112,13 +113,13 @@ for.end: ; R600: MOVA_INT define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %0 = alloca [2 x i16] - %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1 - store i16 0, i16* %1 - store i16 1, i16* %2 - %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index - %4 = load i16, i16* %3 + %0 = alloca [2 x i16], addrspace(5) + %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1 + store i16 0, i16 addrspace(5)* %1 + store i16 1, i16 addrspace(5)* %2 + %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index + %4 = load i16, i16 addrspace(5)* %3 %5 = sext i16 %4 to i32 store i32 %5, i32 addrspace(1)* %out ret void @@ -129,13 +130,13 @@ entry: ; R600: MOVA_INT define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %0 = alloca [2 x i8] - %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1 - store i8 0, i8* %1 - store i8 1, i8* %2 - %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index - %4 = load i8, i8* %3 + %0 = alloca [2 x i8], addrspace(5) + %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1 + store i8 0, i8 addrspace(5)* %1 + store i8 1, i8 addrspace(5)* %2 + %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index + %4 = load i8, i8 addrspace(5)* %3 %5 = sext i8 %4 to i32 store i32 %5, i32 addrspace(1)* %out 
ret void @@ -150,13 +151,13 @@ entry: ; R600-NOT: MOV * TO.X define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %0 = alloca [2 x i32] - %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 + %0 = alloca [2 x i32], addrspace(5) + %1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0 + %2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %1 + store i32 1, i32 addrspace(5)* %2 + %3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in + %4 = load i32, i32 addrspace(5)* %3 %5 = call i32 @llvm.r600.read.tidig.x() %6 = add i32 %4, %5 store i32 %6, i32 addrspace(1)* %out @@ -171,22 +172,22 @@ entry: ; R600-NOT: [[CHAN]]+ define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: - %0 = alloca [3 x i8], align 1 - %1 = alloca [2 x i8], align 1 - %2 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 0 - %3 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 1 - %4 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 2 - %5 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 0 - %6 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 1 - store i8 0, i8* %2 - store i8 1, i8* %3 - store i8 2, i8* %4 - store i8 1, i8* %5 - store i8 0, i8* %6 - %7 = getelementptr inbounds [3 x i8], [3 x i8]* %0, i32 0, i32 %in - %8 = getelementptr inbounds [2 x i8], [2 x i8]* %1, i32 0, i32 %in - %9 = load i8, i8* %7 - %10 = load i8, i8* %8 + %0 = alloca [3 x i8], align 1, addrspace(5) + %1 = alloca [2 x i8], align 1, addrspace(5) + %2 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0 + %3 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1 + %4 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2 + %5 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0 + %6 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1 + store i8 0, i8 addrspace(5)* %2 + store i8 1, i8 addrspace(5)* %3 + store i8 2, i8 addrspace(5)* %4 + store i8 1, i8 addrspace(5)* %5 + store i8 0, i8 addrspace(5)* %6 + %7 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in + %8 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in + %9 = load i8, i8 addrspace(5)* %7 + %10 = load i8, i8 addrspace(5)* %8 %11 = add i8 %9, %10 %12 = sext i8 %11 to i32 store i32 %12, i32 addrspace(1)* %out @@ -195,13 +196,13 @@ entry: define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i8]] - %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8* %gep0 - store i8 1, i8* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8* %gep2 + %alloca = alloca [2 x [2 x i8]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i8 0, i8 
addrspace(5)* %gep0 + store i8 1, i8 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i8, i8 addrspace(5)* %gep2 %sext = sext i8 %load to i32 store i32 %sext, i32 addrspace(1)* %out ret void @@ -209,26 +210,26 @@ entry: define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x i32]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x i64]] - %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 1, i64* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64* %gep2 + %alloca = alloca [2 x [2 x i64]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + store i64 0, i64 addrspace(5)* %gep0 + store i64 1, i64 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index + %load = load i64, i64 addrspace(5)* %gep2 store i64 %load, i64 addrspace(1)* %out ret void } @@ -237,40 +238,40 @@ entry: define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x [2 x %struct.pair32]] - %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 + %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5) + %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x 
%struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: - %alloca = alloca [2 x %struct.pair32] - %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 + %alloca = alloca [2 x %struct.pair32], addrspace(5) + %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0 + %load = load i32, i32 addrspace(5)* %gep2 store i32 %load, i32 addrspace(1)* %out ret void } define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 + %tmp = alloca [2 x i32], addrspace(5) + %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 + %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %tmp1 + store i32 1, i32 addrspace(5)* %tmp2 %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 - %load = load i32, i32* %sel + %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2 + %load = load i32, i32 addrspace(5)* %sel store i32 %load, i32 addrspace(1)* %out ret void } @@ -283,14 +284,14 @@ entry: ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %alloca = alloca [16 x i32] - %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - store i32 5, i32* %tmp0 - %tmp1 = ptrtoint [16 x i32]* %alloca to i32 + %alloca = alloca [16 x i32], addrspace(5) + %tmp0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a + store i32 5, i32 addrspace(5)* %tmp0 + %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32 %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32* - %tmp4 = getelementptr inbounds i32, i32* %tmp3, i32 %b - %tmp5 = load i32, i32* %tmp4 + %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)* + %tmp4 = getelementptr inbounds i32, i32 addrspace(5)* %tmp3, i32 %b + %tmp5 = load i32, i32 addrspace(5)* %tmp4 store i32 %tmp5, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/simplify-libcalls.ll b/test/CodeGen/AMDGPU/simplify-libcalls.ll index 47eb9a9a3d13..aa6c1833bdec 100644 --- a/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -1,11 +1,11 @@ -; RUN: opt -S -O1 -mtriple=amdgcn-- 
-amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s -; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s -; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s +; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s +; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s +; RUN: opt -S -O1 -mtriple=amdgcn---amdgiz -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos ; GCN-POSTLINK: tail call fast float @_Z3sinf( ; GCN-POSTLINK: tail call fast float @_Z3cosf( -; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f( +; GCN-PRELINK: call fast float @_Z6sincosfPf( ; GCN-NATIVE: tail call fast float @_Z10native_sinf( ; GCN-NATIVE: tail call fast float @_Z10native_cosf( define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) { @@ -26,7 +26,7 @@ declare float @_Z3cosf(float) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2 ; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f( ; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f( -; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_( +; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_( ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f( ; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f( define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) { @@ -47,7 +47,7 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3 ; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f( ; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f( -; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_( +; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_( ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f( ; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f( define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) { @@ -73,7 +73,7 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4 ; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f( ; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f( -; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_( +; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_( ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f( ; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f( define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) { @@ -94,7 +94,7 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8 ; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f( ; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f( -; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_( +; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_( ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f( ; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f( define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) { @@ -115,7 +115,7 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>) ; 
GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16 ; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f( ; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f( -; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_( +; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_( ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f( ; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f( define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) { @@ -685,101 +685,101 @@ define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) { entry: %tmp = load float, float addrspace(1)* %a, align 4 %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1 - %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)* - %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1) + %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float* + %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1) store float %call, float addrspace(1)* %a, align 4 ret void } -declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*) +declare float @_Z6sincosfPf(float, float*) %opencl.pipe_t = type opaque %opencl.reserve_id_t = type opaque ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) -; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]] -; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND:[0-9]+]] +; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]] define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr { entry: %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)* - %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* - %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 - %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) - %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 - tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) + %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8* + %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0 + %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) + %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0 + tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) ret void } -declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) +declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) -declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) +declare 
%opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) -declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) +declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) -declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) +declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) -; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[NOUNWIND]] define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr { entry: %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)* - %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* - %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 - %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0 - %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 - tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0 + %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8* + %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0 + %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0 + %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0 + tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0 ret void } -declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr +declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr -declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr +declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr -declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr +declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr -declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr +declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr %struct.S = type { [100 x i32] } ; GCN-LABEL: 
{{^}}define amdgpu_kernel void @test_pipe_size -; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}} #[[NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}} i32 400, i32 4) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}} #[[NOUNWIND]] +; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}} i32 400, i32 4) #[[NOUNWIND]] define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 { entry: - %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)* - %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0 + %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8* + %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0 %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)* - %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)* - %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0 + %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8* + %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0 %tmp5 
= bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)* - %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)* - %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0 + %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8* + %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0 %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)* - %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)* - %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0 + %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8* + %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0 %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)* - %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)* - %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0 + %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8* + %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0 %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)* - %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)* - %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0 + %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8* + %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0 %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)* - %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)* - %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0 + %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8* + %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0 %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)* - %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)* - %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0 + %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8* + %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0 %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)* - %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)* - %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0 + %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8* + %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0 ret void } diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll index e25f2235993f..6dfcff77d813 100644 --- a/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/test/CodeGen/AMDGPU/unknown-processor.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s -; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s +; RUN: llc 
-march=r600 -mtriple=r600---amdgiz -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s +target datalayout = "A5" ; Should not crash when the processor is not recognized and the ; wavefront size feature not set. @@ -14,7 +15,7 @@ ; R600: MOV define amdgpu_kernel void @foo() { - %alloca = alloca i32, align 4 - store volatile i32 0, i32* %alloca + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca ret void } diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll index 990b25e0c590..68872c54f7fb 100644 --- a/test/CodeGen/AMDGPU/unsupported-calls.ll +++ b/test/CodeGen/AMDGPU/unsupported-calls.ll @@ -1,5 +1,5 @@ -; RUN: not llc -march=amdgcn -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s -; RUN: not llc -march=r600 -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s +; RUN: not llc -march=amdgcn -mtriple=amdgcn---amdgiz -tailcallopt < %s 2>&1 | FileCheck -check-prefix=GCN %s +; RUN: not llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -tailcallopt < %s 2>&1 | FileCheck -check-prefix=R600 %s declare i32 @external_function(i32) nounwind diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir index d96463f00c7b..939c851584cf 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-combos.mir @@ -1,6 +1,7 @@ # RUN: llc -O0 -mtriple arm-- -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- | define void @test_mla() #0 { ret void } + define void @test_mla_commutative() #0 { ret void } define void @test_mla_v5() #1 { ret void } define void @test_mls() #2 { ret void } @@ -45,6 +46,40 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_mla_commutative +# CHECK-LABEL: name: test_mla_commutative +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } +body: | + bb.0: + liveins: %r0, %r1, %r2 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + ; CHECK: [[VREGX:%[0-9]+]]:gprnopc = COPY %r0 + ; CHECK: [[VREGY:%[0-9]+]]:gprnopc = COPY %r1 + ; CHECK: [[VREGZ:%[0-9]+]]:gprnopc = COPY %r2 + + %3(s32) = G_MUL %0, %1 + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGR:%[0-9]+]]:gprnopc = MLA [[VREGX]], [[VREGY]], [[VREGZ]], 14, _, _ + + %r0 = COPY %4(s32) + ; CHECK: %r0 = COPY [[VREGR]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- name: test_mla_v5 # CHECK-LABEL: name: test_mla_v5 legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 0fdd485ba906..588ceaca2c47 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -970,9 +970,10 @@ registers: - { id: 1, class: gprb } - { id: 2, class: gprb } - { id: 3, class: gprb } + - { id: 4, class: gprb } body: | bb.0: - liveins: %r0, %r1 + liveins: %r0, %r1, %r2 %0(p0) = COPY %r0 ; CHECK: [[VREGX:%[0-9]+]]:gpr = COPY %r0 @@ -980,14 +981,17 @@ body: | %1(p0) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]]:gpr = COPY %r1 - %2(s1) = G_TRUNC %1(p0) - ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY [[VREGY]] + %2(s32) = COPY %r2 + ; CHECK: [[VREGC:%[0-9]+]]:gpr = COPY %r2 - %3(p0) = G_SELECT %2(s1), %0, %1 - ; CHECK: CMPri [[VREGC]], 0, 14, _, implicit-def %cpsr + %3(s1) = G_TRUNC %2(s32) + ; CHECK: [[VREGD:%[0-9]+]]:gpr = COPY [[VREGC]] + + %4(p0) = G_SELECT %3(s1), %0, %1 + ; CHECK: CMPri [[VREGD]], 0, 14, _, implicit-def %cpsr ; CHECK: [[RES:%[0-9]+]]:gpr = MOVCCr [[VREGX]], [[VREGY]], 0, %cpsr - %r0 = COPY %3(p0) + %r0 = COPY %4(p0) ; CHECK: %r0 = COPY [[RES]] BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/Generic/llc-start-stop.ll b/test/CodeGen/Generic/llc-start-stop.ll index 85b69c37aa01..9056e2cab49d 100644 --- a/test/CodeGen/Generic/llc-start-stop.ll +++ b/test/CodeGen/Generic/llc-start-stop.ll @@ -13,15 +13,15 @@ ; STOP-BEFORE-NOT: Loop Strength Reduction ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER -; START-AFTER: -machine-branch-prob -gc-lowering +; START-AFTER: -machine-branch-prob -expandmemcmp ; START-AFTER: FunctionPass Manager -; START-AFTER-NEXT: Lower Garbage Collection Instructions +; START-AFTER-NEXT: Expand memcmp() to load/stores ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE ; START-BEFORE: -machine-branch-prob -domtree ; START-BEFORE: FunctionPass Manager ; START-BEFORE: Loop Strength Reduction -; START-BEFORE-NEXT: Lower Garbage Collection Instructions +; START-BEFORE-NEXT: Expand memcmp() to load/stores ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE diff --git a/test/CodeGen/Hexagon/isel-prefer.ll b/test/CodeGen/Hexagon/isel-prefer.ll index 062b0b3a0ea3..7094544f54b7 100644 --- a/test/CodeGen/Hexagon/isel-prefer.ll +++ b/test/CodeGen/Hexagon/isel-prefer.ll @@ -54,4 +54,14 @@ b2: ret i32 %v6 } +; CHECK-LABEL: Prefer_L2_loadrub_io: +; CHECK: memub(r0+#65) +define i64 @Prefer_L2_loadrub_io(i8* %a0) #0 { +b1: + %v2 = getelementptr i8, i8* %a0, i32 65 + %v3 = load i8, i8* %v2 + %v4 = zext i8 %v3 to i64 + ret i64 %v4 +} + attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/MIR/X86/subregister-index-operands.mir b/test/CodeGen/MIR/X86/subregister-index-operands.mir index e3c5b9d17eec..4d8b24608b7e 100644 --- a/test/CodeGen/MIR/X86/subregister-index-operands.mir +++ b/test/CodeGen/MIR/X86/subregister-index-operands.mir @@ -22,9 +22,9 @@ body: | liveins: %edi, %eax ; CHECK-LABEL: name: t ; CHECK: liveins: %edi, %eax - ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, 1 - ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, 2 - ; 
CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], 1, [[EXTRACT_SUBREG]], 2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:gr32 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit + ; CHECK: [[EXTRACT_SUBREG:%[0-9]+]]:gr8 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi + ; CHECK: %ax = REG_SEQUENCE [[EXTRACT_SUBREG]], %subreg.sub_8bit, [[EXTRACT_SUBREG]], %subreg.sub_8bit_hi ; CHECK: RETQ %ax %0 = INSERT_SUBREG %edi, %al, %subreg.sub_8bit %1 = EXTRACT_SUBREG %eax, %subreg.sub_8bit_hi diff --git a/test/CodeGen/Mips/brind-tailcall.ll b/test/CodeGen/Mips/brind-tailcall.ll new file mode 100644 index 000000000000..78fb0f151077 --- /dev/null +++ b/test/CodeGen/Mips/brind-tailcall.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC %s +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC %s +; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=pic < %s 2>&1 | FileCheck --check-prefix=PIC64 %s +; RUN: llc -march=mips64 -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=static < %s 2>&1 | FileCheck --check-prefix=STATIC64 %s +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s +; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=pic -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=PIC %s +; RUN: llc -march=mips -mcpu=mips32r6 -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=static -mattr=+micromips < %s 2>&1 | FileCheck --check-prefix=STATIC-MM %s +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=pic -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s +; RUN: llc -march=mips -debug-only=isel -mips-tail-calls=1 \ +; RUN: -relocation-model=static -mattr=+mips16 < %s 2>&1 | FileCheck --check-prefix=MIPS16 %s + +; REQUIRES: asserts + +; Test that the correct pseudo instructions are generated for indirect +; branches and tail calls. Previously, the order of the DAG matcher table +; determined if the correct instruction was selected for mips16. 
+ +declare protected void @a() + +define void @test1(i32 %a) { +entry: + %0 = trunc i32 %a to i1 + %1 = select i1 %0, + i8* blockaddress(@test1, %bb), + i8* blockaddress(@test1, %bb6) + indirectbr i8* %1, [label %bb, label %bb6] + +; STATIC: PseudoIndirectBranch +; STATIC-MM: PseudoIndirectBranch +; STATIC-NOT: PseudoIndirectBranch64 +; STATIC64: PseudoIndirectBranch64 +; PIC: PseudoIndirectBranch +; PIC-NOT: PseudoIndirectBranch64 +; PIC64: PseudoIndirectBranch64 +; MIPS16: JrcRx16 +bb: + ret void + +bb6: + tail call void @a() + +; STATIC: TAILCALL +; STATIC-NOT: TAILCALL_MM +; STATIC-MM: TAILCALL_MM +; PIC: TAILCALLREG +; PIC-NOT: TAILCALLREG64 +; PIC64: TAILCALLREG64 +; MIPS16: RetRA16 + ret void +} diff --git a/test/CodeGen/Mips/dins.ll b/test/CodeGen/Mips/dins.ll index 8a8b377861ae..2f7138ca4c5d 100644 --- a/test/CodeGen/Mips/dins.ll +++ b/test/CodeGen/Mips/dins.ll @@ -1,7 +1,11 @@ -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 -; RUN: llc -O2 -march=mips -mcpu=mips32r2 < %s -o - | FileCheck %s -check-prefix=MIPS32R2 -; RUN: llc -O2 -march=mips -mattr=mips16 < %s -o - | FileCheck %s -check-prefix=MIPS16 -; RUN: llc -O2 -march=mips64 -mcpu=mips64r2 -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n64 < %s -o - | FileCheck %s -check-prefix=MIPS64R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mcpu=mips32r2 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS32R2 +; RUN: llc -O2 -verify-machineinstrs -march=mips -mattr=mips16 < %s -o - \ +; RUN: | FileCheck %s -check-prefix=MIPS16 +; RUN: llc -O2 -verify-machineinstrs -march=mips64 -mcpu=mips64r2 \ +; RUN: -target-abi=n32 < %s -o - | FileCheck %s -check-prefix=MIPS64R2N32 ; #include <stdint.h> ; #include <stdio.h> @@ -60,7 +64,7 @@ entry: ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 123 ; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 27, 37 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 4 -; MIPS64R2: dins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 +; MIPS64R2: dinsm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 28, 6 ; MIPS64R2: daddiu $[[R0:[0-9]+]], $zero, 5 ; MIPS64R2: dinsu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50, 14 ; MIPS64R2: dsrl $[[R0:[0-9]+]], $[[R1:[0-9]+]], 50 diff --git a/test/CodeGen/Mips/msa/emergency-spill.mir b/test/CodeGen/Mips/msa/emergency-spill.mir new file mode 100644 index 000000000000..502b60f673e2 --- /dev/null +++ b/test/CodeGen/Mips/msa/emergency-spill.mir @@ -0,0 +1,221 @@ +# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null + +# Test that estimated size of the stack leads to the creation of an emergency +# spill when MSA is in use. Previously, this test case would fail during +# register scavenging due to the lack of a spill slot. 
+--- | + define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 { + entry: + %retval = alloca <16 x i8>, align 16 + %a = alloca <16 x i8>, align 16 + %b = alloca <16 x i8>, align 16 + %a.addr = alloca <16 x i8>, align 16 + %b.addr = alloca <16 x i8>, align 16 + %c.addr = alloca i32, align 4 + %g = alloca <16 x i8>*, align 8 + %d = alloca i8*, align 8 + %0 = bitcast <16 x i8>* %a to { i64, i64 }* + %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0 + store i64 %a.coerce0, i64* %1, align 16 + %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1 + store i64 %a.coerce1, i64* %2, align 8 + %a1 = load <16 x i8>, <16 x i8>* %a, align 16 + %3 = bitcast <16 x i8>* %b to { i64, i64 }* + %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0 + store i64 %b.coerce0, i64* %4, align 16 + %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1 + store i64 %b.coerce1, i64* %5, align 8 + %b2 = load <16 x i8>, <16 x i8>* %b, align 16 + store <16 x i8> %a1, <16 x i8>* %a.addr, align 16 + store <16 x i8> %b2, <16 x i8>* %b.addr, align 16 + store i32 %c, i32* %c.addr, align 4 + %6 = alloca i8, i64 6400, align 16 + %7 = bitcast i8* %6 to <16 x i8>* + store <16 x i8>* %7, <16 x i8>** %g, align 8 + %8 = load <16 x i8>*, <16 x i8>** %g, align 8 + call void @h(<16 x i8>* %b.addr, <16 x i8>* %8) + %9 = load <16 x i8>*, <16 x i8>** %g, align 8 + %10 = bitcast <16 x i8>* %9 to i8* + store i8* %10, i8** %d, align 8 + %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16 + %12 = load i8*, i8** %d, align 8 + %arrayidx = getelementptr inbounds i8, i8* %12, i64 0 + %13 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %13 to i32 + %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv) + %add = add <16 x i8> %11, %14 + %15 = load i8*, i8** %d, align 8 + %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1 + %16 = load i8, i8* %arrayidx3, align 1 + %conv4 = sext i8 %16 to i32 + %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4) + %add5 = add <16 x i8> %add, %17 + %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + %add6 = add <16 x i8> %18, %add5 + store <16 x i8> %add6, <16 x i8>* %b.addr, align 16 + %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16 + store <16 x i8> %19, <16 x i8>* %retval, align 16 + %20 = bitcast <16 x i8>* %retval to { i64, i64 }* + %21 = load { i64, i64 }, { i64, i64 }* %20, align 16 + ret { i64, i64 } %21 + } + + declare void @h(<16 x i8>*, <16 x i8>*) + + declare <16 x i8> @llvm.mips.fill.b(i32) + + declare void @llvm.stackprotector(i8*, i8**) + +... 
+--- +name: test +alignment: 3 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%a0_64', virtual-reg: '' } + - { reg: '%a1_64', virtual-reg: '' } + - { reg: '%a2_64', virtual-reg: '' } + - { reg: '%a3_64', virtual-reg: '' } + - { reg: '%t0_64', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: false + hasCalls: true + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: + - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } + - { id: 8, name: '', type: default, offset: 0, size: 6400, + alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } +constants: +body: | + bb.0.entry: + liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64 + + SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16) + SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2) + %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a) + SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16) + SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5) + %w1 = LD_B %stack.2.b, 0 :: (dereferenceable load 16 from %ir.b) + ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr) + ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr) + SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr) + %at_64 = LEA_ADDiu64 %stack.8, 0 + SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0 + JAL @h, csr_n64, 
implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g) + %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr) + SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s5_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d) + SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d) + %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d) + %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx) + %w1 = FILL_B killed %v0 + %w0 = ADDV_B killed %w0, killed %w1 + %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3) + %w1 = FILL_B killed %at + %w0 = 
ADDV_B killed %w0, killed %w1 + %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr) + %w0 = ADDV_B killed %w1, killed %w0 + ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr) + %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr) + ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval) + %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16) + %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16) + RetRA implicit %v0_64, implicit %v1_64 + +... diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll index f903381f9ef0..9c2228d3bf63 100644 --- a/test/CodeGen/Mips/msa/frameindex.ll +++ b/test/CodeGen/Mips/msa/frameindex.ll @@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind { ; MIPS32-AE: loadstore_v16i8_just_under_simm10: %1 = alloca <16 x i8> - %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes + %2 = alloca [492 x i8] ; Push the frame--accounting for the emergency spill + ; slot--right up to 512 bytes %3 = load volatile <16 x i8>, <16 x i8>* %1 ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp) @@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind { ; MIPS32-AE: loadstore_v16i8_just_over_simm10: %1 = alloca <16 x i8> - %2 = alloca [497 x i8] ; Push the frame just over 512 bytes + %2 = alloca [497 x i8] ; Push the frame--accounting for the emergency spill + ; slot--just over 512 bytes %3 = load volatile <16 x i8>, <16 x i8>* %1 ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512 @@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind { ; MIPS32-AE: loadstore_v16i8_just_under_simm16: %1 = alloca <16 x i8> - %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes + %2 = alloca [32752 x i8] ; Push the frame--accounting for the emergency spill + ; slot--right up to 32768 bytes %3 = load volatile <16 x i8>, <16 x i8>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind { ; MIPS32-AE: loadstore_v16i8_just_over_simm16: %1 = alloca <16 x i8> - %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes + %2 = alloca [32753 x i8] ; Push the frame--accounting for the emergency spill + ; slot--just over 32768 bytes %3 = load volatile <16 x i8>, <16 x i8>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind { ; MIPS32-AE: loadstore_v8i16_just_under_simm10: %1 = alloca <8 x i16> - %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes + %2 = alloca [1004 x i8] ; Push the frame--accounting for the emergency spill + ; slot--right up to 1024 bytes %3 = load volatile <8 x i16>, <8 x i16>* %1 ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp) @@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind { ; MIPS32-AE: loadstore_v8i16_just_over_simm10: %1 = alloca <8 x i16> - %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes + %2 = alloca [1009 x i8] ; Push the frame--accounting for the emergency spill + ; slot--just over 1024 bytes %3 = load volatile <8 x i16>, <8 x i16>* %1 ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024 @@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind { ; MIPS32-AE: loadstore_v8i16_just_under_simm16: %1 = alloca <8 x i16> - %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes + %2 = alloca [32752 x i8] ; Push the 
frame--acounting for the emergency spill + ; slot--right up to 32768 bytes %3 = load volatile <8 x i16>, <8 x i16>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind { ; MIPS32-AE: loadstore_v8i16_just_over_simm16: %1 = alloca <8 x i16> - %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes + %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill + ; slot--just over 32768 bytes %3 = load volatile <8 x i16>, <8 x i16>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind { ; MIPS32-AE: loadstore_v4i32_just_under_simm10: %1 = alloca <4 x i32> - %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes + %2 = alloca [2028 x i8] ; Push the frame--acounting for the emergency spill + ; slot--right up to 2048 bytes %3 = load volatile <4 x i32>, <4 x i32>* %1 ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp) @@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind { ; MIPS32-AE: loadstore_v4i32_just_over_simm10: %1 = alloca <4 x i32> - %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes + %2 = alloca [2033 x i8] ; Push the frame--acounting for the emergency spill + ; slot--just over 2048 bytes %3 = load volatile <4 x i32>, <4 x i32>* %1 ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048 @@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind { ; MIPS32-AE: loadstore_v4i32_just_under_simm16: %1 = alloca <4 x i32> - %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes + %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill + ; slot-- right up to 32768 bytes %3 = load volatile <4 x i32>, <4 x i32>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind { ; MIPS32-AE: loadstore_v4i32_just_over_simm16: %1 = alloca <4 x i32> - %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes + %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill + ; slot--just over 32768 bytes %3 = load volatile <4 x i32>, <4 x i32>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind { ; MIPS32-AE: loadstore_v2i64_just_under_simm10: %1 = alloca <2 x i64> - %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes - + %2 = alloca [4076 x i8] ; Push the frame--acounting for the emergency spill + ; slot--right up to 4096 bytes %3 = load volatile <2 x i64>, <2 x i64>* %1 ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp) store volatile <2 x i64> %3, <2 x i64>* %1 @@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind { ; MIPS32-AE: loadstore_v2i64_just_over_simm10: %1 = alloca <2 x i64> - %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes + %2 = alloca [4081 x i8] ; Push the frame--acounting for the emergency spill + ; slot--just over 4096 bytes %3 = load volatile <2 x i64>, <2 x i64>* %1 ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096 @@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind { ; MIPS32-AE: loadstore_v2i64_just_under_simm16: %1 = alloca <2 x i64> - %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes + %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill + ; slot--right up to 32768 bytes %3 = load volatile <2 x i64>, <2 x i64>* %1 ; MIPS32-AE: ori 
[[R2:\$([0-9]+|gp)]], $zero, 32768 @@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind { ; MIPS32-AE: loadstore_v2i64_just_over_simm16: %1 = alloca <2 x i64> - %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes + %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill + ; slot--just over 32768 bytes %3 = load volatile <2 x i64>, <2 x i64>* %1 ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768 diff --git a/test/CodeGen/Mips/tailcall/tailcall.ll b/test/CodeGen/Mips/tailcall/tailcall.ll index 3f04e1cf3053..1c81335937d8 100644 --- a/test/CodeGen/Mips/tailcall/tailcall.ll +++ b/test/CodeGen/Mips/tailcall/tailcall.ll @@ -27,7 +27,7 @@ ; RUN: llc -march=mipsel -relocation-model=pic -mcpu=mips32r6 -mattr=+micromips \ ; RUN: -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,PIC32MM ; RUN: llc -march=mipsel -relocation-model=static -mcpu=mips32r6 \ -; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32 +; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefixes=ALL,STATIC32MMR6 ; RUN: llc -march=mips64el -relocation-model=pic -mcpu=mips64r6 \ ; RUN: -mattr=+micromips -mips-tail-calls=1 < %s | FileCheck %s -check-prefix=PIC64R6MM ; RUN: llc -march=mips64el -relocation-model=static -mcpu=mips64r6 \ @@ -51,6 +51,7 @@ entry: ; PIC32MM: jalr $25 ; PIC32R6: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; N64: jalr $25 ; N64R6: jalr $25 ; PIC16: jalrc @@ -68,6 +69,7 @@ entry: ; PIC32MM: jalr $25 ; PIC32R6: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; N64: jalr $25 ; N64R6: jalr $25 ; PIC16: jalrc @@ -85,6 +87,7 @@ entry: ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; N64: jalr $25 ; N64R6: jalr $25 ; PIC16: jalrc @@ -102,6 +105,7 @@ entry: ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; SATATIC32MMR6: jal ; PIC64: jalr $25 ; STATIC64: jal ; N64R6: jalr $25 @@ -120,6 +124,7 @@ entry: ; PIC32R6: jr $25 ; PIC32MM: jr ; STATIC32: j +; STATIC32MMR6: bc ; PIC64: jr $25 ; STATIC64: j ; PIC16: jalrc @@ -161,6 +166,7 @@ entry: ; PIC32R6: jrc $25 ; PIC32MM: jrc ; STATIC32: j +; STATIC32MMR6: bc ; PIC64: jr $25 ; PIC64R6: jrc $25 ; PIC64R6MM: jr $25 @@ -178,6 +184,7 @@ entry: ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; PIC64: jalr $25 ; STATIC64: jal ; PIC16: jalrc @@ -199,6 +206,7 @@ entry: ; PIC32R6: jrc $25 ; PIC32MM: jrc ; STATIC32: j +; STATIC32MMR6: bc ; PIC64: jr $25 ; STATIC64: j ; PIC64R6: jrc $25 @@ -214,6 +222,7 @@ entry: ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; STATIC64: jal ; PIC64: jalr $25 ; PIC64R6: jalrc $25 @@ -232,6 +241,7 @@ entry: ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; STATIC64: jal ; PIC64: jalr $25 ; PIC64R6: jalr $25 @@ -250,6 +260,7 @@ entry: ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; STATIC64: jal ; PIC64: jalr $25 ; PIC64R6: jalrc $25 @@ -270,6 +281,7 @@ entry: ; PIC32R6: jalrc $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; STATIC64: jal ; PIC64: jalr $25 ; PIC64R6: jalrc $25 @@ -290,6 +302,7 @@ entry: ; PIC32R6: jalr $25 ; PIC32MM: jalr $25 ; STATIC32: jal +; STATIC32MMR6: jal ; STATIC64: jal ; PIC64R6: jalr $25 ; PIC64: jalr $25 diff --git a/test/CodeGen/NVPTX/atomics-sm60.ll b/test/CodeGen/NVPTX/atomics-sm60.ll new file mode 100644 index 000000000000..0b5bafb780c5 --- /dev/null +++ b/test/CodeGen/NVPTX/atomics-sm60.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s 
-march=nvptx -mcpu=sm_60 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s + +; CHECK-LABEL: .func test( +define void @test(double* %dp0, double addrspace(1)* %dp1, double addrspace(3)* %dp3, double %d) { +; CHECK: atom.add.f64 + %r1 = call double @llvm.nvvm.atomic.load.add.f64.p0f64(double* %dp0, double %d) +; CHECK: atom.global.add.f64 + %r2 = call double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* %dp1, double %d) +; CHECK: atom.shared.add.f64 + %ret = call double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* %dp3, double %d) + ret void +} + +declare double @llvm.nvvm.atomic.load.add.f64.p0f64(double* nocapture, double) #1 +declare double @llvm.nvvm.atomic.load.add.f64.p1f64(double addrspace(1)* nocapture, double) #1 +declare double @llvm.nvvm.atomic.load.add.f64.p3f64(double addrspace(3)* nocapture, double) #1 + +attributes #1 = { argmemonly nounwind } diff --git a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll index f874148c0e83..5df5183dc2fb 100644 --- a/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll +++ b/test/CodeGen/NVPTX/generic-to-nvvm-ir.ll @@ -1,6 +1,6 @@ ; Verify functionality of NVPTXGenericToNVVM.cpp pass. ; -; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm -verify-debug-info | FileCheck %s +; RUN: opt < %s -march nvptx64 -S -generic-to-nvvm | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" diff --git a/test/CodeGen/PowerPC/bswap64.ll b/test/CodeGen/PowerPC/bswap64.ll new file mode 100644 index 000000000000..0a78aa2dc548 --- /dev/null +++ b/test/CodeGen/PowerPC/bswap64.ll @@ -0,0 +1,13 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc64le-- -mcpu=pwr9 | FileCheck %s + +declare i64 @llvm.bswap.i64(i64) + +; CHECK: mtvsrdd +; CHECK: xxbrd +; CHECK: mfvsrd +define i64 @bswap64(i64 %x) { +entry: + %0 = call i64 @llvm.bswap.i64(i64 %x) + ret i64 %0 +} + diff --git a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll index 31bbc4b13516..c8c7d797c005 100644 --- a/test/CodeGen/PowerPC/p9-vinsert-vextract.ll +++ b/test/CodeGen/PowerPC/p9-vinsert-vextract.ll @@ -298,3 +298,825 @@ entry: ret <8 x i16> %vecins } +; The following testcases take one byte element from the second vector and +; insert it at various locations in the first vector +define <16 x i8> @shuffle_vector_byte_0_16(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_0_16 +; CHECK: vsldoi 3, 3, 3, 8 +; CHECK: vinsertb 2, 3, 15 +; CHECK-BE-LABEL: shuffle_vector_byte_0_16 +; CHECK-BE: vsldoi 3, 3, 3, 9 +; CHECK-BE: vinsertb 2, 3, 0 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_1_25(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_1_25 +; CHECK: vsldoi 3, 3, 3, 15 +; CHECK: vinsertb 2, 3, 14 +; CHECK-BE-LABEL: shuffle_vector_byte_1_25 +; CHECK-BE: vsldoi 3, 3, 3, 2 +; CHECK-BE: vinsertb 2, 3, 1 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 25, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_2_18(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_2_18 +; CHECK: vsldoi 3, 3, 3, 6 +; CHECK: vinsertb 2, 3, 13 +; 
CHECK-BE-LABEL: shuffle_vector_byte_2_18 +; CHECK-BE: vsldoi 3, 3, 3, 11 +; CHECK-BE: vinsertb 2, 3, 2 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_3_27(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_3_27 +; CHECK: vsldoi 3, 3, 3, 13 +; CHECK: vinsertb 2, 3, 12 +; CHECK-BE-LABEL: shuffle_vector_byte_3_27 +; CHECK-BE: vsldoi 3, 3, 3, 4 +; CHECK-BE: vinsertb 2, 3, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_4_20(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_4_20 +; CHECK: vsldoi 3, 3, 3, 4 +; CHECK: vinsertb 2, 3, 11 +; CHECK-BE-LABEL: shuffle_vector_byte_4_20 +; CHECK-BE: vsldoi 3, 3, 3, 13 +; CHECK-BE: vinsertb 2, 3, 4 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_5_29(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_5_29 +; CHECK: vsldoi 3, 3, 3, 11 +; CHECK: vinsertb 2, 3, 10 +; CHECK-BE-LABEL: shuffle_vector_byte_5_29 +; CHECK-BE: vsldoi 3, 3, 3, 6 +; CHECK-BE: vinsertb 2, 3, 5 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 29, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_6_22(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_6_22 +; CHECK: vsldoi 3, 3, 3, 2 +; CHECK: vinsertb 2, 3, 9 +; CHECK-BE-LABEL: shuffle_vector_byte_6_22 +; CHECK-BE: vsldoi 3, 3, 3, 15 +; CHECK-BE: vinsertb 2, 3, 6 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_7_31(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_7_31 +; CHECK: vsldoi 3, 3, 3, 9 +; CHECK: vinsertb 2, 3, 8 +; CHECK-BE-LABEL: shuffle_vector_byte_7_31 +; CHECK-BE: vsldoi 3, 3, 3, 8 +; CHECK-BE: vinsertb 2, 3, 7 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_8_24(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_8_24 +; CHECK: vinsertb 2, 3, 7 +; CHECK-BE-LABEL: shuffle_vector_byte_8_24 +; CHECK-BE: vsldoi 3, 3, 3, 1 +; CHECK-BE: vinsertb 2, 3, 8 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_9_17(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_9_17 +; CHECK: vsldoi 3, 3, 3, 7 +; CHECK: vinsertb 2, 3, 6 +; CHECK-BE-LABEL: shuffle_vector_byte_9_17 +; CHECK-BE: vsldoi 3, 3, 3, 10 +; CHECK-BE: vinsertb 
2, 3, 9 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_10_26(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_10_26 +; CHECK: vsldoi 3, 3, 3, 14 +; CHECK: vinsertb 2, 3, 5 +; CHECK-BE-LABEL: shuffle_vector_byte_10_26 +; CHECK-BE: vsldoi 3, 3, 3, 3 +; CHECK-BE: vinsertb 2, 3, 10 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_11_19(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_11_19 +; CHECK: vsldoi 3, 3, 3, 5 +; CHECK: vinsertb 2, 3, 4 +; CHECK-BE-LABEL: shuffle_vector_byte_11_19 +; CHECK-BE: vsldoi 3, 3, 3, 12 +; CHECK-BE: vinsertb 2, 3, 11 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 19, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_12_28(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_12_28 +; CHECK: vsldoi 3, 3, 3, 12 +; CHECK: vinsertb 2, 3, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_12_28 +; CHECK-BE: vsldoi 3, 3, 3, 5 +; CHECK-BE: vinsertb 2, 3, 12 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_13_21(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_13_21 +; CHECK: vsldoi 3, 3, 3, 3 +; CHECK: vinsertb 2, 3, 2 +; CHECK-BE-LABEL: shuffle_vector_byte_13_21 +; CHECK-BE: vsldoi 3, 3, 3, 14 +; CHECK-BE: vinsertb 2, 3, 13 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_14_30(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_14_30 +; CHECK: vsldoi 3, 3, 3, 10 +; CHECK: vinsertb 2, 3, 1 +; CHECK-BE-LABEL: shuffle_vector_byte_14_30 +; CHECK-BE: vsldoi 3, 3, 3, 7 +; CHECK-BE: vinsertb 2, 3, 14 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_15_23(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_15_23 +; CHECK: vsldoi 3, 3, 3, 1 +; CHECK: vinsertb 2, 3, 0 +; CHECK-BE-LABEL: shuffle_vector_byte_15_23 +; CHECK-BE: vinsertb 2, 3, 15 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23> + ret <16 x i8> %vecins +} + +; The following testcases take one byte element from the first vector and +; inserts it at various locations in the second vector +define <16 x i8> @shuffle_vector_byte_16_8(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_16_8 +; CHECK: vinsertb 3, 2, 15 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_16_8 +; CHECK-BE: 
vsldoi 2, 2, 2, 1 +; CHECK-BE: vinsertb 3, 2, 0 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_17_1(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_17_1 +; CHECK: vsldoi 2, 2, 2, 7 +; CHECK: vinsertb 3, 2, 14 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_17_1 +; CHECK-BE: vsldoi 2, 2, 2, 10 +; CHECK-BE: vinsertb 3, 2, 1 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_18_10(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_18_10 +; CHECK: vsldoi 2, 2, 2, 14 +; CHECK: vinsertb 3, 2, 13 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_18_10 +; CHECK-BE: vsldoi 2, 2, 2, 3 +; CHECK-BE: vinsertb 3, 2, 2 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 10, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_19_3(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_19_3 +; CHECK: vsldoi 2, 2, 2, 5 +; CHECK: vinsertb 3, 2, 12 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_19_3 +; CHECK-BE: vsldoi 2, 2, 2, 12 +; CHECK-BE: vinsertb 3, 2, 3 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_20_12(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_20_12 +; CHECK: vsldoi 2, 2, 2, 12 +; CHECK: vinsertb 3, 2, 11 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_20_12 +; CHECK-BE: vsldoi 2, 2, 2, 5 +; CHECK-BE: vinsertb 3, 2, 4 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 12, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_21_5(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_21_5 +; CHECK: vsldoi 2, 2, 2, 3 +; CHECK: vinsertb 3, 2, 10 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_21_5 +; CHECK-BE: vsldoi 2, 2, 2, 14 +; CHECK-BE: vinsertb 3, 2, 5 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_22_14(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_22_14 +; CHECK: vsldoi 2, 2, 2, 10 +; CHECK: vinsertb 3, 2, 9 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_22_14 +; CHECK-BE: vsldoi 2, 2, 2, 7 +; CHECK-BE: vinsertb 3, 2, 6 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 14, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, 
i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_23_7(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_23_7 +; CHECK: vsldoi 2, 2, 2, 1 +; CHECK: vinsertb 3, 2, 8 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_23_7 +; CHECK-BE: vinsertb 3, 2, 7 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_24_0(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_24_0 +; CHECK: vsldoi 2, 2, 2, 8 +; CHECK: vinsertb 3, 2, 7 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_24_0 +; CHECK-BE: vsldoi 2, 2, 2, 9 +; CHECK-BE: vinsertb 3, 2, 8 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_25_9(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_25_9 +; CHECK: vsldoi 2, 2, 2, 15 +; CHECK: vinsertb 3, 2, 6 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_25_9 +; CHECK-BE: vsldoi 2, 2, 2, 2 +; CHECK-BE: vinsertb 3, 2, 9 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_26_2(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_26_2 +; CHECK: vsldoi 2, 2, 2, 6 +; CHECK: vinsertb 3, 2, 5 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_26_2 +; CHECK-BE: vsldoi 2, 2, 2, 11 +; CHECK-BE: vinsertb 3, 2, 10 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 2, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_27_11(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_27_11 +; CHECK: vsldoi 2, 2, 2, 13 +; CHECK: vinsertb 3, 2, 4 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_27_11 +; CHECK-BE: vsldoi 2, 2, 2, 4 +; CHECK-BE: vinsertb 3, 2, 11 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 11, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_28_4(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_28_4 +; CHECK: vsldoi 2, 2, 2, 4 +; CHECK: vinsertb 3, 2, 3 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_28_4 +; CHECK-BE: vsldoi 2, 2, 2, 13 +; CHECK-BE: vinsertb 3, 2, 12 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 4, i32 29, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_29_13(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_29_13 +; CHECK: vsldoi 2, 2, 2, 11 +; CHECK: vinsertb 3, 2, 2 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_29_13 +; CHECK-BE: 
vsldoi 2, 2, 2, 6 +; CHECK-BE: vinsertb 3, 2, 13 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 13, i32 30, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_30_6(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_30_6 +; CHECK: vsldoi 2, 2, 2, 2 +; CHECK: vinsertb 3, 2, 1 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_30_6 +; CHECK-BE: vsldoi 2, 2, 2, 15 +; CHECK-BE: vinsertb 3, 2, 14 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 6, i32 31> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_31_15(<16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: shuffle_vector_byte_31_15 +; CHECK: vsldoi 2, 2, 2, 9 +; CHECK: vinsertb 3, 2, 0 +; CHECK: vmr 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_31_15 +; CHECK-BE: vsldoi 2, 2, 2, 8 +; CHECK-BE: vinsertb 3, 2, 15 +; CHECK-BE: vmr 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 15> + ret <16 x i8> %vecins +} + +; The following testcases use the same vector in both arguments of the +; shufflevector. If byte element 7 in BE mode(or 8 in LE mode) is the one +; we're attempting to insert, then we can use the vector insert instruction +define <16 x i8> @shuffle_vector_byte_0_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_0_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_0_7 +; CHECK-BE: vinsertb 2, 2, 0 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_1_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_1_8 +; CHECK: vinsertb 2, 2, 14 +; CHECK-BE-LABEL: shuffle_vector_byte_1_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_2_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_2_8 +; CHECK: vinsertb 2, 2, 13 +; CHECK-BE-LABEL: shuffle_vector_byte_2_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_3_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_3_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_3_7 +; CHECK-BE: vinsertb 2, 2, 3 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_4_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_4_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_4_7 +; CHECK-BE: vinsertb 2, 2, 4 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> 
<i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_5_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_5_8 +; CHECK: vinsertb 2, 2, 10 +; CHECK-BE-LABEL: shuffle_vector_byte_5_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_6_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_6_8 +; CHECK: vinsertb 2, 2, 9 +; CHECK-BE-LABEL: shuffle_vector_byte_6_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_7_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_7_8 +; CHECK: vinsertb 2, 2, 8 +; CHECK-BE-LABEL: shuffle_vector_byte_7_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_8_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_8_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_8_7 +; CHECK-BE: vinsertb 2, 2, 8 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_9_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_9_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_9_7 +; CHECK-BE: vinsertb 2, 2, 9 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_10_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_10_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_10_7 +; CHECK-BE: vinsertb 2, 2, 10 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 7, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_11_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_11_8 +; CHECK: vinsertb 2, 2, 4 +; CHECK-BE-LABEL: shuffle_vector_byte_11_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 8, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_12_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_12_8 +; CHECK: vinsertb 2, 2, 3 +; CHECK-BE-LABEL: shuffle_vector_byte_12_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 13, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_13_7(<16 x i8> %a) { +entry: +; 
CHECK-LABEL: shuffle_vector_byte_13_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_13_7 +; CHECK-BE: vinsertb 2, 2, 13 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 7, i32 14, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_14_7(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_14_7 +; CHECK-NOT: vinsertb +; CHECK-BE-LABEL: shuffle_vector_byte_14_7 +; CHECK-BE: vinsertb 2, 2, 14 + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 15> + ret <16 x i8> %vecins +} + +define <16 x i8> @shuffle_vector_byte_15_8(<16 x i8> %a) { +entry: +; CHECK-LABEL: shuffle_vector_byte_15_8 +; CHECK: vinsertb 2, 2, 0 +; CHECK-BE-LABEL: shuffle_vector_byte_15_8 +; CHECK-BE-NOT: vinsertb + %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8> + ret <16 x i8> %vecins +} + +; The following tests try to insert one halfword element into the vector. We +; should always be using the 'vinserth' instruction. +define <8 x i16> @insert_halfword_0(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_0 +; CHECK: vinserth 2, 3, 14 +; CHECK-BE-LABEL: insert_halfword_0 +; CHECK-BE: vinserth 2, 3, 0 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 0 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_1(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_1 +; CHECK: vinserth 2, 3, 12 +; CHECK-BE-LABEL: insert_halfword_1 +; CHECK-BE: vinserth 2, 3, 2 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 1 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_2(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_2 +; CHECK: vinserth 2, 3, 10 +; CHECK-BE-LABEL: insert_halfword_2 +; CHECK-BE: vinserth 2, 3, 4 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 2 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_3(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_3 +; CHECK: vinserth 2, 3, 8 +; CHECK-BE-LABEL: insert_halfword_3 +; CHECK-BE: vinserth 2, 3, 6 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 3 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_4(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_4 +; CHECK: vinserth 2, 3, 6 +; CHECK-BE-LABEL: insert_halfword_4 +; CHECK-BE: vinserth 2, 3, 8 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 4 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_5(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_5 +; CHECK: vinserth 2, 3, 4 +; CHECK-BE-LABEL: insert_halfword_5 +; CHECK-BE: vinserth 2, 3, 10 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 5 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_6(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_6 +; CHECK: vinserth 2, 3, 2 +; CHECK-BE-LABEL: insert_halfword_6 +; CHECK-BE: vinserth 2, 3, 12 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 6 + ret <8 x i16> %vecins +} + +define <8 x i16> @insert_halfword_7(<8 x i16> %a, i16 %b) { +entry: +; CHECK-LABEL: insert_halfword_7 +; CHECK: vinserth 2, 3, 0 +; CHECK-BE-LABEL: insert_halfword_7 +; CHECK-BE: vinserth 2, 3, 14 + %vecins = insertelement <8 x i16> %a, i16 %b, i32 7 + ret <8 x i16> 
%vecins +} + +; The following tests try to insert one byte element into the vector. We +; should always be using the 'vinsertb' instruction. +define <16 x i8> @insert_byte_0(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_0 +; CHECK: vinsertb 2, 3, 15 +; CHECK-BE-LABEL: insert_byte_0 +; CHECK-BE: vinsertb 2, 3, 0 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 0 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_1(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_1 +; CHECK: vinsertb 2, 3, 14 +; CHECK-BE-LABEL: insert_byte_1 +; CHECK-BE: vinsertb 2, 3, 1 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 1 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_2(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_2 +; CHECK: vinsertb 2, 3, 13 +; CHECK-BE-LABEL: insert_byte_2 +; CHECK-BE: vinsertb 2, 3, 2 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 2 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_3(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_3 +; CHECK: vinsertb 2, 3, 12 +; CHECK-BE-LABEL: insert_byte_3 +; CHECK-BE: vinsertb 2, 3, 3 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 3 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_4(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_4 +; CHECK: vinsertb 2, 3, 11 +; CHECK-BE-LABEL: insert_byte_4 +; CHECK-BE: vinsertb 2, 3, 4 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 4 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_5(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_5 +; CHECK: vinsertb 2, 3, 10 +; CHECK-BE-LABEL: insert_byte_5 +; CHECK-BE: vinsertb 2, 3, 5 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 5 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_6(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_6 +; CHECK: vinsertb 2, 3, 9 +; CHECK-BE-LABEL: insert_byte_6 +; CHECK-BE: vinsertb 2, 3, 6 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 6 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_7(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_7 +; CHECK: vinsertb 2, 3, 8 +; CHECK-BE-LABEL: insert_byte_7 +; CHECK-BE: vinsertb 2, 3, 7 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 7 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_8(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_8 +; CHECK: vinsertb 2, 3, 7 +; CHECK-BE-LABEL: insert_byte_8 +; CHECK-BE: vinsertb 2, 3, 8 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 8 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_9(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_9 +; CHECK: vinsertb 2, 3, 6 +; CHECK-BE-LABEL: insert_byte_9 +; CHECK-BE: vinsertb 2, 3, 9 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 9 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_10(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_10 +; CHECK: vinsertb 2, 3, 5 +; CHECK-BE-LABEL: insert_byte_10 +; CHECK-BE: vinsertb 2, 3, 10 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 10 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_11(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_11 +; CHECK: vinsertb 2, 3, 4 +; CHECK-BE-LABEL: insert_byte_11 +; CHECK-BE: vinsertb 2, 3, 11 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 11 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_12(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_12 +; CHECK: vinsertb 2, 3, 3 +; CHECK-BE-LABEL: insert_byte_12 +; CHECK-BE: vinsertb 2, 3, 12 + %vecins = 
insertelement <16 x i8> %a, i8 %b, i32 12 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_13(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_13 +; CHECK: vinsertb 2, 3, 2 +; CHECK-BE-LABEL: insert_byte_13 +; CHECK-BE: vinsertb 2, 3, 13 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 13 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_14(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_14 +; CHECK: vinsertb 2, 3, 1 +; CHECK-BE-LABEL: insert_byte_14 +; CHECK-BE: vinsertb 2, 3, 14 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 14 + ret <16 x i8> %vecins +} + +define <16 x i8> @insert_byte_15(<16 x i8> %a, i8 %b) { +entry: +; CHECK-LABEL: insert_byte_15 +; CHECK: vinsertb 2, 3, 0 +; CHECK-BE-LABEL: insert_byte_15 +; CHECK-BE: vinsertb 2, 3, 15 + %vecins = insertelement <16 x i8> %a, i8 %b, i32 15 + ret <16 x i8> %vecins +} diff --git a/test/CodeGen/PowerPC/subreg-postra-2.ll b/test/CodeGen/PowerPC/subreg-postra-2.ll index 338000cd8bae..794c9c190d1c 100644 --- a/test/CodeGen/PowerPC/subreg-postra-2.ll +++ b/test/CodeGen/PowerPC/subreg-postra-2.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gep-opt=0 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-gen-isel=false -ppc-gep-opt=0 < %s | FileCheck --check-prefix=CHECK-NO-ISEL %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" @@ -38,10 +38,10 @@ while.end418: ; preds = %wait_on_buffer.exit ; CHECK: stdcx. ; CHECK: isel {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, [[REG]] ; CHECK-NO-ISEL: bc 12, 20, [[TRUE:.LBB[0-9]+]] -; CHECK-NO-ISEL: ori 4, 7, 0 +; CHECK-NO-ISEL: ori 7, 8, 0 ; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] ; CHECK-NO-ISEL: [[TRUE]] -; CHECK-NO-ISEL-NEXT: addi 4, 3, 0 +; CHECK-NO-ISEL: addi 7, 3, 0 if.then420: ; preds = %while.end418 unreachable diff --git a/test/CodeGen/RISCV/alu32.ll b/test/CodeGen/RISCV/alu32.ll index 32242d2e40d3..9db6bb9dd434 100644 --- a/test/CodeGen/RISCV/alu32.ll +++ b/test/CodeGen/RISCV/alu32.ll @@ -7,7 +7,6 @@ define i32 @addi(i32 %a) nounwind { ; RV32I-LABEL: addi: ; RV32I: addi a0, a0, 1 ; RV32I: jalr zero, ra, 0 -; TODO: check support for materialising larger constants %1 = add i32 %a, 1 ret i32 %1 } diff --git a/test/CodeGen/RISCV/branch.ll b/test/CodeGen/RISCV/branch.ll new file mode 100644 index 000000000000..194083b07c71 --- /dev/null +++ b/test/CodeGen/RISCV/branch.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s + +define void @foo(i32 %a, i32 *%b, i1 %c) { +; RV32I-LABEL: foo: +; RV32I: # BB#0: +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: beq a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_1 +; RV32I-NEXT: .LBB0_1: # %test2 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bne a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_2 +; RV32I-NEXT: .LBB0_2: # %test3 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: blt a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_3 +; RV32I-NEXT: .LBB0_3: # %test4 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bge a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_4 +; RV32I-NEXT: .LBB0_4: # %test5 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bltu a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_5 +; RV32I-NEXT: .LBB0_5: # %test6 +; RV32I-NEXT: 
lw a3, 0(a1) +; RV32I-NEXT: bgeu a3, a0, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_6 +; RV32I-NEXT: .LBB0_6: # %test7 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: blt a0, a3, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_7 +; RV32I-NEXT: .LBB0_7: # %test8 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bge a0, a3, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_8 +; RV32I-NEXT: .LBB0_8: # %test9 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bltu a0, a3, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_9 +; RV32I-NEXT: .LBB0_9: # %test10 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: bgeu a0, a3, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_10 +; RV32I-NEXT: .LBB0_10: # %test11 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: andi a0, a2, 1 +; RV32I-NEXT: bne a0, zero, .LBB0_12 +; RV32I-NEXT: jal zero, .LBB0_11 +; RV32I-NEXT: .LBB0_11: # %test12 +; RV32I-NEXT: lw a0, 0(a1) +; RV32I-NEXT: .LBB0_12: # %end +; RV32I-NEXT: jalr zero, ra, 0 + + %val1 = load volatile i32, i32* %b + %tst1 = icmp eq i32 %val1, %a + br i1 %tst1, label %end, label %test2 + +test2: + %val2 = load volatile i32, i32* %b + %tst2 = icmp ne i32 %val2, %a + br i1 %tst2, label %end, label %test3 + +test3: + %val3 = load volatile i32, i32* %b + %tst3 = icmp slt i32 %val3, %a + br i1 %tst3, label %end, label %test4 + +test4: + %val4 = load volatile i32, i32* %b + %tst4 = icmp sge i32 %val4, %a + br i1 %tst4, label %end, label %test5 + +test5: + %val5 = load volatile i32, i32* %b + %tst5 = icmp ult i32 %val5, %a + br i1 %tst5, label %end, label %test6 + +test6: + %val6 = load volatile i32, i32* %b + %tst6 = icmp uge i32 %val6, %a + br i1 %tst6, label %end, label %test7 + +; Check for condition codes that don't have a matching instruction + +test7: + %val7 = load volatile i32, i32* %b + %tst7 = icmp sgt i32 %val7, %a + br i1 %tst7, label %end, label %test8 + +test8: + %val8 = load volatile i32, i32* %b + %tst8 = icmp sle i32 %val8, %a + br i1 %tst8, label %end, label %test9 + +test9: + %val9 = load volatile i32, i32* %b + %tst9 = icmp ugt i32 %val9, %a + br i1 %tst9, label %end, label %test10 + +test10: + %val10 = load volatile i32, i32* %b + %tst10 = icmp ule i32 %val10, %a + br i1 %tst10, label %end, label %test11 + +; Check the case of a branch where the condition was generated in another +; function + +test11: + %val11 = load volatile i32, i32* %b + br i1 %c, label %end, label %test12 + +test12: + %val12 = load volatile i32, i32* %b + br label %end + +end: + ret void +} diff --git a/test/CodeGen/RISCV/calls.ll b/test/CodeGen/RISCV/calls.ll new file mode 100644 index 000000000000..8abe5e92a8e0 --- /dev/null +++ b/test/CodeGen/RISCV/calls.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s + +declare i32 @external_function(i32) + +define i32 @test_call_external(i32 %a) nounwind { +; RV32I-LABEL: test_call_external: +; RV32I: # BB#0: +; RV32I-NEXT: sw ra, 12(s0) +; RV32I-NEXT: lui a1, %hi(external_function) +; RV32I-NEXT: addi a1, a1, %lo(external_function) +; RV32I-NEXT: jalr ra, a1, 0 +; RV32I-NEXT: lw ra, 12(s0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = call i32 @external_function(i32 %a) + ret i32 %1 +} + +define i32 @defined_function(i32 %a) nounwind { +; RV32I-LABEL: defined_function: +; RV32I: # BB#0: +; RV32I-NEXT: addi a0, a0, 1 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = add i32 %a, 1 + ret i32 %1 +} + +define i32 @test_call_defined(i32 %a) nounwind { +; RV32I-LABEL: test_call_defined: +; RV32I: # BB#0: +; RV32I-NEXT: sw ra, 
12(s0) +; RV32I-NEXT: lui a1, %hi(defined_function) +; RV32I-NEXT: addi a1, a1, %lo(defined_function) +; RV32I-NEXT: jalr ra, a1, 0 +; RV32I-NEXT: lw ra, 12(s0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = call i32 @defined_function(i32 %a) nounwind + ret i32 %1 +} + +define i32 @test_call_indirect(i32 (i32)* %a, i32 %b) nounwind { +; RV32I-LABEL: test_call_indirect: +; RV32I: # BB#0: +; RV32I-NEXT: sw ra, 12(s0) +; RV32I-NEXT: addi a2, a0, 0 +; RV32I-NEXT: addi a0, a1, 0 +; RV32I-NEXT: jalr ra, a2, 0 +; RV32I-NEXT: lw ra, 12(s0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = call i32 %a(i32 %b) + ret i32 %1 +} + +; Ensure that calls to fastcc functions aren't rejected. Such calls may be +; introduced when compiling with optimisation. + +define fastcc i32 @fastcc_function(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: fastcc_function: +; RV32I: # BB#0: +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = add i32 %a, %b + ret i32 %1 +} + +define i32 @test_call_fastcc(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: test_call_fastcc: +; RV32I: # BB#0: +; RV32I-NEXT: sw ra, 12(s0) +; RV32I-NEXT: sw s1, 8(s0) +; RV32I-NEXT: addi s1, a0, 0 +; RV32I-NEXT: lui a0, %hi(fastcc_function) +; RV32I-NEXT: addi a2, a0, %lo(fastcc_function) +; RV32I-NEXT: addi a0, s1, 0 +; RV32I-NEXT: jalr ra, a2, 0 +; RV32I-NEXT: addi a0, s1, 0 +; RV32I-NEXT: lw s1, 8(s0) +; RV32I-NEXT: lw ra, 12(s0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = call fastcc i32 @fastcc_function(i32 %a, i32 %b) + ret i32 %a +} diff --git a/test/CodeGen/RISCV/imm.ll b/test/CodeGen/RISCV/imm.ll new file mode 100644 index 000000000000..c52638da02eb --- /dev/null +++ b/test/CodeGen/RISCV/imm.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; Materializing constants + +define i32 @zero() nounwind { +; RV32I-LABEL: zero: +; RV32I: # BB#0: +; RV32I-NEXT: addi a0, zero, 0 +; RV32I-NEXT: jalr zero, ra, 0 + ret i32 0 +} + +define i32 @pos_small() nounwind { +; RV32I-LABEL: pos_small: +; RV32I: # BB#0: +; RV32I-NEXT: addi a0, zero, 2047 +; RV32I-NEXT: jalr zero, ra, 0 + ret i32 2047 +} + +define i32 @neg_small() nounwind { +; RV32I-LABEL: neg_small: +; RV32I: # BB#0: +; RV32I-NEXT: addi a0, zero, -2048 +; RV32I-NEXT: jalr zero, ra, 0 + ret i32 -2048 +} + +define i32 @pos_i32() nounwind { +; RV32I-LABEL: pos_i32: +; RV32I: # BB#0: +; RV32I-NEXT: lui a0, 423811 +; RV32I-NEXT: addi a0, a0, -1297 +; RV32I-NEXT: jalr zero, ra, 0 + ret i32 1735928559 +} + +define i32 @neg_i32() nounwind { +; RV32I-LABEL: neg_i32: +; RV32I: # BB#0: +; RV32I-NEXT: lui a0, 912092 +; RV32I-NEXT: addi a0, a0, -273 +; RV32I-NEXT: jalr zero, ra, 0 + ret i32 -559038737 +} diff --git a/test/CodeGen/RISCV/mem.ll b/test/CodeGen/RISCV/mem.ll new file mode 100644 index 000000000000..b06382f8742a --- /dev/null +++ b/test/CodeGen/RISCV/mem.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; Check indexed and unindexed, sext, zext and anyext loads + +define i32 @lb(i8 *%a) nounwind { +; RV32I-LABEL: lb: +; RV32I: # BB#0: +; RV32I-NEXT: lb a1, 0(a0) +; RV32I-NEXT: lb a0, 1(a0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = getelementptr i8, i8* %a, i32 1 + %2 = load i8, i8* %1 + %3 = sext i8 %2 to i32 + ; the unused load will produce an anyext for selection + %4 = load volatile i8, i8* %a + ret i32 %3 +} + 
+define i32 @lh(i16 *%a) nounwind { +; RV32I-LABEL: lh: +; RV32I: # BB#0: +; RV32I-NEXT: lh a1, 0(a0) +; RV32I-NEXT: lh a0, 4(a0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = getelementptr i16, i16* %a, i32 2 + %2 = load i16, i16* %1 + %3 = sext i16 %2 to i32 + ; the unused load will produce an anyext for selection + %4 = load volatile i16, i16* %a + ret i32 %3 +} + +define i32 @lw(i32 *%a) nounwind { +; RV32I-LABEL: lw: +; RV32I: # BB#0: +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = getelementptr i32, i32* %a, i32 3 + %2 = load i32, i32* %1 + %3 = load volatile i32, i32* %a + ret i32 %2 +} + +define i32 @lbu(i8 *%a) nounwind { +; RV32I-LABEL: lbu: +; RV32I: # BB#0: +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a0, 4(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = getelementptr i8, i8* %a, i32 4 + %2 = load i8, i8* %1 + %3 = zext i8 %2 to i32 + %4 = load volatile i8, i8* %a + %5 = zext i8 %4 to i32 + %6 = add i32 %3, %5 + ret i32 %6 +} + +define i32 @lhu(i16 *%a) nounwind { +; RV32I-LABEL: lhu: +; RV32I: # BB#0: +; RV32I-NEXT: lhu a1, 0(a0) +; RV32I-NEXT: lhu a0, 10(a0) +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = getelementptr i16, i16* %a, i32 5 + %2 = load i16, i16* %1 + %3 = zext i16 %2 to i32 + %4 = load volatile i16, i16* %a + %5 = zext i16 %4 to i32 + %6 = add i32 %3, %5 + ret i32 %6 +} + +; Check indexed and unindexed stores + +define void @sb(i8 *%a, i8 %b) nounwind { +; RV32I-LABEL: sb: +; RV32I: # BB#0: +; RV32I-NEXT: sb a1, 6(a0) +; RV32I-NEXT: sb a1, 0(a0) +; RV32I-NEXT: jalr zero, ra, 0 + store i8 %b, i8* %a + %1 = getelementptr i8, i8* %a, i32 6 + store i8 %b, i8* %1 + ret void +} + +define void @sh(i16 *%a, i16 %b) nounwind { +; RV32I-LABEL: sh: +; RV32I: # BB#0: +; RV32I-NEXT: sh a1, 14(a0) +; RV32I-NEXT: sh a1, 0(a0) +; RV32I-NEXT: jalr zero, ra, 0 + store i16 %b, i16* %a + %1 = getelementptr i16, i16* %a, i32 7 + store i16 %b, i16* %1 + ret void +} + +define void @sw(i32 *%a, i32 %b) nounwind { +; RV32I-LABEL: sw: +; RV32I: # BB#0: +; RV32I-NEXT: sw a1, 32(a0) +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: jalr zero, ra, 0 + store i32 %b, i32* %a + %1 = getelementptr i32, i32* %a, i32 8 + store i32 %b, i32* %1 + ret void +} + +; Check load and store to an i1 location +define i32 @load_sext_zext_anyext_i1(i1 *%a) nounwind { +; RV32I-LABEL: load_sext_zext_anyext_i1: +; RV32I: # BB#0: +; RV32I-NEXT: lb a1, 0(a0) +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: lbu a0, 2(a0) +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: jalr zero, ra, 0 + ; sextload i1 + %1 = getelementptr i1, i1* %a, i32 1 + %2 = load i1, i1* %1 + %3 = sext i1 %2 to i32 + ; zextload i1 + %4 = getelementptr i1, i1* %a, i32 2 + %5 = load i1, i1* %4 + %6 = zext i1 %5 to i32 + %7 = add i32 %3, %6 + ; extload i1 (anyext). Produced as the load is unused. + %8 = load volatile i1, i1* %a + ret i32 %7 +} + +define i16 @load_sext_zext_anyext_i1_i16(i1 *%a) nounwind { +; RV32I-LABEL: load_sext_zext_anyext_i1_i16: +; RV32I: # BB#0: +; RV32I-NEXT: lb a1, 0(a0) +; RV32I-NEXT: lbu a1, 1(a0) +; RV32I-NEXT: lbu a0, 2(a0) +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: jalr zero, ra, 0 + ; sextload i1 + %1 = getelementptr i1, i1* %a, i32 1 + %2 = load i1, i1* %1 + %3 = sext i1 %2 to i16 + ; zextload i1 + %4 = getelementptr i1, i1* %a, i32 2 + %5 = load i1, i1* %4 + %6 = zext i1 %5 to i16 + %7 = add i16 %3, %6 + ; extload i1 (anyext). Produced as the load is unused. 
+ %8 = load volatile i1, i1* %a + ret i16 %7 +} + +; Check load and store to a global +@G = global i32 0 + +define i32 @lw_sw_global(i32 %a) nounwind { +; TODO: the addi should be folded in to the lw/sw operations +; RV32I-LABEL: lw_sw_global: +; RV32I: # BB#0: +; RV32I-NEXT: lui a1, %hi(G) +; RV32I-NEXT: addi a2, a1, %lo(G) +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: lui a2, %hi(G+36) +; RV32I-NEXT: addi a2, a2, %lo(G+36) +; RV32I-NEXT: lw a3, 0(a2) +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi a0, a1, 0 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = load volatile i32, i32* @G + store i32 %a, i32* @G + %2 = getelementptr i32, i32* @G, i32 9 + %3 = load volatile i32, i32* %2 + store i32 %a, i32* %2 + ret i32 %1 +} + +; Ensure that 1 is added to the high 20 bits if bit 11 of the low part is 1 +define i32 @lw_sw_constant(i32 %a) nounwind { +; TODO: the addi should be folded in to the lw/sw +; RV32I-LABEL: lw_sw_constant: +; RV32I: # BB#0: +; RV32I-NEXT: lui a1, 912092 +; RV32I-NEXT: addi a2, a1, -273 +; RV32I-NEXT: lw a1, 0(a2) +; RV32I-NEXT: sw a0, 0(a2) +; RV32I-NEXT: addi a0, a1, 0 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = inttoptr i32 3735928559 to i32* + %2 = load volatile i32, i32* %1 + store i32 %a, i32* %1 + ret i32 %2 +} diff --git a/test/CodeGen/RISCV/wide-mem.ll b/test/CodeGen/RISCV/wide-mem.ll new file mode 100644 index 000000000000..18ab52aaf138 --- /dev/null +++ b/test/CodeGen/RISCV/wide-mem.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; Check load/store operations on values wider than what is natively supported + +define i64 @load_i64(i64 *%a) nounwind { +; RV32I-LABEL: load_i64: +; RV32I: # BB#0: +; RV32I-NEXT: lw a2, 0(a0) +; RV32I-NEXT: lw a1, 4(a0) +; RV32I-NEXT: addi a0, a2, 0 +; RV32I-NEXT: jalr zero, ra, 0 + %1 = load i64, i64* %a + ret i64 %1 +} + +@val64 = local_unnamed_addr global i64 2863311530, align 8 + +; TODO: codegen on this should be improved. It shouldn't be necessary to +; generate two addi +define i64 @load_i64_global() nounwind { +; RV32I-LABEL: load_i64_global: +; RV32I: # BB#0: +; RV32I-NEXT: lui a0, %hi(val64) +; RV32I-NEXT: addi a0, a0, %lo(val64) +; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lui a1, %hi(val64+4) +; RV32I-NEXT: addi a1, a1, %lo(val64+4) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: jalr zero, ra, 0 + %1 = load i64, i64* @val64 + ret i64 %1 +} diff --git a/test/CodeGen/WebAssembly/inline-asm-m.ll b/test/CodeGen/WebAssembly/inline-asm-m.ll new file mode 100644 index 000000000000..8d514a528fd9 --- /dev/null +++ b/test/CodeGen/WebAssembly/inline-asm-m.ll @@ -0,0 +1,13 @@ +; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as + +; Test basic inline assembly "m" operands, which are unsupported. Pass +; -no-integrated-as since these aren't actually valid assembly syntax. 
+ +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown-wasm" + +define void @bar(i32* %r, i32* %s) { +entry: + tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1 + ret void +} diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll index 56576305d9e2..760b0ad0de60 100644 --- a/test/CodeGen/WebAssembly/inline-asm.ll +++ b/test/CodeGen/WebAssembly/inline-asm.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -no-integrated-as | FileCheck %s ; Test basic inline assembly. Pass -no-integrated-as since these aren't ; actually valid assembly syntax. @@ -10,33 +10,24 @@ target triple = "wasm32-unknown-unknown-wasm" ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result i32{{$}} ; CHECK-NEXT: #APP{{$}} -; CHECK-NEXT: # $0 = aaa($0){{$}} +; CHECK-NEXT: # 0 = aaa(0){{$}} ; CHECK-NEXT: #NO_APP{{$}} -; CHECK-NEXT: return $0{{$}} +; CHECK-NEXT: get_local $push0=, 0{{$}} +; CHECK-NEXT: return $pop0{{$}} define i32 @foo(i32 %r) { entry: %0 = tail call i32 asm sideeffect "# $0 = aaa($1)", "=r,r"(i32 %r) #0, !srcloc !0 ret i32 %0 } -; CHECK-LABEL: bar: -; CHECK-NEXT: .param i32, i32{{$}} -; CHECK-NEXT: #APP{{$}} -; CHECK-NEXT: # 0($1) = bbb(0($0)){{$}} -; CHECK-NEXT: #NO_APP{{$}} -; CHECK-NEXT: return{{$}} -define void @bar(i32* %r, i32* %s) { -entry: - tail call void asm sideeffect "# $0 = bbb($1)", "=*m,*m"(i32* %s, i32* %r) #0, !srcloc !1 - ret void -} - ; CHECK-LABEL: imm: ; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: .local i32{{$}} ; CHECK-NEXT: #APP{{$}} -; CHECK-NEXT: # $0 = ccc(42){{$}} +; CHECK-NEXT: # 0 = ccc(42){{$}} ; CHECK-NEXT: #NO_APP{{$}} -; CHECK-NEXT: return $0{{$}} +; CHECK-NEXT: get_local $push0=, 0{{$}} +; CHECK-NEXT: return $pop0{{$}} define i32 @imm() { entry: %0 = tail call i32 asm sideeffect "# $0 = ccc($1)", "=r,i"(i32 42) #0, !srcloc !2 @@ -47,9 +38,10 @@ entry: ; CHECK-NEXT: .param i64{{$}} ; CHECK-NEXT: .result i64{{$}} ; CHECK-NEXT: #APP{{$}} -; CHECK-NEXT: # $0 = aaa($0){{$}} +; CHECK-NEXT: # 0 = aaa(0){{$}} ; CHECK-NEXT: #NO_APP{{$}} -; CHECK-NEXT: return $0{{$}} +; CHECK-NEXT: get_local $push0=, 0{{$}} +; CHECK-NEXT: return $pop0{{$}} define i64 @foo_i64(i64 %r) { entry: %0 = tail call i64 asm sideeffect "# $0 = aaa($1)", "=r,r"(i64 %r) #0, !srcloc !0 @@ -57,16 +49,20 @@ entry: } ; CHECK-LABEL: X_i16: -; CHECK: foo $1{{$}} -; CHECK: i32.store16 0($0), $1{{$}} +; CHECK: foo 1{{$}} +; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}} +; CHECK-NEXT: i32.store16 0($pop[[S0]]), $pop[[S1]]{{$}} define void @X_i16(i16 * %t) { call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16* %t) ret void } ; CHECK-LABEL: X_ptr: -; CHECK: foo $1{{$}} -; CHECK: i32.store 0($0), $1{{$}} +; CHECK: foo 1{{$}} +; CHECK: get_local $push[[S0:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: get_local $push[[S1:[0-9]+]]=, 1{{$}} +; CHECK-NEXT: i32.store 0($pop[[S0]]), $pop[[S1]]{{$}} define void @X_ptr(i16 ** %t) { call void asm sideeffect "foo $0", "=*X,~{dirflag},~{fpsr},~{flags},~{memory}"(i16** %t) ret void @@ -87,6 +83,20 @@ define void @varname() { ret void } +; CHECK-LABEL: r_constraint +; CHECK: i32.const $push[[S0:[0-9]+]]=, 0{{$}} +; CHECK-NEXT: set_local [[L0:[0-9]+]], $pop[[S0]]{{$}} +; CHECK-NEXT: i32.const $push[[S1:[0-9]+]]=, 37{{$}} +; 
CHECK-NEXT: set_local [[L1:[0-9]+]], $pop[[S1]]{{$}} +; CHECK: foo [[L2:[0-9]+]], 1, [[L0]], [[L1]]{{$}} +; CHECK: get_local $push{{[0-9]+}}=, [[L2]]{{$}} +define hidden i32 @r_constraint(i32 %a, i32 %y) { +entry: + %z = bitcast i32 0 to i32 + %t0 = tail call i32 asm "foo $0, $1, $2, $3", "=r,r,r,r"(i32 %y, i32 %z, i32 37) #0, !srcloc !0 + ret i32 %t0 +} + attributes #0 = { nounwind } !0 = !{i32 47} diff --git a/test/CodeGen/WebAssembly/signext-arg.ll b/test/CodeGen/WebAssembly/signext-arg.ll new file mode 100644 index 000000000000..cd116c645b44 --- /dev/null +++ b/test/CodeGen/WebAssembly/signext-arg.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=wasm32 | FileCheck %s + +declare i32 @get_int(i16 %arg) + +define i32 @func_1(i16 %arg1 , i32 %arg2) #0 { +; CHECK-LABEL: func_1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: i32.const $push1=, 16 +; CHECK-NEXT: i32.shl $push2=, $0, $pop1 +; CHECK-NEXT: i32.const $push4=, 16 +; CHECK-NEXT: i32.shr_s $push3=, $pop2, $pop4 +; CHECK-NEXT: i32.call $push0=, get_int@FUNCTION, $pop3 +; CHECK-NEXT: # fallthrough-return: $pop0 +; CHECK-NEXT: .endfunc +entry: + %retval = call i32 @get_int(i16 signext %arg1) + ret i32 %retval +} + +attributes #0 = {noinline nounwind optnone} + diff --git a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll index 6814ed1d894e..109962c2859a 100644 --- a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll +++ b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll @@ -23,6 +23,7 @@ lpad: ; preds = %cont, %entry } ; CHECK: lpad +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: Ltmp declare i32 @__gxx_personality_v0(...) diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll index 416761ffef45..dd0591005036 100644 --- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -88,6 +88,7 @@ define void @full_test() { ; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: full_test: diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll index 64a6313023be..9d28f441fb7e 100644 --- a/test/CodeGen/X86/GlobalISel/add-scalar.ll +++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -20,6 +20,7 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) { ; X32-NEXT: addl 8(%ebp), %eax ; X32-NEXT: adcl 12(%ebp), %edx ; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 ; X32-NEXT: retl %ret = add i64 %arg1, %arg2 ret i64 %ret diff --git a/test/CodeGen/X86/GlobalISel/brcond.ll b/test/CodeGen/X86/GlobalISel/brcond.ll index 917ee6f5bd8c..2467344776e2 100644 --- a/test/CodeGen/X86/GlobalISel/brcond.ll +++ b/test/CodeGen/X86/GlobalISel/brcond.ll @@ -36,6 +36,7 @@ define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) { ; X32-NEXT: movl %eax, (%esp) ; X32-NEXT: movl (%esp), %eax ; X32-NEXT: popl %ecx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: %retval = alloca i32, align 4 diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll index 4100a7217ac3..23987a3c365d 100644 --- a/test/CodeGen/X86/GlobalISel/callingconv.ll +++ b/test/CodeGen/X86/GlobalISel/callingconv.ll @@ -117,6 +117,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) { ; X32-NEXT: movups 16(%esp), %xmm1 ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: addl 
$12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_v8i32_args: @@ -135,6 +136,7 @@ define void @test_trivial_call() { ; X32-NEXT: .cfi_def_cfa_offset 16 ; X32-NEXT: calll trivial_callee ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_trivial_call: @@ -143,6 +145,7 @@ define void @test_trivial_call() { ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: callq trivial_callee ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq call void @trivial_callee() ret void @@ -160,6 +163,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) { ; X32-NEXT: movl %eax, 4(%esp) ; X32-NEXT: calll simple_arg_callee ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_simple_arg_call: @@ -171,6 +175,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) { ; X64-NEXT: movl %eax, %esi ; X64-NEXT: callq simple_arg_callee ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq call void @simple_arg_callee(i32 %in1, i32 %in0) ret void @@ -193,6 +198,7 @@ define void @test_simple_arg8_call(i32 %in0) { ; X32-NEXT: movl %eax, 28(%esp) ; X32-NEXT: calll simple_arg8_callee ; X32-NEXT: addl $44, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_simple_arg8_call: @@ -208,6 +214,7 @@ define void @test_simple_arg8_call(i32 %in0) { ; X64-NEXT: movl %edi, %r9d ; X64-NEXT: callq simple_arg8_callee ; X64-NEXT: addq $24, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0) ret void @@ -224,6 +231,7 @@ define i32 @test_simple_return_callee() { ; X32-NEXT: calll simple_return_callee ; X32-NEXT: addl %eax, %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_simple_return_callee: @@ -234,6 +242,7 @@ define i32 @test_simple_return_callee() { ; X64-NEXT: callq simple_return_callee ; X64-NEXT: addl %eax, %eax ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %call = call i32 @simple_return_callee(i32 5) %r = add i32 %call, %call @@ -254,6 +263,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) { ; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload ; X32-NEXT: paddd 16(%esp), %xmm1 # 16-byte Folded Reload ; X32-NEXT: addl $44, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_split_return_callee: @@ -268,6 +278,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) { ; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload ; X64-NEXT: paddd 16(%rsp), %xmm1 # 16-byte Folded Reload ; X64-NEXT: addq $40, %rsp +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %call = call <8 x i32> @split_return_callee(<8 x i32> %arg2) %r = add <8 x i32> %arg1, %call @@ -281,6 +292,7 @@ define void @test_indirect_call(void()* %func) { ; X32-NEXT: .cfi_def_cfa_offset 16 ; X32-NEXT: calll *16(%esp) ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_indirect_call: @@ -289,6 +301,7 @@ define void @test_indirect_call(void()* %func) { ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: callq *%rdi ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq call void %func() ret void @@ -317,8 +330,11 @@ define void @test_abi_exts_call(i8* %addr) { ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll take_char ; X32-NEXT: addl $4, %esp +; 
X32-NEXT: .cfi_def_cfa_offset 12 ; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_abi_exts_call: @@ -335,6 +351,7 @@ define void @test_abi_exts_call(i8* %addr) { ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq take_char ; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %val = load i8, i8* %addr call void @take_char(i8 %val) @@ -357,6 +374,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) { ; X32-NEXT: movl %ecx, 4(%esp) ; X32-NEXT: calll variadic_callee ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_variadic_call_1: @@ -368,6 +386,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) { ; X64-NEXT: movb $0, %al ; X64-NEXT: callq variadic_callee ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %addr = load i8*, i8** %addr_ptr @@ -393,6 +412,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) { ; X32-NEXT: movl %ecx, 4(%eax) ; X32-NEXT: calll variadic_callee ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_variadic_call_2: @@ -405,6 +425,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) { ; X64-NEXT: movq %rcx, %xmm0 ; X64-NEXT: callq variadic_callee ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq %addr = load i8*, i8** %addr_ptr diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll index 7b2a050f1534..f260d0d707f6 100644 --- a/test/CodeGen/X86/GlobalISel/frameIndex.ll +++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll @@ -18,6 +18,7 @@ define i32* @allocai32() { ; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: movl %esp, %eax ; X32-NEXT: popl %ecx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32ABI-LABEL: allocai32: diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir index 9058f010f76e..3457e971b8d4 100644 --- a/test/CodeGen/X86/GlobalISel/select-cmp.mir +++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir @@ -100,7 +100,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY %sil ; CHECK: CMP8rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -131,7 +131,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY %si ; CHECK: CMP16rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -162,7 +162,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi ; CHECK: CMP64rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit ; CHECK: 
[[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -193,7 +193,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -224,7 +224,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETNEr:%[0-9]+]]:gr8 = SETNEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -255,7 +255,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETAr:%[0-9]+]]:gr8 = SETAr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -286,7 +286,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETAEr:%[0-9]+]]:gr8 = SETAEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -317,7 +317,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETBr:%[0-9]+]]:gr8 = SETBr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -348,7 +348,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETBEr:%[0-9]+]]:gr8 = SETBEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -379,7 +379,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETGr:%[0-9]+]]:gr8 = SETGr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], %subreg.sub_8bit ; CHECK: 
[[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -410,7 +410,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETGEr:%[0-9]+]]:gr8 = SETGEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -441,7 +441,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETLr:%[0-9]+]]:gr8 = SETLr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax @@ -472,7 +472,7 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags ; CHECK: [[SETLEr:%[0-9]+]]:gr8 = SETLEr implicit %eflags - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], 1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], %subreg.sub_8bit ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; CHECK: %eax = COPY [[AND32ri8_]] ; CHECK: RET 0, implicit %eax diff --git a/test/CodeGen/X86/GlobalISel/select-copy.mir b/test/CodeGen/X86/GlobalISel/select-copy.mir index a72f42782c09..fccba1f82068 100644 --- a/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -42,7 +42,7 @@ registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } # ALL: %0:gr8 = COPY %al -# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit # ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags # ALL-NEXT: %eax = COPY %1 # ALL-NEXT: RET 0, implicit %eax @@ -146,7 +146,7 @@ regBankSelected: true registers: - { id: 0, class: gpr, preferred-register: '' } # ALL: %0:gr8 = COPY %dl -# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit # ALL-NEXT: %eax = COPY %1 # ALL-NEXT: RET 0, implicit %eax body: | @@ -170,7 +170,7 @@ regBankSelected: true registers: - { id: 0, class: gpr, preferred-register: '' } # ALL: %0:gr16 = COPY %dx -# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 3 +# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_16bit # ALL-NEXT: %eax = COPY %1 # ALL-NEXT: RET 0, implicit %eax body: | diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir index 51088e126e5c..9df24f65b368 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir @@ -39,7 +39,7 @@ body: | ; ALL-LABEL: name: test_zext_i1 ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]] - ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1 + ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit ; ALL: [[AND64ri8_:%[0-9]+]]:gr64 = AND64ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags ; 
ALL: %rax = COPY [[AND64ri8_]] ; ALL: RET 0, implicit %rax @@ -112,7 +112,7 @@ body: | ; ALL-LABEL: name: anyext_s64_from_s1 ; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit - ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1 + ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit ; ALL: %rax = COPY [[SUBREG_TO_REG]] ; ALL: RET 0, implicit %rax %0(s64) = COPY %rdi @@ -137,7 +137,7 @@ body: | ; ALL-LABEL: name: anyext_s64_from_s8 ; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit - ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1 + ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit ; ALL: %rax = COPY [[SUBREG_TO_REG]] ; ALL: RET 0, implicit %rax %0(s64) = COPY %rdi @@ -162,7 +162,7 @@ body: | ; ALL-LABEL: name: anyext_s64_from_s16 ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit - ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 3 + ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_16bit ; ALL: %rax = COPY [[SUBREG_TO_REG]] ; ALL: RET 0, implicit %rax %0(s64) = COPY %rdi @@ -187,7 +187,7 @@ body: | ; ALL-LABEL: name: anyext_s64_from_s32 ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit - ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 4 + ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_32bit ; ALL: %rax = COPY [[SUBREG_TO_REG]] ; ALL: RET 0, implicit %rax %0(s64) = COPY %rdi diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir index 5167ee987a5a..90ac0c6763aa 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext.mir @@ -85,7 +85,7 @@ registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } # ALL: %0:gr8 = COPY %dil -# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit # ALL-NEXT: %1:gr16 = AND16ri8 %2, 1, implicit-def %eflags # ALL-NEXT: %ax = COPY %1 # ALL-NEXT: RET 0, implicit %ax @@ -113,7 +113,7 @@ registers: - { id: 0, class: gpr } - { id: 1, class: gpr } # ALL: %0:gr8 = COPY %dil -# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit # ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags # ALL-NEXT: %eax = COPY %1 # ALL-NEXT: RET 0, implicit %eax @@ -288,7 +288,7 @@ registers: # X32: %0:gr32_abcd = COPY %edi # X64: %0:gr32 = COPY %edi # ALL-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit # ALL-NEXT: %ax = COPY %2 # ALL-NEXT: RET 0, implicit %ax body: | @@ -323,7 +323,7 @@ registers: # X32: %0:gr32_abcd = COPY %edi # X64: %0:gr32 = COPY %edi # ALL-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit # ALL-NEXT: %eax = COPY %2 # ALL-NEXT: RET 0, implicit %eax body: | @@ -358,7 +358,7 @@ registers: # X32: %0:gr32_abcd = COPY %edi # X64: %0:gr32 = COPY %edi # ALL-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit # ALL-NEXT: %ax = COPY %2 # ALL-NEXT: RET 0, implicit 
%ax body: | @@ -422,7 +422,7 @@ registers: - { id: 2, class: gpr } # ALL: %0:gr32 = COPY %edi # ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 3 +# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_16bit # ALL-NEXT: %eax = COPY %2 # ALL-NEXT: RET 0, implicit %eax body: | diff --git a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir index 596c48b49226..628ab3bac4ab 100644 --- a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir +++ b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir @@ -20,7 +20,7 @@ body: | bb.0: ; CHECK-LABEL: name: read_flags ; CHECK: [[RDFLAGS32_:%[0-9]+]]:gr32 = RDFLAGS32 implicit-def %esp, implicit %esp - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], 4 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], %subreg.sub_32bit ; CHECK: %rax = COPY [[SUBREG_TO_REG]] %0(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.x86.flags.read.u32) %rax = COPY %0(s32) diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll index 1f7415ee2af6..8ecafad8022e 100644 --- a/test/CodeGen/X86/O0-pipeline.ll +++ b/test/CodeGen/X86/O0-pipeline.ll @@ -49,6 +49,7 @@ ; CHECK-NEXT: X86 pseudo instruction expansion pass ; CHECK-NEXT: Analyze Machine Code For Garbage Collection ; CHECK-NEXT: X86 vzeroupper inserter +; CHECK-NEXT: Check CFA info and insert CFI instructions if needed ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll index b9ae57ca0110..ed98fd51cc02 100644 --- a/test/CodeGen/X86/TruncAssertZext.ll +++ b/test/CodeGen/X86/TruncAssertZext.ll @@ -25,6 +25,7 @@ define i64 @main() { ; CHECK-NEXT: subq %rcx, %rax ; CHECK-NEXT: shrq $32, %rax ; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %b = call i64 @foo() %or = and i64 %b, 18446744069414584575 ; this is 0xffffffff000000ff diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 508f10e98894..14494779f10a 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind { define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] +; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] +; SSE2-NEXT: paddd %xmm4, %xmm8 ; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm6 ; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm7 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: packuswb %xmm7, %xmm1 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm7 +; 
SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: packuswb %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm2, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_const: @@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4] ; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8] ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 @@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; 
AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9 +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3 +; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index 923e1b9b0e0e..dc386415934e 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -12,7 +12,6 @@ define void @zero128() nounwind ssp 
{ ; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax ; CHECK-NEXT: vmovaps %xmm0, (%rax) ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function store <4 x float> zeroinitializer, <4 x float>* @z, align 16 ret void } @@ -27,7 +26,6 @@ define void @zero256() nounwind ssp { ; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function store <8 x float> zeroinitializer, <8 x float>* @x, align 32 store <4 x double> zeroinitializer, <4 x double>* @y, align 32 ret void @@ -41,7 +39,6 @@ define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nou ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function allocas: %ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>* store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float @@ -59,7 +56,6 @@ define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwi ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function allocas: %ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>* store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32 @@ -83,7 +79,6 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind { ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32* %val.i34.i = load i32, i32* %ptrcast.i33.i, align 4 %ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1 @@ -102,7 +97,6 @@ define <16 x float> @fneg(<16 x float> %a) nounwind { ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a ret <16 x float> %1 } @@ -114,7 +108,6 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly { ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function %res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0 ret <16 x i16> %res } diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 44eb14160ee1..e508e345de64 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -581,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) { -; AVX-LABEL: test_x86_avx_rcp_ps_256: -; AVX: # BB#0: -; AVX-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_rcp_ps_256: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_rcp_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0] 
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -619,15 +614,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) { -; AVX-LABEL: test_x86_avx_rsqrt_ps_256: -; AVX: # BB#0: -; AVX-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_rsqrt_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -635,10 +625,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_pd_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_pd_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] ret <4 x double> %res } @@ -646,10 +641,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { -; CHECK-LABEL: test_x86_avx_sqrt_ps_256: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_avx_sqrt_ps_256: +; AVX: # BB#0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 44d13db65c9d..858a27b1d48b 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -3982,8 +3982,8 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) { ; ; SKX-LABEL: test_rcpps: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00] -; SKX-NEXT: vrcp14ps (%rdi), %ymm1 # sched: [11:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] +; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00] ; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; @@ -4174,8 +4174,8 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) { ; ; SKX-LABEL: test_rsqrtps: ; SKX: # BB#0: -; SKX-NEXT: vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00] -; SKX-NEXT: vrsqrt14ps (%rdi), 
%ymm1 # sched: [11:1.00] +; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00] +; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00] ; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index b75bd8cc3ee0..909e83986805 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -699,11 +699,13 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; AVX512BW-NEXT: jg LBB17_1 ; AVX512BW-NEXT: ## BB#2: ; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0 -; AVX512BW-NEXT: jmp LBB17_3 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; AVX512BW-NEXT: LBB17_1: -; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 -; AVX512BW-NEXT: LBB17_3: -; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll index bb541f46567f..fa6adec675f8 100644 --- a/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -209,12 +209,18 @@ define i64 @caller_argv64i1() #0 { ; LINUXOSX64-NEXT: pushq %rax ; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8 ; LINUXOSX64-NEXT: callq test_argv64i1 -; LINUXOSX64-NEXT: addq $24, %rsp +; LINUXOSX64-NEXT: addq $16, %rsp ; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset -16 +; LINUXOSX64-NEXT: addq $8, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 40 ; LINUXOSX64-NEXT: popq %r12 +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: popq %r13 +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 24 ; LINUXOSX64-NEXT: popq %r14 +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %r15 +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %v0 = bitcast i64 4294967298 to <64 x i1> @@ -287,6 +293,7 @@ define <64 x i1> @caller_retv64i1() #0 { ; LINUXOSX64-NEXT: kmovq %rax, %k0 ; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm0 ; LINUXOSX64-NEXT: popq %rax +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %call = call x86_regcallcc <64 x i1> @test_retv64i1() @@ -397,7 +404,9 @@ define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1> ; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload ; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: vzeroupper ; LINUXOSX64-NEXT: retq entry: @@ -451,6 +460,7 @@ define i32 @caller_argv32i1() #0 { ; LINUXOSX64-NEXT: movl $1, %edx ; LINUXOSX64-NEXT: callq test_argv32i1 ; LINUXOSX64-NEXT: popq %rcx +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %v0 = bitcast i32 1 to <32 x i1> @@ -513,6 +523,7 @@ define i32 @caller_retv32i1() #0 { ; LINUXOSX64-NEXT: callq test_retv32i1 ; LINUXOSX64-NEXT: incl %eax ; LINUXOSX64-NEXT: popq %rcx +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %call = call x86_regcallcc <32 x i1> @test_retv32i1() @@ -626,7 +637,9 @@ define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> ; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload ; 
LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %res = call i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2) ret i16 %res @@ -678,6 +691,7 @@ define i16 @caller_argv16i1() #0 { ; LINUXOSX64-NEXT: movl $1, %edx ; LINUXOSX64-NEXT: callq test_argv16i1 ; LINUXOSX64-NEXT: popq %rcx +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %v0 = bitcast i16 1 to <16 x i1> @@ -746,6 +760,7 @@ define i16 @caller_retv16i1() #0 { ; LINUXOSX64-NEXT: incl %eax ; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; LINUXOSX64-NEXT: popq %rcx +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %call = call x86_regcallcc <16 x i1> @test_retv16i1() @@ -859,7 +874,9 @@ define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2) ; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload ; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload ; LINUXOSX64-NEXT: addq $128, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %res = call i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2) ret i8 %res @@ -911,6 +928,7 @@ define i8 @caller_argv8i1() #0 { ; LINUXOSX64-NEXT: movl $1, %edx ; LINUXOSX64-NEXT: callq test_argv8i1 ; LINUXOSX64-NEXT: popq %rcx +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq entry: %v0 = bitcast i8 1 to <8 x i1> @@ -984,9 +1002,11 @@ define <8 x i1> @caller_retv8i1() #0 { ; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm0 ; LINUXOSX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> ; LINUXOSX64-NEXT: popq %rax +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: vzeroupper ; LINUXOSX64-NEXT: retq entry: %call = call x86_regcallcc <8 x i1> @test_retv8i1() ret <8 x i1> %call } + diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll index 43a1871245ba..b4f1d2c776d9 100644 --- a/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -63,6 +63,7 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a) { ; LINUXOSX64-NEXT: callq test_argReti1 ; LINUXOSX64-NEXT: incb %al ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = add i1 %a, 1 %c = call x86_regcallcc i1 @test_argReti1(i1 %b) @@ -130,6 +131,7 @@ define x86_regcallcc i8 @test_CallargReti8(i8 %a) { ; LINUXOSX64-NEXT: callq test_argReti8 ; LINUXOSX64-NEXT: incb %al ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = add i8 %a, 1 %c = call x86_regcallcc i8 @test_argReti8(i8 %b) @@ -200,6 +202,7 @@ define x86_regcallcc i16 @test_CallargReti16(i16 %a) { ; LINUXOSX64-NEXT: incl %eax ; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = add i16 %a, 1 %c = call x86_regcallcc i16 @test_argReti16(i16 %b) @@ -261,6 +264,7 @@ define x86_regcallcc i32 @test_CallargReti32(i32 %a) { ; LINUXOSX64-NEXT: callq test_argReti32 ; LINUXOSX64-NEXT: incl %eax ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = add i32 %a, 1 %c = call x86_regcallcc i32 @test_argReti32(i32 %b) @@ -327,6 +331,7 @@ define x86_regcallcc i64 
@test_CallargReti64(i64 %a) { ; LINUXOSX64-NEXT: callq test_argReti64 ; LINUXOSX64-NEXT: incq %rax ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = add i64 %a, 1 %c = call x86_regcallcc i64 @test_argReti64(i64 %b) @@ -406,7 +411,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) { ; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0 ; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = fadd float 1.0, %a %c = call x86_regcallcc float @test_argRetFloat(float %b) @@ -486,7 +493,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) { ; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0 ; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = fadd double 1.0, %a %c = call x86_regcallcc double @test_argRetDouble(double %b) @@ -548,6 +557,7 @@ define x86_regcallcc x86_fp80 @test_CallargRetf80(x86_fp80 %a) { ; LINUXOSX64-NEXT: callq test_argRetf80 ; LINUXOSX64-NEXT: fadd %st(0), %st(0) ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = fadd x86_fp80 %a, %a %c = call x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %b) @@ -611,6 +621,7 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) { ; LINUXOSX64-NEXT: callq test_argRetPointer ; LINUXOSX64-NEXT: incl %eax ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = ptrtoint [4 x i32]* %a to i32 %c = add i32 %b, 1 @@ -694,7 +705,9 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) { ; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1} ; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; LINUXOSX64-NEXT: addq $16, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a) %c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b @@ -768,7 +781,9 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) { ; LINUXOSX64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; LINUXOSX64-NEXT: addq $48, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a) %c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b @@ -842,7 +857,9 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) { ; LINUXOSX64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; LINUXOSX64-NEXT: addq $112, %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16 ; LINUXOSX64-NEXT: popq %rsp +; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8 ; LINUXOSX64-NEXT: retq %b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a) %c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 8372fbdb9aba..abc8c1a7513e 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ 
b/test/CodeGen/X86/avx512-schedule.ll @@ -8839,6 +8839,7 @@ define <16 x float> @broadcast_ss_spill(float %x) { ; GENERIC-NEXT: callq func_f32 ; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: .cfi_def_cfa_offset 8 ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: broadcast_ss_spill: @@ -8852,6 +8853,7 @@ define <16 x float> @broadcast_ss_spill(float %x) { ; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] ; SKX-NEXT: # sched: [8:0.50] ; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: .cfi_def_cfa_offset 8 ; SKX-NEXT: retq # sched: [7:1.00] %a = fadd float %x, %x call void @func_f32(float %a) @@ -8872,6 +8874,7 @@ define <8 x double> @broadcast_sd_spill(double %x) { ; GENERIC-NEXT: callq func_f64 ; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload ; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: .cfi_def_cfa_offset 8 ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: broadcast_sd_spill: @@ -8885,6 +8888,7 @@ define <8 x double> @broadcast_sd_spill(double %x) { ; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] ; SKX-NEXT: # sched: [8:0.50] ; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: .cfi_def_cfa_offset 8 ; SKX-NEXT: retq # sched: [7:1.00] %a = fadd double %x, %x call void @func_f64(double %a) diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll index 43cf9ee73582..51a7c685ed4a 100644 --- a/test/CodeGen/X86/avx512-select.ll +++ b/test/CodeGen/X86/avx512-select.ll @@ -115,6 +115,7 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) { ; X86-NEXT: vmovaps 8(%ebp), %zmm1 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: select04: diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index c59fb5b97bca..c95f0d40fbf2 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = 
shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; 
GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; 
GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test2_8xfloat_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, 
%ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 
{%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -9801,18 +9791,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9847,18 +9835,16 @@ 
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: 
[3:1.00] -; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x 
double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = 
ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: 
[3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec define <4 x double> 
@test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = 
ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; GENERIC-LABEL: test_8xi32_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ret <8 x i32> %res @@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # 
sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> 
@test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, 
%ymm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { ; GENERIC-LABEL: test_8xi32_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ret <8 x i32> %res @@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3: ; GENERIC: # BB#0: 
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; GENERIC-LABEL: test_8xi32_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> 
@test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; 
SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; GENERIC-LABEL: test_8xi32_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* 
%vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { ; GENERIC-LABEL: test_4xi64_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ret <4 x i64> %res @@ 
-11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # 
sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, 
%xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; GENERIC-LABEL: test_4xi64_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ret <4 x i64> %res @@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 
sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] -; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: 
[1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} 
ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i define <4 x 
i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; GENERIC-LABEL: test_4xi64_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; GENERIC: # BB#0: -; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] ; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; SKX: # BB#0: -; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll index c957a85a8852..799bbc11bee1 100644 --- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll +++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll @@ -14,10 +14,10 @@ define <8 x 
float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x define <8 x float> 
@test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> 
%vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq 
%vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> 
%vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, < define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, 
<4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, 
%ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ret <8 x i32> %res @@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} 
ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 
9, i32 10, i32 11> ret <8 x i32> %res @@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; 
CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> @@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_shuff_mem_mask3: ; 
CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> @@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ret <4 x i64> %res @@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, 
<4 x i32> <i32 2, i32 3, i32 4, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { ; CHECK-LABEL: test_4xi64_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ret <4 x i64> %res @@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, 
<4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> @@ -1777,7 +1746,7 @@ define <4 x i64> 
@test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { ; CHECK-LABEL: test_4xi64_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> @@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 23d664579943..ff25c005e9c1 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -46,8 +46,6 @@ define <8 x i1> @test3(<4 x i1> %a) { ; CHECK: # BB#0: ; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 -; CHECK-NEXT: kshiftlb $4, %k0, %k0 -; CHECK-NEXT: kshiftrb $4, %k0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 584968f1c6ef..9aacb23fbd5b 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -413,6 +413,7 @@ define <16 x float> @broadcast_ss_spill(float %x) { ; ALL-NEXT: callq func_f32 ; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload ; ALL-NEXT: addq $24, %rsp +; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq %a = fadd float %x, %x call void @func_f32(float %a) @@ -432,6 +433,7 @@ define <8 x double> @broadcast_sd_spill(double %x) { ; ALL-NEXT: callq func_f64 ; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload ; ALL-NEXT: addq $24, %rsp +; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq %a = fadd double %x, %x call void @func_f64(double %a) diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index d1bf8fd5f3f7..7f170cd51bf9 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -717,6 
+717,7 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: vpbroadcastb %eax, %zmm3 {%k1} ; X32-NEXT: vmovdqa64 %zmm3, %zmm0 ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_mask_set1_epi8: @@ -1444,6 +1445,7 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: korq %k0, %k1, %k1 ; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z} ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm512_maskz_set1_epi8: diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index a5ef1809157b..87565ac129b9 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -355,6 +355,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) @@ -380,6 +381,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) @@ -445,6 +447,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) @@ -470,6 +473,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) @@ -1702,6 +1706,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -2503,8 +2508,11 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: popl %ebx +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) @@ -2586,6 +2594,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: 
.cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -3387,8 +3396,11 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; AVX512F-32-NEXT: addl %esi, %eax ; AVX512F-32-NEXT: adcl %ecx, %edx ; AVX512F-32-NEXT: addl $60, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 12 ; AVX512F-32-NEXT: popl %esi +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 ; AVX512F-32-NEXT: popl %ebx +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index e23deebd15b8..c2620642e5ce 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1499,6 +1499,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) ret i64 %res @@ -1522,6 +1523,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0) ret i64 %res @@ -1712,6 +1714,7 @@ define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) @@ -1776,6 +1779,7 @@ define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 % ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl $20, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 4 ; AVX512F-32-NEXT: retl %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1) diff --git a/test/CodeGen/X86/avx512bw-vec-test-testn.ll b/test/CodeGen/X86/avx512bw-vec-test-testn.ll index 6dd6440faa1d..82d0b8846def 100644 --- a/test/CodeGen/X86/avx512bw-vec-test-testn.ll +++ b/test/CodeGen/X86/avx512bw-vec-test-testn.ll @@ -5,9 +5,7 @@ define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -24,9 +22,7 @@ entry: define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, 
%k0 +; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -42,10 +38,8 @@ entry: define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -63,10 +57,8 @@ entry: define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -84,9 +76,7 @@ entry: define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -103,9 +93,7 @@ entry: define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -121,10 +109,8 @@ entry: define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -142,10 +128,8 @@ entry: define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovq %k0, %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll index f67ceb2fe043..44075deb1d94 100644 --- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll +++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll @@ -5,9 +5,7 @@ define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqb 
%xmm1, %xmm0, %k0 +; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -23,10 +21,8 @@ entry: define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -44,9 +40,7 @@ entry: define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 +; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -62,10 +56,8 @@ entry: define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -83,9 +75,7 @@ entry: define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -101,10 +91,8 @@ entry: define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -122,9 +110,7 @@ entry: define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -140,10 +126,8 @@ entry: define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 
{%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: retq @@ -161,9 +145,7 @@ entry: define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 +; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -179,10 +161,8 @@ entry: define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -200,9 +180,7 @@ entry: define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 +; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -219,10 +197,8 @@ entry: define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -241,9 +217,7 @@ entry: define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -259,10 +233,8 @@ entry: define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -280,9 +252,7 @@ entry: define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -299,10 +269,8 @@ entry: define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x 
i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll new file mode 100644 index 000000000000..ca5e5523a9d2 --- /dev/null +++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s + +define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = icmp eq <8 x i64> %a, %b + %1 = bitcast <8 x i1> %0 to i8 + %conv.i = zext i8 %1 to i64 + %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0 + %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer + ret <8 x i64> %vecinit7.i.i +} + +define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0 + %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer + %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64> + ret <8 x i64> %4 +} + + diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll index e5dbff9ac515..92dfe1e087ad 100644 --- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll @@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret <8 x i64> %res } + +define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_512: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) + ret <16 x i32> %res +} +declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) + +define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_512: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) + ret <8 x i64> %res +} +declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) + diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll index 7e5a3e8fe25d..ab8c80f8dd3b 100644 --- a/test/CodeGen/X86/avx512cd-intrinsics.ll +++ b/test/CodeGen/X86/avx512cd-intrinsics.ll @@ -1,28 
+1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s -define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) - ret <16 x i32> %res -} -declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) - -define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) - ret <8 x i64> %res -} -declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) - declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly define <8 x i64> @test_conflict_q(<8 x i64> %a) { diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll index f8f47c87100a..0e310be34894 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> ret <4 x i64> %res2 } +define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16) + +define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) + +define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8) + +define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8) + diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index 96254f7c95b0..2fb50297c62c 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i ret <4 x i64> %res2 } -define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0 -; CHECK-NEXT: retq - %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; - ret <8 x i32> %res -} -declare <8 x i32> 
@llvm.x86.avx512.broadcastmw.256(i16) - -define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0 -; CHECK-NEXT: retq - %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) - -define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0 -; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8) - -define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0 -; CHECK-NEXT: retq - %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8) diff --git a/test/CodeGen/X86/avx512f-vec-test-testn.ll b/test/CodeGen/X86/avx512f-vec-test-testn.ll index c9c0c2251a40..e9cdacc354ff 100644 --- a/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -5,9 +5,7 @@ define zeroext i8 @TEST_mm512_test_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi64_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -23,9 +21,7 @@ entry: define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi32_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -42,10 +38,8 @@ entry: define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -63,10 +57,8 @@ entry: define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -85,9 +77,7 @@ entry: define zeroext i8 @TEST_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi64_mask: ; CHECK: # BB#0: # %entry 
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -103,9 +93,7 @@ entry: define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi32_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -122,10 +110,8 @@ entry: define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper @@ -143,10 +129,8 @@ entry: define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; CHECK-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index f5578d6cc885..3f4a696af0cb 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -233,6 +233,7 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastd_epi32: @@ -265,6 +266,7 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastd_epi32: @@ -369,6 +371,7 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastq_epi64: @@ -398,6 +401,7 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastq_epi64: @@ -441,6 +445,7 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: 
test_mm256_mask_broadcastq_epi64: @@ -470,6 +475,7 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_broadcastq_epi64: @@ -513,6 +519,7 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastsd_pd: @@ -542,6 +549,7 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastsd_pd: @@ -585,6 +593,7 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_broadcastsd_pd: @@ -614,6 +623,7 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_broadcastsd_pd: @@ -657,6 +667,7 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_broadcastss_ps: @@ -686,6 +697,7 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_broadcastss_ps: @@ -781,6 +793,7 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_movddup_pd: @@ -810,6 +823,7 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_movddup_pd: @@ -853,6 +867,7 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_movddup_pd: @@ -882,6 +897,7 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_movddup_pd: @@ -925,6 +941,7 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; 
X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_movehdup_ps: @@ -954,6 +971,7 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_movehdup_ps: @@ -1049,6 +1067,7 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_moveldup_ps: @@ -1078,6 +1097,7 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_moveldup_ps: @@ -1173,6 +1193,7 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64 ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_epi64: @@ -1202,6 +1223,7 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_epi64: @@ -1245,6 +1267,7 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_permutex_pd: @@ -1274,6 +1297,7 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) { ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_permutex_pd: @@ -1317,6 +1341,7 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_shuffle_pd: @@ -1346,6 +1371,7 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_shuffle_pd: @@ -1389,6 +1415,7 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_mask_shuffle_pd: @@ -1418,6 +1445,7 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_shuffle_pd: @@ -1461,6 +1489,7 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> ; 
X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_mask_shuffle_ps: @@ -1490,6 +1519,7 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0] ; X32-NEXT: popl %eax +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_maskz_shuffle_ps: diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index b6723ee50b05..6c6fad794c85 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32, define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256: ; CHECK: ## BB#0: -; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06] -; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5] +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] +; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 9098ca308971..35fecf8955c0 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -2729,8 +2729,8 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x ; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16] ; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30] +; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] ; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] @@ -2752,7 +2752,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 ; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16] ; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16] +; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30] ; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] ; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] @@ -2773,8 +2773,8 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> 
%x0, <8 x i32 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16] ; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16] -; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30] +; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) @@ -2791,7 +2791,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16] ; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3] -; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16] +; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30] ; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3] ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll index 9fc957297e24..1098e7bffe0c 100644 --- a/test/CodeGen/X86/avx512vl-vbroadcast.ll +++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll @@ -12,6 +12,7 @@ define <8 x float> @_256_broadcast_ss_spill(float %x) { ; CHECK-NEXT: callq func_f32 ; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %a = fadd float %x, %x call void @func_f32(float %a) @@ -30,6 +31,7 @@ define <4 x float> @_128_broadcast_ss_spill(float %x) { ; CHECK-NEXT: callq func_f32 ; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %a = fadd float %x, %x call void @func_f32(float %a) @@ -49,6 +51,7 @@ define <4 x double> @_256_broadcast_sd_spill(double %x) { ; CHECK-NEXT: callq func_f64 ; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %a = fadd double %x, %x call void @func_f64(double %a) diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index 5ee06fde1276..bccf953fb0be 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -109,6 +109,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -227,6 +228,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -348,6 +350,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext 
%__u, <2 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -470,6 +473,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -597,6 +601,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -720,6 +725,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -846,6 +852,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -973,6 +980,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1024,6 +1032,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1071,6 +1080,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1129,6 +1139,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1188,6 +1199,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1217,8 +1229,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -1246,8 +1256,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -1278,8 +1286,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq 
%zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -1311,8 +1317,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -1392,6 +1396,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1465,6 +1470,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1541,6 +1547,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1618,6 +1625,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1700,6 +1708,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1778,6 +1787,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1859,6 +1869,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -1941,6 +1952,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2064,6 +2076,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2183,6 +2196,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2305,6 +2319,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: 
vzeroupper ; NoVLX-NEXT: retq entry: @@ -2428,6 +2443,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2556,6 +2572,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2680,6 +2697,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2807,6 +2825,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -2935,6 +2954,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -3288,6 +3308,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -3552,6 +3573,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -3912,6 +3934,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -4188,6 +4211,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5051,6 +5075,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5092,6 +5117,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5153,6 +5179,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5216,6 +5243,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5263,6 
+5291,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5326,6 +5355,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5379,6 +5409,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5426,6 +5457,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5493,6 +5525,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5562,6 +5595,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5615,6 +5649,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5684,6 +5719,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -5957,6 +5993,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6030,6 +6067,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6106,6 +6144,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6183,6 +6222,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6260,6 +6300,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq 
entry: @@ -6337,6 +6378,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6420,6 +6462,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6498,6 +6541,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6579,6 +6623,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6661,6 +6706,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6743,6 +6789,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6825,6 +6872,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -6946,6 +6994,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7062,6 +7111,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7181,6 +7231,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7301,6 +7352,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7421,6 +7473,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7541,6 +7594,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7667,6 +7721,7 @@ define zeroext i64 
@test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7788,6 +7843,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -7912,6 +7968,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -8037,6 +8094,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -8162,6 +8220,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -8287,6 +8346,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9131,6 +9191,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9172,6 +9233,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9225,6 +9287,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9280,6 +9343,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9327,6 +9391,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9382,6 +9447,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9435,6 +9501,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9482,6 +9549,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x 
i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9541,6 +9609,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9602,6 +9671,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9655,6 +9725,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -9716,6 +9787,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10607,6 +10679,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10650,6 +10723,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10713,6 +10787,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10778,6 +10853,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10827,6 +10903,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10892,6 +10969,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10947,6 +11025,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -10996,6 +11075,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11065,6 +11145,7 @@ define zeroext i64 
@test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11136,6 +11217,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11191,6 +11273,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11262,6 +11345,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11509,6 +11593,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11580,6 +11665,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11654,6 +11740,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11729,6 +11816,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11804,6 +11892,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11879,6 +11968,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -11960,6 +12050,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12036,6 +12127,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12115,6 +12207,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ 
-12195,6 +12288,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12275,6 +12369,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12355,6 +12450,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12478,6 +12574,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12596,6 +12693,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12717,6 +12815,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12839,6 +12938,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -12966,6 +13066,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13089,6 +13190,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13215,6 +13317,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13342,6 +13445,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13393,6 +13497,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13440,6 +13545,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13498,6 +13604,7 @@ define zeroext i64 
@test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13557,6 +13664,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13586,8 +13694,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -13615,8 +13721,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -13647,8 +13751,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -13680,8 +13782,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -13761,6 +13861,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13834,6 +13935,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13910,6 +14012,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -13987,6 +14090,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14069,6 +14173,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14147,6 +14252,7 @@ define zeroext i64 
@test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14228,6 +14334,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14310,6 +14417,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14433,6 +14541,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14552,6 +14661,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14674,6 +14784,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14797,6 +14908,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -14925,6 +15037,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -15049,6 +15162,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -15176,6 +15290,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -15304,6 +15419,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -15657,6 +15773,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -15921,6 +16038,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -16281,6 +16399,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; 
NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -16557,6 +16676,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17420,6 +17540,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17461,6 +17582,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17522,6 +17644,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17585,6 +17708,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17632,6 +17756,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17695,6 +17820,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17748,6 +17874,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17795,6 +17922,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17862,6 +17990,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17931,6 +18060,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -17984,6 +18114,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18053,6 +18184,7 @@ define zeroext i64 
@test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18326,6 +18458,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18399,6 +18532,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18475,6 +18609,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18552,6 +18687,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18629,6 +18765,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18706,6 +18843,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18789,6 +18927,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18867,6 +19006,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -18948,6 +19088,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19030,6 +19171,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19112,6 +19254,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19194,6 +19337,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ 
-19315,6 +19459,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19431,6 +19576,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19550,6 +19696,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19670,6 +19817,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19790,6 +19938,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -19910,6 +20059,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20036,6 +20186,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20157,6 +20308,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20281,6 +20433,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20406,6 +20559,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20531,6 +20685,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -20656,6 +20811,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21500,6 +21656,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21541,6 +21698,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; 
NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21594,6 +21752,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21649,6 +21808,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21696,6 +21856,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21751,6 +21912,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21804,6 +21966,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21851,6 +22014,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21910,6 +22074,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -21971,6 +22136,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -22024,6 +22190,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -22085,6 +22252,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -22976,6 +23144,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23019,6 +23188,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23082,6 +23252,7 @@ define zeroext i32 
@test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23147,6 +23318,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23196,6 +23368,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23261,6 +23434,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23316,6 +23490,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23365,6 +23540,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23434,6 +23610,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23505,6 +23682,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23560,6 +23738,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23631,6 +23810,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23878,6 +24058,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -23949,6 +24130,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24023,6 +24205,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ 
-24098,6 +24281,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24173,6 +24357,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24248,6 +24433,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24329,6 +24515,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24405,6 +24592,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24484,6 +24672,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24564,6 +24753,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24644,6 +24834,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24724,6 +24915,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24849,6 +25041,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -24970,6 +25163,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25093,6 +25287,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25218,6 +25413,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25347,6 
+25543,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25473,6 +25670,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25601,6 +25799,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25731,6 +25930,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25784,6 +25984,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25834,6 +26035,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25894,6 +26096,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25956,6 +26159,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -25987,8 +26191,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -26019,8 +26221,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -26053,8 +26253,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -26089,8 +26287,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, 
%zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -26172,6 +26368,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26248,6 +26445,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26326,6 +26524,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26406,6 +26605,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26490,6 +26690,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26571,6 +26772,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26654,6 +26856,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26739,6 +26942,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26864,6 +27068,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -26986,6 +27191,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27110,6 +27316,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27236,6 +27443,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27366,6 +27574,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, 
<4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27493,6 +27702,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27622,6 +27832,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -27753,6 +27964,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -28109,6 +28321,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -28378,6 +28591,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -28741,6 +28955,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -29022,6 +29237,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -29903,6 +30119,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -29947,6 +30164,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30008,6 +30226,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30072,6 +30291,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30121,6 +30341,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30184,6 +30405,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext 
%__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30239,6 +30461,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30289,6 +30512,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30356,6 +30580,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30426,6 +30651,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30481,6 +30707,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30550,6 +30777,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30823,6 +31051,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30896,6 +31125,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -30972,6 +31202,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31049,6 +31280,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31126,6 +31358,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31203,6 +31436,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31286,6 +31520,7 @@ define zeroext i64 
@test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31364,6 +31599,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31445,6 +31681,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31527,6 +31764,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31609,6 +31847,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31691,6 +31930,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31812,6 +32052,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -31928,6 +32169,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32047,6 +32289,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32167,6 +32410,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32287,6 +32531,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32407,6 +32652,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32533,6 +32779,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32654,6 +32901,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x 
i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32778,6 +33026,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -32903,6 +33152,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -33028,6 +33278,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -33153,6 +33404,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34023,6 +34275,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34067,6 +34320,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34120,6 +34374,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34176,6 +34431,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34225,6 +34481,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34280,6 +34537,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34335,6 +34593,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34385,6 +34644,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34444,6 +34704,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: 
orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34506,6 +34767,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34561,6 +34823,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -34622,6 +34885,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35543,6 +35807,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35589,6 +35854,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35654,6 +35920,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35722,6 +35989,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35773,6 +36041,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35840,6 +36109,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35897,6 +36167,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -35949,6 +36220,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36020,6 +36292,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36094,6 +36367,7 @@ define zeroext i64 
@test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36151,6 +36425,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36224,6 +36499,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36471,6 +36747,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36542,6 +36819,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36616,6 +36894,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36691,6 +36970,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36766,6 +37046,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36841,6 +37122,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36922,6 +37204,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -36998,6 +37281,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37077,6 +37361,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37157,6 +37442,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ 
-37237,6 +37523,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37317,6 +37604,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37443,6 +37731,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37564,6 +37853,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37688,6 +37978,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37813,6 +38104,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -37943,6 +38235,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38069,6 +38362,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38198,6 +38492,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38328,6 +38623,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38382,6 +38678,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38432,6 +38729,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38493,6 +38791,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38555,6 +38854,7 @@ define zeroext i64 
@test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38587,8 +38887,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -38619,8 +38917,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -38654,8 +38950,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -38690,8 +38984,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: vzeroupper @@ -38774,6 +39066,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38850,6 +39143,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -38929,6 +39223,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39009,6 +39304,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39094,6 +39390,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39175,6 +39472,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39259,6 +39557,7 @@ define zeroext i64 
@test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39344,6 +39643,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39470,6 +39770,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39592,6 +39893,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39717,6 +40019,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39843,6 +40146,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -39974,6 +40278,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -40101,6 +40406,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -40231,6 +40537,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -40362,6 +40669,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -40720,6 +41028,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -40989,6 +41298,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -41354,6 +41664,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -41635,6 +41946,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; 
NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42537,6 +42849,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42581,6 +42894,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42645,6 +42959,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42711,6 +43026,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42761,6 +43077,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42827,6 +43144,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42883,6 +43201,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -42933,6 +43252,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43003,6 +43323,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43075,6 +43396,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43131,6 +43453,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43203,6 +43526,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43476,6 +43800,7 @@ define zeroext i32 
@test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43549,6 +43874,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43625,6 +43951,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43702,6 +44029,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43779,6 +44107,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43856,6 +44185,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -43939,6 +44269,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44017,6 +44348,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44098,6 +44430,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44180,6 +44513,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44262,6 +44596,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44344,6 +44679,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44465,6 +44801,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44581,6 
+44918,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44700,6 +45038,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44820,6 +45159,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -44940,6 +45280,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45060,6 +45401,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45186,6 +45528,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45307,6 +45650,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45431,6 +45775,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45556,6 +45901,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45681,6 +46027,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -45806,6 +46153,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46707,6 +47055,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46751,6 +47100,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46807,6 +47157,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x ; 
NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46865,6 +47216,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46915,6 +47267,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -46973,6 +47326,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47029,6 +47383,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47079,6 +47434,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47141,6 +47497,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47205,6 +47562,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47261,6 +47619,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -47325,6 +47684,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48255,6 +48615,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48301,6 +48662,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48367,6 +48729,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48435,6 +48798,7 @@ define zeroext i32 
@test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48487,6 +48851,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48555,6 +48920,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48613,6 +48979,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48665,6 +49032,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48737,6 +49105,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48811,6 +49180,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48869,6 +49239,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -48943,6 +49314,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49190,6 +49562,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49261,6 +49634,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49335,6 +49709,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49410,6 +49785,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ 
-49485,6 +49861,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49560,6 +49937,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49641,6 +50019,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49717,6 +50096,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49796,6 +50176,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49876,6 +50257,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -49956,6 +50338,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -50036,6 +50419,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -50829,6 +51213,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -50870,6 +51255,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -50913,6 +51299,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -50964,6 +51351,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51015,6 +51403,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa 
%rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51068,6 +51457,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51121,6 +51511,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51168,6 +51559,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51217,6 +51609,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51274,6 +51667,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51331,6 +51725,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51390,6 +51785,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51663,6 +52059,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51736,6 +52133,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51810,6 +52208,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51887,6 +52286,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -51964,6 +52364,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52042,6 +52443,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; 
NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52126,6 +52528,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52204,6 +52607,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52283,6 +52687,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52365,6 +52770,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52447,6 +52853,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52530,6 +52937,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52652,6 +53060,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52768,6 +53177,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -52885,6 +53295,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53005,6 +53416,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53125,6 +53537,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53246,6 +53659,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53414,6 +53828,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; 
NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53535,6 +53950,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53657,6 +54073,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53782,6 +54199,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -53907,6 +54325,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -54033,6 +54452,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: popq %r14 ; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -54886,6 +55306,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -54927,6 +55348,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -54970,6 +55392,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55020,6 +55443,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55070,6 +55494,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, < ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55122,6 +55547,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55175,6 +55601,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55222,6 +55649,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: 
.cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55271,6 +55699,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55327,6 +55756,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55383,6 +55813,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -55441,6 +55872,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56260,6 +56692,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56303,6 +56736,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56348,6 +56782,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56401,6 +56836,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56454,6 +56890,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56509,6 +56946,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56564,6 +57002,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56613,6 +57052,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56664,6 +57104,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* ; NoVLX-NEXT: orq %rcx, %rax ; 
NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56723,6 +57164,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56782,6 +57224,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -56843,6 +57286,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57146,6 +57590,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57217,6 +57662,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57289,6 +57735,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57364,6 +57811,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57439,6 +57887,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57515,6 +57964,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57647,6 +58097,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57723,6 +58174,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57800,6 +58252,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57880,6 +58333,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, 
<8 x ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -57960,6 +58414,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: @@ -58041,6 +58496,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, ; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: .cfi_def_cfa %rsp, 8 ; NoVLX-NEXT: vzeroupper ; NoVLX-NEXT: retq entry: diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll index f1919cb118c1..32de0254efaa 100644 --- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll +++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll @@ -6,18 +6,14 @@ define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_test_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 +; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_test_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 +; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -33,18 +29,14 @@ entry: define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_test_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 +; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_test_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 +; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -61,9 +53,7 @@ entry: define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_test_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 +; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -71,9 +61,7 @@ define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) lo ; ; I386-LABEL: TEST_mm256_test_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 +; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -90,9 +78,7 @@ entry: define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 
x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_test_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 +; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -100,9 +86,7 @@ define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) lo ; ; I386-LABEL: TEST_mm256_test_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 +; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -119,21 +103,17 @@ entry: define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_test_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} +; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_test_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} +; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -152,21 +132,17 @@ entry: define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_test_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} +; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_test_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} +; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -187,10 +163,8 @@ entry: define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} +; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -198,11 +172,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x ; ; I386-LABEL: TEST_mm256_mask_test_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} +; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -222,10 +194,8 @@ entry: define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} +; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -233,11 +203,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x ; ; I386-LABEL: TEST_mm256_mask_test_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} +; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -256,18 +224,14 @@ entry: define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_testn_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_testn_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -283,18 +247,14 @@ entry: define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_testn_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_testn_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -311,9 +271,7 @@ entry: define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_testn_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -321,9 +279,7 @@ define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) l ; ; I386-LABEL: TEST_mm256_testn_epi64_mask: ; 
I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -340,9 +296,7 @@ entry: define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_testn_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -350,9 +304,7 @@ define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) l ; ; I386-LABEL: TEST_mm256_testn_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -369,21 +321,17 @@ entry: define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_testn_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -402,21 +350,17 @@ entry: define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: retq ; ; I386-LABEL: TEST_mm_mask_testn_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %xmm0, %xmm1, %xmm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: retl @@ -437,10 +381,8 @@ entry: define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} ; 
X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -448,11 +390,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 ; ; I386-LABEL: TEST_mm256_mask_testn_epi64_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper @@ -472,10 +412,8 @@ entry: define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 { ; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask: ; X86_64: # BB#0: # %entry -; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0 -; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86_64-NEXT: kmovw %edi, %k1 -; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} ; X86_64-NEXT: kmovw %k0, %eax ; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; X86_64-NEXT: vzeroupper @@ -483,11 +421,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 ; ; I386-LABEL: TEST_mm256_mask_testn_epi32_mask: ; I386: # BB#0: # %entry -; I386-NEXT: vpand %ymm0, %ymm1, %ymm0 -; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; I386-NEXT: kmovw %eax, %k1 -; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1} ; I386-NEXT: kmovw %k0, %eax ; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; I386-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll new file mode 100644 index 000000000000..ab4cbeb8d5ee --- /dev/null +++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s + +define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_mm_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %a to <4 x i32> + %1 = bitcast <2 x i64> %b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %4 = bitcast <8 x i1> %3 to i8 + %conv.i = zext i8 %4 to i64 + %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0 + %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %vecinit1.i.i +} + +define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_mm256_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = icmp eq <4 x i64> %a, %b + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = bitcast <8 x i1> %1 to i8 + %conv.i = zext i8 %2 to i64 + %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0 + %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> 
zeroinitializer + ret <4 x i64> %vecinit3.i.i +} + +define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer + %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64> + ret <2 x i64> %4 +} + +define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm256_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0 + %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer + %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64> + ret <4 x i64> %4 +} + + diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index e197713c6793..c48222000c6b 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -439,6 +439,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { ; AVX512F-NEXT: movl (%rsp), %eax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll index f6cfbbb40440..f5fe395eaf3d 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -594,6 +594,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { ; AVX512F-NEXT: movl (%rsp), %eax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1239,6 +1240,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp +; AVX1-NEXT: .cfi_def_cfa %rsp, 8 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1457,6 +1459,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp +; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1499,6 +1502,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { ; AVX512F-NEXT: orq %rcx, %rax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 4ed55ac0919e..1959000b859f 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -321,11 
+321,17 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 48 ; AVX512-NEXT: popq %r12 +; AVX512-NEXT: .cfi_def_cfa_offset 40 ; AVX512-NEXT: popq %r13 +; AVX512-NEXT: .cfi_def_cfa_offset 32 ; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 24 ; AVX512-NEXT: popq %r15 +; AVX512-NEXT: .cfi_def_cfa_offset 16 ; AVX512-NEXT: popq %rbp +; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq %1 = bitcast i16 %a0 to <16 x i1> %2 = zext <16 x i1> %1 to <16 x i8> diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index ee2dac1d466e..76160517546c 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -204,6 +204,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512F-NEXT: movl (%rsp), %eax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll index 2b73c6e16bd0..ef981080bb35 100644 --- a/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-setcc-512.ll @@ -203,6 +203,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: movl (%rsp), %eax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -769,6 +770,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp +; AVX1-NEXT: .cfi_def_cfa %rsp, 8 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -983,6 +985,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp +; AVX2-NEXT: .cfi_def_cfa %rsp, 8 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1021,6 +1024,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-NEXT: orq %rcx, %rax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: .cfi_def_cfa %rsp, 8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/test/CodeGen/X86/bool-vector.ll b/test/CodeGen/X86/bool-vector.ll index eb40744c54d1..692d992df76e 100644 --- a/test/CodeGen/X86/bool-vector.ll +++ b/test/CodeGen/X86/bool-vector.ll @@ -93,6 +93,7 @@ define i32 @PR15215_good(<4 x i32> %input) { ; X32-NEXT: leal (%eax,%edx,4), %eax ; X32-NEXT: leal (%eax,%esi,8), %eax ; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X32-SSE2-LABEL: PR15215_good: @@ -115,6 +116,7 @@ define i32 @PR15215_good(<4 x i32> %input) { ; X32-SSE2-NEXT: leal (%eax,%edx,4), %eax ; X32-SSE2-NEXT: leal (%eax,%esi,8), %eax ; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE2-NEXT: retl ; ; X32-AVX2-LABEL: PR15215_good: @@ -134,6 +136,7 @@ define i32 @PR15215_good(<4 x i32> %input) { ; X32-AVX2-NEXT: leal (%eax,%edx,4), %eax ; X32-AVX2-NEXT: leal (%eax,%esi,8), %eax ; X32-AVX2-NEXT: popl %esi +; X32-AVX2-NEXT: .cfi_def_cfa_offset 4 ; X32-AVX2-NEXT: retl ; ; X64-LABEL: PR15215_good: diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll index 2a8236cf093f..fc7b192c2f81 100644 --- a/test/CodeGen/X86/broadcastm-lowering.ll +++ b/test/CodeGen/X86/broadcastm-lowering.ll @@ -80,8 +80,7 @@ define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) { ; 
AVX512CD-LABEL: test_mm512_epi32: ; AVX512CD: # BB#0: # %entry ; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512CD-NEXT: kmovw %k0, %eax -; AVX512CD-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512VLCDBW-LABEL: test_mm512_epi32: @@ -110,9 +109,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) { ; AVX512CD-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512CD-NEXT: kmovw %k0, %eax -; AVX512CD-NEXT: movzbl %al, %eax -; AVX512CD-NEXT: vpbroadcastq %rax, %zmm0 +; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512VLCDBW-LABEL: test_mm512_epi64: diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll index 82e133d25767..6f9abae6a715 100644 --- a/test/CodeGen/X86/cmp.ll +++ b/test/CodeGen/X86/cmp.ll @@ -247,10 +247,13 @@ define i32 @test12() ssp uwtable { ; CHECK-NEXT: # BB#1: # %T ; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00] ; CHECK-NEXT: popq %rcx # encoding: [0x59] +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq # encoding: [0xc3] ; CHECK-NEXT: .LBB12_2: # %F +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00] ; CHECK-NEXT: popq %rcx # encoding: [0x59] +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %tmp1 = call zeroext i1 @test12b() diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 9f7f8a97dc20..c5f03dbd5a31 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) { ; SSE: # BB#0: ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: packusdw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_trunc_lshr0: diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll index c6a1c07922e3..9237544ea95c 100644 --- a/test/CodeGen/X86/compress_expand.ll +++ b/test/CodeGen/X86/compress_expand.ll @@ -140,9 +140,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) { ; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 ; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL-NEXT: kshiftlw $8, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} ; KNL-NEXT: retq call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask) diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll index 3c312a926695..f4561fcbd35a 100644 --- a/test/CodeGen/X86/emutls-pie.ll +++ b/test/CodeGen/X86/emutls-pie.ll @@ -18,13 +18,16 @@ define i32 @my_get_xyz() { ; X32-NEXT: calll my_emutls_get_address@PLT ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; X64-LABEL: my_get_xyz: ; X64: movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi ; X64-NEXT: callq my_emutls_get_address@PLT ; X64-NEXT: movl (%rax), %eax ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: @@ -44,13 +47,16 @@ define i32 @f1() { ; X32-NEXT: calll __emutls_get_address@PLT ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $8, %esp +; X32-NEXT: 
.cfi_def_cfa_offset 8 ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; X64-LABEL: f1: ; X64: leaq __emutls_v.i(%rip), %rdi ; X64-NEXT: callq __emutls_get_address@PLT ; X64-NEXT: movl (%rax), %eax ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: diff --git a/test/CodeGen/X86/emutls.ll b/test/CodeGen/X86/emutls.ll index 8c0ba903659b..2321cd2fc284 100644 --- a/test/CodeGen/X86/emutls.ll +++ b/test/CodeGen/X86/emutls.ll @@ -16,12 +16,14 @@ define i32 @my_get_xyz() { ; X32-NEXT: calll my_emutls_get_address ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; X64-LABEL: my_get_xyz: ; X64: movl $my_emutls_v_xyz, %edi ; X64-NEXT: callq my_emutls_get_address ; X64-NEXT: movl (%rax), %eax ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: @@ -45,12 +47,14 @@ define i32 @f1() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; X64-LABEL: f1: ; X64: movl $__emutls_v.i1, %edi ; X64-NEXT: callq __emutls_get_address ; X64-NEXT: movl (%rax), %eax ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: @@ -63,11 +67,13 @@ define i32* @f2() { ; X32: movl $__emutls_v.i1, (%esp) ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; X64-LABEL: f2: ; X64: movl $__emutls_v.i1, %edi ; X64-NEXT: callq __emutls_get_address ; X64-NEXT: popq %rcx +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: @@ -92,6 +98,7 @@ define i32* @f4() { ; X32: movl $__emutls_v.i2, (%esp) ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -116,6 +123,7 @@ define i32* @f6() { ; X32: movl $__emutls_v.i3, (%esp) ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -128,6 +136,7 @@ define i32 @f7() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -140,6 +149,7 @@ define i32* @f8() { ; X32: movl $__emutls_v.i4, (%esp) ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -152,6 +162,7 @@ define i32 @f9() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -164,6 +175,7 @@ define i32* @f10() { ; X32: movl $__emutls_v.i5, (%esp) ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -176,6 +188,7 @@ define i16 @f11() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movzwl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -189,6 +202,7 @@ define i32 @f12() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movswl (%eax), %eax ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -203,6 +217,7 @@ define i8 @f13() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movb (%eax), %al ; X32-NEXT: addl $12, %esp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: @@ -216,6 +231,7 @@ define i32 @f14() { ; X32-NEXT: calll __emutls_get_address ; X32-NEXT: movsbl (%eax), %eax ; X32-NEXT: addl $12, %esp +; 
X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl entry: diff --git a/test/CodeGen/X86/epilogue-cfi-fp.ll b/test/CodeGen/X86/epilogue-cfi-fp.ll new file mode 100644 index 000000000000..c2fe1c7eaac3 --- /dev/null +++ b/test/CodeGen/X86/epilogue-cfi-fp.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 %s -o - | FileCheck %s + +; ModuleID = 'epilogue-cfi-fp.c' +source_filename = "epilogue-cfi-fp.c" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i686-pc-linux" + +; Function Attrs: noinline nounwind +define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) #0 { + +; CHECK-LABEL: foo: +; CHECK: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + %l.addr = alloca i32, align 4 + %m.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + store i32 %k, i32* %k.addr, align 4 + store i32 %l, i32* %l.addr, align 4 + store i32 %m, i32* %m.addr, align 4 + ret i32 0 +} + +attributes #0 = { "no-frame-pointer-elim"="true" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6, !7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "epilogue-cfi-fp.c", directory: "epilogue-dwarf/test") +!2 = !{} +!3 = !{i32 1, !"NumRegisterParameters", i32 0} +!4 = !{i32 2, !"Dwarf Version", i32 4} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} + diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll new file mode 100644 index 000000000000..79d6f478de8a --- /dev/null +++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll @@ -0,0 +1,46 @@ +; RUN: llc -O0 < %s | FileCheck %s + +; ModuleID = 'epilogue-cfi-no-fp.c' +source_filename = "epilogue-cfi-no-fp.c" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i686-pc-linux" + +; Function Attrs: noinline nounwind +define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) { +; CHECK-LABEL: foo: +; CHECK: addl $20, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: popl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl +entry: + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + %k.addr = alloca i32, align 4 + %l.addr = alloca i32, align 4 + %m.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + store i32 %k, i32* %k.addr, align 4 + store i32 %l, i32* %l.addr, align 4 + store i32 %m, i32* %m.addr, align 4 + ret i32 0 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6, !7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "epilogue-cfi-no-fp.c", directory: "epilogue-dwarf/test") +!2 = !{} +!3 = !{i32 1, !"NumRegisterParameters", i32 0} +!4 = !{i32 2, !"Dwarf 
Version", i32 4} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} + + diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll index 712fe810d2a9..64f8fd0ca8d6 100644 --- a/test/CodeGen/X86/f16c-intrinsics.ll +++ b/test/CodeGen/X86/f16c-intrinsics.ll @@ -1,33 +1,81 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { ; X32-LABEL: test_x86_vcvtph2ps_128: ; X32: # BB#0: -; X32-NEXT: vcvtph2ps %xmm0, %xmm0 -; X32-NEXT: retl +; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128: ; X64: # BB#0: -; X64-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-NEXT: retq +; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly +define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) { +; X32-LABEL: test_x86_vcvtph2ps_128_m: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X32-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_vcvtph2ps_128_m: +; X64: # BB#0: +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load <8 x i16>, <8 x i16>* %a + %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} define <8 x 
float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) { ; X32-LABEL: test_x86_vcvtph2ps_256: ; X32: # BB#0: -; X32-NEXT: vcvtph2ps %xmm0, %ymm0 -; X32-NEXT: retl +; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_256: ; X64: # BB#0: -; X64-NEXT: vcvtph2ps %xmm0, %ymm0 -; X64-NEXT: retq +; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -36,15 +84,26 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind { ; X32-LABEL: test_x86_vcvtph2ps_256_m: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtph2ps (%eax), %ymm0 -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_256_m: ; X64: # BB#0: -; X64-NEXT: vcvtph2ps (%rdi), %ymm0 -; X64-NEXT: retq - %load = load <8 x i16>, <8 x i16>* %a, align 16 +; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] + %load = load <8 x i16>, <8 x i16>* %a %res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load) ret <8 x float> %res } @@ -52,13 +111,23 @@ define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind { define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) { ; X32-LABEL: test_x86_vcvtps2ph_128: ; X32: # BB#0: -; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; X32-NEXT: retl +; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128: ; X64: # BB#0: -; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x79,0x1d,0xc0,0x00] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -67,15 +136,27 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) { ; X32-LABEL: test_x86_vcvtps2ph_256: ; X32: # BB#0: -; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_256: ; X64: # BB#0: -; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00] +; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } @@ -84,14 +165,25 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) { ; X32-LABEL: test_x86_vcvtps2ph_128_scalar: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtph2ps (%eax), %xmm0 -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_scalar: ; X64: # BB#0: -; X64-NEXT: vcvtph2ps (%rdi), %xmm0 -; X64-NEXT: retq +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %load = load i64, i64* %ptr %ins1 = insertelement <2 x i64> undef, i64 %load, i32 0 %ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1 @@ -103,14 +195,25 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) { define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) { ; X32-LABEL: test_x86_vcvtps2ph_128_scalar2: ; X32: # BB#0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtph2ps (%eax), %xmm0 -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: 
[0xc4,0xe2,0x79,0x13,0x00] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_scalar2: ; X64: # BB#0: -; X64-NEXT: vcvtph2ps (%rdi), %xmm0 -; X64-NEXT: retq +; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2: +; X32-AVX512VL: # BB#0: +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2: +; X64-AVX512VL: # BB#0: +; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %load = load i64, i64* %ptr %ins = insertelement <2 x i64> undef, i64 %load, i32 0 %bc = bitcast <2 x i64> %ins to <8 x i16> @@ -121,16 +224,29 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) { define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind { ; X32-LABEL: test_x86_vcvtps2ph_256_m: ; X32: # BB#0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) -; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] +; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_256_m: ; X64: # BB#0: # %entry -; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m: +; X32-AVX512VL: # BB#0: # %entry +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03] +; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m: +; X64-AVX512VL: # BB#0: # %entry +; X64-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03] +; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a, i32 3) store <8 x i16> %0, <8 x i16>* %d, align 16 @@ -140,14 +256,31 @@ entry: define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind { ; X32-LABEL: test_x86_vcvtps2ph_128_m: ; X32: # BB#0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_m: ; X64: # BB#0: # %entry -; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: +; X32-AVX512VL: # BB#0: # 
%entry +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03] +; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m: +; X64-AVX512VL: # BB#0: # %entry +; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03] +; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3) %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> @@ -158,14 +291,25 @@ entry: define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 { ; X32-LABEL: test_x86_vcvtps2ph_128_m2: ; X32: # BB#0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_m2: ; X64: # BB#0: # %entry -; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2: +; X32-AVX512VL: # BB#0: # %entry +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2: +; X64-AVX512VL: # BB#0: # %entry +; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3) %1 = bitcast <8 x i16> %0 to <2 x double> @@ -177,14 +321,25 @@ entry: define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 { ; X32-LABEL: test_x86_vcvtps2ph_128_m3: ; X32: # BB#0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) -; X32-NEXT: retl +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X32-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128_m3: ; X64: # BB#0: # %entry -; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) -; X64-NEXT: retq +; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] +; X64-NEXT: retq # encoding: [0xc3] +; +; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3: +; X32-AVX512VL: # BB#0: # %entry +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x04] +; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03] +; X32-AVX512VL-NEXT: retl # encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3: +; X64-AVX512VL: # BB#0: # %entry +; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03] +; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3) %1 = bitcast <8 x i16> %0 to <2 x i64> diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll index 3e69710868b6..57b50abab535 100644 --- a/test/CodeGen/X86/fast-isel-int-float-conversion.ll +++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll @@ -31,6 +31,7 @@ define double @int_to_double_rr(i32 %a) { ; SSE2_X86-NEXT: fldl (%esp) ; SSE2_X86-NEXT: movl %ebp, %esp ; SSE2_X86-NEXT: popl %ebp +; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4 ; SSE2_X86-NEXT: retl ; ; AVX_X86-LABEL: int_to_double_rr: @@ -47,6 +48,7 @@ define double @int_to_double_rr(i32 %a) { ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp ; AVX_X86-NEXT: popl %ebp +; AVX_X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX_X86-NEXT: retl entry: %0 = sitofp i32 %a to double @@ -80,6 +82,7 @@ define double @int_to_double_rm(i32* %a) { ; SSE2_X86-NEXT: fldl (%esp) ; SSE2_X86-NEXT: movl %ebp, %esp ; SSE2_X86-NEXT: popl %ebp +; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4 ; SSE2_X86-NEXT: retl ; ; AVX_X86-LABEL: int_to_double_rm: @@ -97,6 +100,7 @@ define double @int_to_double_rm(i32* %a) { ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp ; AVX_X86-NEXT: popl %ebp +; AVX_X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX_X86-NEXT: retl entry: %0 = load i32, i32* %a @@ -130,6 +134,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize { ; SSE2_X86-NEXT: fldl (%esp) ; SSE2_X86-NEXT: movl %ebp, %esp ; SSE2_X86-NEXT: popl %ebp +; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4 ; SSE2_X86-NEXT: retl ; ; AVX_X86-LABEL: int_to_double_rm_optsize: @@ -147,6 +152,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize { ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp ; AVX_X86-NEXT: popl %ebp +; AVX_X86-NEXT: .cfi_def_cfa %esp, 4 ; AVX_X86-NEXT: retl entry: %0 = load i32, i32* %a @@ -174,6 +180,7 @@ define float @int_to_float_rr(i32 %a) { ; SSE2_X86-NEXT: movss %xmm0, (%esp) ; SSE2_X86-NEXT: flds (%esp) ; SSE2_X86-NEXT: popl %eax +; SSE2_X86-NEXT: .cfi_def_cfa_offset 4 ; SSE2_X86-NEXT: retl ; ; AVX_X86-LABEL: int_to_float_rr: @@ -184,6 +191,7 @@ define float @int_to_float_rr(i32 %a) { ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax +; AVX_X86-NEXT: .cfi_def_cfa_offset 4 ; AVX_X86-NEXT: retl entry: %0 = sitofp i32 %a to float @@ -211,6 +219,7 @@ define float @int_to_float_rm(i32* %a) { ; SSE2_X86-NEXT: movss %xmm0, (%esp) ; SSE2_X86-NEXT: flds (%esp) ; SSE2_X86-NEXT: popl %eax +; SSE2_X86-NEXT: .cfi_def_cfa_offset 4 ; SSE2_X86-NEXT: retl ; ; AVX_X86-LABEL: int_to_float_rm: @@ -222,6 +231,7 @@ define float @int_to_float_rm(i32* %a) { ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax +; AVX_X86-NEXT: .cfi_def_cfa_offset 4 ; AVX_X86-NEXT: retl entry: %0 = load i32, i32* %a @@ -249,6 +259,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize { ; SSE2_X86-NEXT: movss %xmm0, (%esp) ; SSE2_X86-NEXT: flds (%esp) ; SSE2_X86-NEXT: popl %eax +; SSE2_X86-NEXT: .cfi_def_cfa_offset 4 ; SSE2_X86-NEXT: retl 
; ; AVX_X86-LABEL: int_to_float_rm_optsize: @@ -260,6 +271,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize { ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax +; AVX_X86-NEXT: .cfi_def_cfa_offset 4 ; AVX_X86-NEXT: retl entry: %0 = load i32, i32* %a diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll index e359e6205636..e2412e9c5c04 100644 --- a/test/CodeGen/X86/fast-isel-store.ll +++ b/test/CodeGen/X86/fast-isel-store.ll @@ -375,6 +375,7 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double ; SSE64-NEXT: movupd %xmm0, (%eax) ; SSE64-NEXT: movupd %xmm1, 16(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVX32-LABEL: test_store_4xf64: @@ -413,6 +414,7 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 ; SSE64-NEXT: movapd %xmm0, (%eax) ; SSE64-NEXT: movapd %xmm1, 16(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVX32-LABEL: test_store_4xf64_aligned: @@ -452,6 +454,7 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va ; SSE64-NEXT: movups %xmm2, 32(%eax) ; SSE64-NEXT: movups %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_16xi32: @@ -501,6 +504,7 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x ; SSE64-NEXT: movaps %xmm2, 32(%eax) ; SSE64-NEXT: movaps %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_16xi32_aligned: @@ -550,6 +554,7 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa ; SSE64-NEXT: movups %xmm2, 32(%eax) ; SSE64-NEXT: movups %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_16xf32: @@ -599,6 +604,7 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1 ; SSE64-NEXT: movaps %xmm2, 32(%eax) ; SSE64-NEXT: movaps %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_16xf32_aligned: @@ -656,6 +662,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double ; SSE64-NEXT: movupd %xmm2, 32(%eax) ; SSE64-NEXT: movupd %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_8xf64: @@ -682,6 +689,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double ; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax) ; AVXONLY64-NEXT: movl %ebp, %esp ; AVXONLY64-NEXT: popl %ebp +; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4 ; AVXONLY64-NEXT: retl ; ; AVX51232-LABEL: test_store_8xf64: @@ -729,6 +737,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 ; SSE64-NEXT: movapd %xmm2, 32(%eax) ; SSE64-NEXT: movapd %xmm3, 48(%eax) ; SSE64-NEXT: addl $12, %esp +; SSE64-NEXT: .cfi_def_cfa_offset 4 ; SSE64-NEXT: retl ; ; AVXONLY32-LABEL: test_store_8xf64_aligned: @@ -755,6 +764,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 ; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax) ; AVXONLY64-NEXT: movl %ebp, %esp ; AVXONLY64-NEXT: popl %ebp +; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4 ; AVXONLY64-NEXT: retl ; ; AVX51232-LABEL: test_store_8xf64_aligned: diff 
--git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll index 68f39469a82c..362864f72a9d 100644 --- a/test/CodeGen/X86/fma-intrinsics-x86.ll +++ b/test/CodeGen/X86/fma-intrinsics-x86.ll @@ -1,29 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA -; RUN: llc < %s -mtriple=x86_64-pc-windows -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL +; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4 ; VFMADD define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -31,21 +34,27 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; 
CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca] +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6a,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) ret <4 x float> %res } @@ -54,20 +63,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -75,21 +89,27 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x 
double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca] +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6b,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) ret <2 x double> %res } @@ -98,20 +118,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x do define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -120,20 +145,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float 
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -142,20 +172,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x do define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -164,20 +199,25 @@ declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x f define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256: ; 
CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } @@ -187,20 +227,25 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -208,21 +253,27 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213ss 
%xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca] +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6e,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) ret <4 x float> %res } @@ -231,20 +282,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -252,21 +308,27 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovapd %xmm1, 
%xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca] +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6f,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) ret <2 x double> %res } @@ -275,20 +337,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x do define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -297,20 +364,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213pd %xmm2, 
%xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -319,20 +391,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x do define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -341,20 +418,25 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2] +; CHECK-FMA-NEXT: retq 
# encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } @@ -364,20 +446,25 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -385,21 +472,27 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca] +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; 
CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7a,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) ret <4 x float> %res } @@ -408,20 +501,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -429,21 +527,27 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca] +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 
# encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7b,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) ret <2 x double> %res } @@ -452,20 +556,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x d define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -474,20 +583,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2] +; 
CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -496,20 +610,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -518,20 +637,25 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256: +; CHECK-AVX512VL: # BB#0: +; 
CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } @@ -541,20 +665,25 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -562,21 +691,27 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca] +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: 
vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] +; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7e,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) ret <4 x float> %res } @@ -585,20 +720,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a] +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01] +; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -606,21 +746,27 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 -; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca] +; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; 
CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] +; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_sd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7f,0xc2,0x00] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) ret <2 x double> %res } @@ -629,20 +775,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -651,20 +802,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: 
test_x86_fma_vfnmsub_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -673,20 +829,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x d define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -695,20 +856,25 @@ declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } @@ -718,20 +884,25 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -740,20 +911,25 @@ declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x fl define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; 
CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -762,20 +938,25 @@ declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -784,20 +965,25 @@ declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256: ; CHECK-FMA-WIN: # 
BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } @@ -807,20 +993,25 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 -; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } @@ -829,20 +1020,25 @@ declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x fl define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } @@ -851,20 +1047,25 @@ declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 -; CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ret <8 x float> %res } @@ -873,20 +1074,25 @@ declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256: ; CHECK-FMA: # BB#0: -; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 -; CHECK-FMA-NEXT: retq +; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2] +; CHECK-FMA-NEXT: retq # encoding: [0xc3] +; +; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256: +; CHECK-AVX512VL: # BB#0: +; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2] +; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256: ; CHECK-FMA-WIN: # BB#0: -; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 -; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 -; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 -; 
CHECK-FMA-WIN-NEXT: retq +; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09] +; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02] +; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00] +; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3] ; ; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256: ; CHECK-FMA4: # BB#0: -; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 -; CHECK-FMA4-NEXT: retq +; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10] +; CHECK-FMA4-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ret <4 x double> %res } diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll index ba80c839fdda..ee64790d1d94 100644 --- a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll @@ -18,11 +18,15 @@ entry: } ; CHECK-LABEL: noDebug -; CHECK: addq $24, %rsp -; CHECK: popq %rbx -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: retq - +; CHECK: addq $16, %rsp +; CHECK-NEXT: .cfi_adjust_cfa_offset -16 +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq define void @withDebug() !dbg !18 { entry: @@ -42,9 +46,11 @@ entry: ; CHECK-LABEL: withDebug ; CHECK: callq printf ; CHECK: callq printf -; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: addq $16, %rsp ; CHECK: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll index f9ecf707810b..de9d6bf93d6c 100644 --- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll +++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll @@ -9,6 +9,7 @@ define i64 @fn1NoDebug(i64 %a) { ; CHECK-LABEL: fn1NoDebug ; CHECK: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: ret define i64 @fn1WithDebug(i64 %a) !dbg !4 { @@ -19,6 +20,7 @@ define i64 @fn1WithDebug(i64 %a) !dbg !4 { ; CHECK-LABEL: fn1WithDebug ; CHECK: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: ret %struct.Buffer = type { i8, [63 x i8] } @@ -33,6 +35,7 @@ define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) { ; CHECK-NOT: sub ; CHECK: mov ; CHECK-NEXT: pop +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: ret define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 { @@ -46,6 +49,7 @@ define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 { ; CHECK-NOT: sub ; CHECK: mov ; CHECK-NEXT: pop +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: ret declare i64 @fn(i64, i64) diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll index e32c7452b0c0..7126fb233e65 100644 --- a/test/CodeGen/X86/haddsub-2.ll +++ b/test/CodeGen/X86/haddsub-2.ll @@ -724,11 +724,17 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE3-NEXT: popq %rbx +; SSE3-NEXT: .cfi_def_cfa_offset 48 ; SSE3-NEXT: popq %r12 +; SSE3-NEXT: .cfi_def_cfa_offset 40 ; SSE3-NEXT: popq %r13 +; 
SSE3-NEXT: .cfi_def_cfa_offset 32 ; SSE3-NEXT: popq %r14 +; SSE3-NEXT: .cfi_def_cfa_offset 24 ; SSE3-NEXT: popq %r15 +; SSE3-NEXT: .cfi_def_cfa_offset 16 ; SSE3-NEXT: popq %rbp +; SSE3-NEXT: .cfi_def_cfa_offset 8 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: avx2_vphadd_w_test: @@ -1351,11 +1357,17 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) { ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE3-NEXT: popq %rbx +; SSE3-NEXT: .cfi_def_cfa_offset 48 ; SSE3-NEXT: popq %r12 +; SSE3-NEXT: .cfi_def_cfa_offset 40 ; SSE3-NEXT: popq %r13 +; SSE3-NEXT: .cfi_def_cfa_offset 32 ; SSE3-NEXT: popq %r14 +; SSE3-NEXT: .cfi_def_cfa_offset 24 ; SSE3-NEXT: popq %r15 +; SSE3-NEXT: .cfi_def_cfa_offset 16 ; SSE3-NEXT: popq %rbp +; SSE3-NEXT: .cfi_def_cfa_offset 8 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: avx2_hadd_w: diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll index efe07cf6301e..ce2d0e9c6717 100644 --- a/test/CodeGen/X86/hipe-cc64.ll +++ b/test/CodeGen/X86/hipe-cc64.ll @@ -87,6 +87,7 @@ define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 { ; CHECK-NEXT: movl $47, %ecx ; CHECK-NEXT: movl $63, %r8d ; CHECK-NEXT: popq %rax + ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: jmp tailcallee %ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15, i64 31, i64 47, i64 63, i64 79) #1 diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll new file mode 100644 index 000000000000..8f5aac493b54 --- /dev/null +++ b/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -0,0 +1,1896 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: 
pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp sgt <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; 
X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp sgt <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v8i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd 
%xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v8i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> 
%EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = 
shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: 
test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: 
+; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp sgt <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: 
por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v16i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; 
X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v16i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; 
X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5 +; X86-SSE42-NEXT: movdqa 
%xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm6 +; X64-SSE2-NEXT: por %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: 
pandn %xmm3, %xmm8 +; X64-SSE2-NEXT: por %xmm1, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 +; X64-SSE2-NEXT: pandn %xmm8, %xmm1 +; X64-SSE2-NEXT: por %xmm6, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; 
X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: 
vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm5 +; X64-SSE2-NEXT: por %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm4 +; X64-SSE2-NEXT: pandn %xmm5, %xmm0 +; X64-SSE2-NEXT: por %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: 
vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v32i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0 +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: 
vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v32i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0 +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef> + %5 = icmp sgt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxsb %xmm3, 
%xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm5 +; X64-SSE2-NEXT: por %xmm1, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm4 +; X64-SSE2-NEXT: pandn %xmm5, %xmm0 +; X64-SSE2-NEXT: por %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; 
X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 
33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp sgt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp sgt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp sgt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp sgt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp sgt <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp sgt <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll new file mode 100644 index 000000000000..6feb963426bb --- /dev/null +++ b/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -0,0 +1,1898 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp slt <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd 
%xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp slt <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v8i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: 
vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v8i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, 
%eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <16 x i8> %3, %4 + %6 = select <16 x 
i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; 
X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: 
vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp slt <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; 
X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v16i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; 
X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v16i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, 
<16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb 
%xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; 
+; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp slt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE42-NEXT: pcmpgtq 
%xmm1, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: pxor %xmm9, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm9, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: 
pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm9, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm9, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm9, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm9 +; X64-SSE2-NEXT: movdqa %xmm9, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movapd %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: 
vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminsd %xmm3, %xmm1 +; X86-SSE42-NEXT: pminsd %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd 
%xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm4 +; X64-SSE2-NEXT: por %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm5 +; X64-SSE2-NEXT: pandn %xmm4, %xmm0 +; X64-SSE2-NEXT: por %xmm5, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsd %xmm3, %xmm1 +; X64-SSE42-NEXT: pminsd %xmm2, %xmm0 +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE-LABEL: test_reduce_v32i16: +; X86-SSE: ## BB#0: +; X86-SSE-NEXT: pminsw %xmm3, %xmm1 +; X86-SSE-NEXT: pminsw %xmm2, %xmm0 +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE-NEXT: psrld $16, %xmm1 +; X86-SSE-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE-NEXT: movd %xmm1, %eax +; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> 
%EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: test_reduce_v32i16: +; X64-SSE: ## BB#0: +; X64-SSE-NEXT: pminsw %xmm3, %xmm1 +; X64-SSE-NEXT: pminsw %xmm2, %xmm0 +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE-NEXT: psrld $16, %xmm1 +; X64-SSE-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE-NEXT: movd %xmm1, %eax +; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp slt <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; 
X86-SSE42-NEXT: pminsb %xmm3, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm4 +; X64-SSE2-NEXT: por %xmm1, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0 +; X64-SSE2-NEXT: pand %xmm0, %xmm5 +; X64-SSE2-NEXT: pandn %xmm4, %xmm0 +; X64-SSE2-NEXT: por %xmm5, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm2 +; X64-SSE2-NEXT: por %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; 
X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminsb %xmm3, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm2, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x 
i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp slt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp slt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp slt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp slt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp slt <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp slt <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll new file mode 100644 index 000000000000..ee9d8955cb56 --- /dev/null +++ b/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -0,0 +1,2203 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; 
X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp ugt <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxud 
%xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp ugt <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pxor %xmm1, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm1, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 
+; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: movd %xmm3, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; 
X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 
%13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE42-NEXT: pxor %xmm3, %xmm4 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; 
X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp ugt <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movd %xmm3, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax 
+; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; 
X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; 
X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; 
X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: por %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648] +; 
X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE42-NEXT: pxor %xmm6, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE42-NEXT: pxor %xmm6, %xmm7 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm6, %xmm1 +; X86-SSE42-NEXT: movapd %xmm2, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm6, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm6, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm6 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx 
+; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por %xmm6, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm9, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm6 +; X64-SSE2-NEXT: por %xmm0, %xmm6 +; X64-SSE2-NEXT: pand %xmm8, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm8 +; X64-SSE2-NEXT: por %xmm1, %xmm8 +; X64-SSE2-NEXT: movdqa %xmm8, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm6 +; X64-SSE2-NEXT: pandn %xmm8, %xmm1 +; X64-SSE2-NEXT: por %xmm6, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE42-NEXT: pxor %xmm6, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm2, %xmm7 +; X64-SSE42-NEXT: pxor %xmm6, %xmm7 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movapd %xmm3, %xmm1 +; X64-SSE42-NEXT: xorpd %xmm6, %xmm1 +; X64-SSE42-NEXT: movapd %xmm2, %xmm0 +; X64-SSE42-NEXT: 
xorpd %xmm6, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm6, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm6 +; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, 
<8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm7 +; X86-SSE2-NEXT: por %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm6, %xmm1 +; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxud 
%ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm7 +; X64-SSE2-NEXT: por %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm7, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: 
vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm7 +; X86-SSE2-NEXT: por %xmm0, %xmm7 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm7 +; X86-SSE2-NEXT: pandn %xmm6, %xmm1 +; X86-SSE2-NEXT: por %xmm7, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, 
%xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm7 +; X64-SSE2-NEXT: por %xmm0, %xmm7 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; 
X64-SSE2-NEXT: pand %xmm1, %xmm7 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm7, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub 
%xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1 +; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, 
%xmm1 +; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ugt <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 
23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ugt <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ugt <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ugt <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ugt <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp ugt <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll new file mode 100644 index 000000000000..433696730420 --- /dev/null +++ b/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -0,0 +1,2207 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512 + +; +; 128-bit Vectors +; + +define i64 @test_reduce_v2i64(<2 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v2i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: por %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm3 +; X86-SSE2-NEXT: por %xmm0, %xmm3 +; X86-SSE2-NEXT: movd %xmm3, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v2i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE42-NEXT: pxor %xmm0, %xmm3 +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: 
pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v2i64: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v2i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm3 +; X64-SSE2-NEXT: por %xmm0, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v2i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v2i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v2i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v2i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: retq + %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> + %2 = icmp ult <2 x i64> %a0, %1 + %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1 + %4 = extractelement <2 x i64> %3, i32 0 + ret i64 %4 +} + +define i32 @test_reduce_v4i32(<4 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v4i32: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v4i32: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp ult <4 x i32> %a0, %1 + %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <4 x i32> %3, %4 + %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4 + %7 = extractelement <4 x i32> %6, i32 0 + ret i32 %7 +} + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; 
X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v8i16: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovd %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE2-NEXT: pxor %xmm1, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm1, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X64-SSE2-NEXT: pxor %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; 
X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v8i16: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovd %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <8 x i16> %a0, %1 + %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1 + %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <8 x i16> %3, %4 + %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4 + %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <8 x i16> %6, %7 + %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7 + %10 = extractelement <8 x i16> %9, i32 0 + ret i16 %10 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: test_reduce_v16i8: +; X86-AVX: ## BB#0: +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 
+; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX-LABEL: test_reduce_v16i8: +; X64-AVX: ## BB#0: +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1 + %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <16 x i8> %3, %4 + %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4 + %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <16 x i8> %6, %7 + %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7 + %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <16 x i8> %9, %10 + %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10 + %13 = extractelement <16 x i8> %12, i32 0 + ret i8 %13 +} + +; +; 256-bit Vectors +; + +define i64 @test_reduce_v4i64(<4 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v4i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm6, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; 
X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v4i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm2, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X86-SSE42-NEXT: movd %xmm2, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v4i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v4i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v4i64: +; X64-SSE2: ## BB#0: +; 
X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm6, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movq %xmm2, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v4i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE42-NEXT: pxor %xmm3, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v4i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v4i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; 
X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v4i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %2 = icmp ult <4 x i64> %a0, %1 + %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1 + %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <4 x i64> %3, %4 + %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4 + %7 = extractelement <4 x i64> %6, i32 0 + ret i64 %7 +} + +define i32 @test_reduce_v8i32(<8 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminud %ymm1, 
%ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm3, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <8 x i32> %a0, %1 + %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1 + %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <8 x i32> %3, %4 + %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4 + %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <8 x i32> %6, %7 + %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7 + %10 = extractelement <8 x i32> %9, i32 0 + ret i32 %10 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm4, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE2-NEXT: pxor %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm4 +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i16: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; 
+; X64-SSE2-LABEL: test_reduce_v16i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: pandn %xmm1, %xmm4 +; X64-SSE2-NEXT: por %xmm0, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm2, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE2-NEXT: pxor %xmm2, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm4 +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm2 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm4 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm4, %xmm2 +; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: 
vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 + %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <16 x i16> %3, %4 + %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4 + %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <16 x i16> %6, %7 + %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7 + %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <16 x i16> %9, %10 + %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10 + %13 = extractelement <16 x i16> %12, i32 0 + ret i16 %13 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: 
vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: 
test_reduce_v32i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 + %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <32 x i8> %3, %4 + %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4 + %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <32 x i8> %6, %7 + %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7 + %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <32 x i8> %9, %10 + %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10 + %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ult <32 x i8> %12, %13 + %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13 + %16 = extractelement <32 x i8> %15, i32 0 + ret i8 %16 +} + +; +; 512-bit Vectors +; + +define i64 @test_reduce_v8i64(<8 x i64> %a0) { +; X86-SSE2-LABEL: test_reduce_v8i64: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: subl $28, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: 
movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm6 +; X86-SSE2-NEXT: pand %xmm6, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: por %xmm1, %xmm6 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload +; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pxor %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %edx +; X86-SSE2-NEXT: addl $28, %esp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v8i64: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE42-NEXT: pxor %xmm4, %xmm7 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: 
blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm4 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v8i64: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v8i64: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v8i64: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE2-NEXT: pxor %xmm9, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm6 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: por 
%xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm9, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm9, %xmm7 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm8, %xmm7 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] +; X64-SSE2-NEXT: por %xmm7, %xmm6 +; X64-SSE2-NEXT: pand %xmm6, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm6 +; X64-SSE2-NEXT: por %xmm1, %xmm6 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm9, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X64-SSE2-NEXT: pxor %xmm9, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm3, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm6, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm9, %xmm2 +; X64-SSE2-NEXT: pxor %xmm0, %xmm9 +; X64-SSE2-NEXT: movdqa %xmm9, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE2-NEXT: por %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: movq %xmm3, %rax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v8i64: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm7 +; X64-SSE42-NEXT: pxor %xmm4, %xmm7 +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X64-SSE42-NEXT: movdqa %xmm6, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm1 +; X64-SSE42-NEXT: xorpd %xmm4, %xmm1 +; X64-SSE42-NEXT: movapd %xmm3, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm4 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v8i64: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: 
vextractf128 $1, %ymm1, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4 +; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v8i64: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v8i64: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovq %xmm0, %rax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <8 x i64> %a0, %1 + %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1 + %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <8 x i64> %3, %4 + %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4 + %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <8 x i64> %6, %7 + %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7 + %10 = extractelement <8 x i64> %9, i32 0 + ret i64 %10 +} + +define i32 @test_reduce_v16i32(<16 x i32> %a0) { +; X86-SSE2-LABEL: test_reduce_v16i32: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa 
{{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm7 +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm4 +; X86-SSE2-NEXT: por %xmm3, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %eax +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v16i32: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminud %xmm3, %xmm1 +; X86-SSE42-NEXT: pminud %xmm2, %xmm0 +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminud %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v16i32: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i32: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v16i32: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa 
%xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm7 +; X64-SSE2-NEXT: por %xmm1, %xmm7 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm7, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm4 +; X64-SSE2-NEXT: por %xmm3, %xmm4 +; X64-SSE2-NEXT: movd %xmm4, %eax +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v16i32: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminud %xmm3, %xmm1 +; X64-SSE42-NEXT: pminud %xmm2, %xmm0 +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminud %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v16i32: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i32: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i32: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 
8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <16 x i32> %a0, %1 + %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1 + %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <16 x i32> %3, %4 + %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4 + %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <16 x i32> %6, %7 + %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7 + %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <16 x i32> %9, %10 + %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10 + %13 = extractelement <16 x i32> %12, i32 0 + ret i32 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; X86-SSE2-LABEL: test_reduce_v32i16: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm4, %xmm5 +; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm4, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X86-SSE2-NEXT: pxor %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7 +; X86-SSE2-NEXT: pand %xmm7, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm7 +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm0, %xmm4 +; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pandn %xmm0, %xmm4 +; X86-SSE2-NEXT: por %xmm2, %xmm4 +; X86-SSE2-NEXT: movd %xmm4, %eax +; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v32i16: 
+; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminuw %xmm3, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v32i16: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v32i16: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE2-NEXT: pxor %xmm4, %xmm5 +; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X64-SSE2-NEXT: pxor %xmm4, %xmm6 +; X64-SSE2-NEXT: movdqa %xmm3, %xmm7 +; X64-SSE2-NEXT: pxor %xmm4, %xmm7 +; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7 +; X64-SSE2-NEXT: pand %xmm7, %xmm1 +; X64-SSE2-NEXT: pandn %xmm3, %xmm7 +; X64-SSE2-NEXT: por %xmm1, %xmm7 +; X64-SSE2-NEXT: pand %xmm5, %xmm0 +; X64-SSE2-NEXT: pandn %xmm2, %xmm5 +; X64-SSE2-NEXT: por %xmm0, %xmm5 +; X64-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm7, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm1, %xmm5 +; X64-SSE2-NEXT: pandn %xmm7, %xmm1 +; X64-SSE2-NEXT: por %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: pxor %xmm4, %xmm3 +; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pandn %xmm0, %xmm3 +; X64-SSE2-NEXT: por %xmm1, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: pxor %xmm4, %xmm2 +; X64-SSE2-NEXT: 
pcmpgtw %xmm1, %xmm2 +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: pandn %xmm0, %xmm2 +; X64-SSE2-NEXT: por %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm4 +; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4 +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: pandn %xmm0, %xmm4 +; X64-SSE2-NEXT: por %xmm2, %xmm4 +; X64-SSE2-NEXT: movd %xmm4, %eax +; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v32i16: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminuw %xmm3, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 +; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v32i16: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <32 x i16> %a0, %1 + %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 + %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <32 x i16> %3, %4 + %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4 + %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <32 x i16> %6, %7 + %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7 + %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <32 x i16> %9, %10 + %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10 + %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ult <32 x i16> %12, %13 + %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13 + %16 = extractelement <32 x i16> %15, i32 0 + ret i16 %16 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; X86-SSE2-LABEL: test_reduce_v64i8: +; X86-SSE2: ## BB#0: +; X86-SSE2-NEXT: pminub %xmm3, %xmm1 +; X86-SSE2-NEXT: pminub %xmm2, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminub %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: test_reduce_v64i8: +; X86-SSE42: ## BB#0: +; X86-SSE42-NEXT: pminub %xmm3, %xmm1 +; X86-SSE42-NEXT: pminub %xmm2, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, 
%xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-SSE42-NEXT: retl +; +; X86-AVX1-LABEL: test_reduce_v64i8: +; X86-AVX1: ## BB#0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8: +; X86-AVX2: ## BB#0: +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE2-LABEL: test_reduce_v64i8: +; X64-SSE2: ## BB#0: +; X64-SSE2-NEXT: pminub %xmm3, %xmm1 +; X64-SSE2-NEXT: pminub %xmm2, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminub %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE2-NEXT: retq +; +; X64-SSE42-LABEL: test_reduce_v64i8: +; X64-SSE42: ## BB#0: +; X64-SSE42-NEXT: pminub %xmm3, %xmm1 +; X64-SSE42-NEXT: pminub %xmm2, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax +; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-SSE42-NEXT: retq +; +; X64-AVX1-LABEL: test_reduce_v64i8: +; X64-AVX1: ## BB#0: +; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8: +; X64-AVX2: ## BB#0: +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8: +; X64-AVX512: ## BB#0: +; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax +; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq + %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = icmp ult <64 x i8> %a0, %1 + %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 + %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %5 = icmp ult <64 x i8> %3, %4 + %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> 
%4 + %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = icmp ult <64 x i8> %6, %7 + %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7 + %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %11 = icmp ult <64 x i8> %9, %10 + %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10 + %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %14 = icmp ult <64 x i8> %12, %13 + %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13 + %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %17 = icmp ult <64 x i8> %15, %16 + %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16 + %19 = extractelement <64 x i8> %18, i32 0 + ret i8 %19 +} diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll 
b/test/CodeGen/X86/illegal-bitfield-loadstore.ll index fd503aa6c6ee..e3b25a539c1a 100644 --- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -81,6 +81,7 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: i24_insert_bit: diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll index e364b001f945..02782f721083 100644 --- a/test/CodeGen/X86/imul.ll +++ b/test/CodeGen/X86/imul.ll @@ -307,6 +307,7 @@ define i64 @test5(i64 %a) { ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: subl %esi, %edx ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, -31 @@ -362,6 +363,7 @@ define i64 @test7(i64 %a) { ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: subl %esi, %edx ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, -33 @@ -390,6 +392,7 @@ define i64 @testOverflow(i64 %a) { ; X86-NEXT: addl %esi, %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, 9223372036854775807 diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll index 2ad011e88e0d..7975b318eff5 100644 --- a/test/CodeGen/X86/inline-asm-A-constraint.ll +++ b/test/CodeGen/X86/inline-asm-A-constraint.ll @@ -19,8 +19,7 @@ entry: %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1 ret { i64, i64 } %.fca.1.insert } -; CHECK: lock -; CHECK-NEXT: cmpxchg16b +; CHECK: lock cmpxchg16b attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/test/CodeGen/X86/lea-opt-cse1.ll b/test/CodeGen/X86/lea-opt-cse1.ll index 05b47690e819..4c9ec3e0d7a3 100644 --- a/test/CodeGen/X86/lea-opt-cse1.ll +++ b/test/CodeGen/X86/lea-opt-cse1.ll @@ -30,6 +30,7 @@ define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr { ; X86-NEXT: leal 1(%edx,%ecx), %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 diff --git a/test/CodeGen/X86/lea-opt-cse2.ll b/test/CodeGen/X86/lea-opt-cse2.ll index 865dd49a6e1f..cee6f6792cb4 100644 --- a/test/CodeGen/X86/lea-opt-cse2.ll +++ b/test/CodeGen/X86/lea-opt-cse2.ll @@ -46,7 +46,9 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X86-NEXT: leal 1(%esi,%edx), %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: br label %loop diff --git a/test/CodeGen/X86/lea-opt-cse3.ll b/test/CodeGen/X86/lea-opt-cse3.ll index 87949b40d487..ed3aff980362 100644 --- a/test/CodeGen/X86/lea-opt-cse3.ll +++ b/test/CodeGen/X86/lea-opt-cse3.ll @@ -91,6 +91,7 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) 
local_unnamed_addr #0 { ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB2_2: # %exit ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %mul = shl i32 %b, 2 @@ -143,6 +144,7 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: # %exit ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %mul = shl i32 %b, 1 diff --git a/test/CodeGen/X86/lea-opt-cse4.ll b/test/CodeGen/X86/lea-opt-cse4.ll index 31f31a73d44e..d068180c39cb 100644 --- a/test/CodeGen/X86/lea-opt-cse4.ll +++ b/test/CodeGen/X86/lea-opt-cse4.ll @@ -36,6 +36,7 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X86-NEXT: leal 1(%ecx,%edx), %ecx ; X86-NEXT: movl %ecx, 16(%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 @@ -110,7 +111,9 @@ define void @foo_loop(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl %edx, 16(%eax) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: br label %loop diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index ca4cfa5b8052..7dff2c20d5af 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -117,9 +117,13 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) { ; CHECK-NEXT: movl %esi, 4(%eax) ; CHECK-NEXT: movl %edi, (%eax) ; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: popl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl $4 %shl = shl <2 x i64> %A, %B ret <2 x i64> %shl @@ -160,6 +164,7 @@ define i32 @test6() { ; CHECK-NEXT: .LBB5_4: # %if.then ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 ; CHECK-NEXT: retl %x = alloca i32, align 4 %t = alloca i64, align 8 diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll index b838065beea5..170f73593f60 100644 --- a/test/CodeGen/X86/live-out-reg-info.ll +++ b/test/CodeGen/X86/live-out-reg-info.ll @@ -18,6 +18,7 @@ define void @foo(i32 %a) { ; CHECK-NEXT: callq qux ; CHECK-NEXT: .LBB0_2: # %false ; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %t0 = lshr i32 %a, 23 br label %next diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll index d1f5f41ac7bf..d46efc4b5eca 100644 --- a/test/CodeGen/X86/load-combine.ll +++ b/test/CodeGen/X86/load-combine.ll @@ -376,6 +376,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) { ; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_bswap_uses: @@ -496,6 +497,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) { ; CHECK-NEXT: movzbl 3(%ecx), %eax ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between: diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 8983c3acb53d..207175aae1a1 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll 
+++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -1057,9 +1057,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; SKX: # BB#0: ; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def> ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 -; SKX-NEXT: kshiftlb $6, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k1 +; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -1068,9 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; SKX_32: # BB#0: ; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 -; SKX_32-NEXT: kshiftlb $6, %k0, %k0 -; SKX_32-NEXT: kshiftrb $6, %k0, %k1 +; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) @@ -1105,9 +1101,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; SKX: # BB#0: ; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def> ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 -; SKX-NEXT: kshiftlb $6, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k1 +; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX-NEXT: vzeroupper @@ -1117,9 +1111,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; SKX_32: # BB#0: ; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def> ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 -; SKX_32-NEXT: kshiftlb $6, %k0, %k0 -; SKX_32-NEXT: kshiftrb $6, %k0, %k1 +; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} ; SKX_32-NEXT: vzeroupper @@ -1165,9 +1157,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; SKX: # BB#0: ; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0 -; SKX-NEXT: kshiftlb $6, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k1 +; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} ; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq @@ -1176,9 +1166,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl ; SKX_32: # BB#0: ; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0 -; SKX_32-NEXT: kshiftlb $6, %k0, %k0 -; SKX_32-NEXT: kshiftrb $6, %k0, %k1 +; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1} ; SKX_32-NEXT: vmovaps %xmm2, %xmm0 @@ -1702,6 +1690,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_gather_16i64: @@ -1736,6 +1725,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: .cfi_def_cfa %esp, 4 ; SKX_32-NEXT: retl %res = call <16 x i64> 
@llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res @@ -1819,6 +1809,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_32-NEXT: vmovapd %zmm2, %zmm0 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_gather_16f64: @@ -1853,6 +1844,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; SKX_32-NEXT: vmovapd %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: .cfi_def_cfa %esp, 4 ; SKX_32-NEXT: retl %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res @@ -1934,6 +1926,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1967,6 +1960,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> % ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: .cfi_def_cfa %esp, 4 ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) @@ -2050,6 +2044,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -2083,6 +2078,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: .cfi_def_cfa %esp, 4 ; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) @@ -2127,6 +2123,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_pr28312: @@ -2154,6 +2151,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: .cfi_def_cfa %esp, 4 ; SKX_32-NEXT: retl %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll index 3e257f5fd852..f43e3f6f56ec 100644 --- a/test/CodeGen/X86/masked_memop.ll +++ b/test/CodeGen/X86/masked_memop.ll @@ -285,9 +285,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { ; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vptestmq %zmm0, 
%zmm0, %k1 ; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512F-NEXT: retq @@ -327,9 +325,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) { ; AVX512F: ## BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512F-NEXT: retq @@ -369,9 +365,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) { ; AVX512F: ## BB#0: ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; AVX512F-NEXT: retq diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll index 77d9fa69182b..3f5eeba7055c 100644 --- a/test/CodeGen/X86/memcmp-optsize.ll +++ b/test/CodeGen/X86/memcmp-optsize.ll @@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB5_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB5_3 -; X86-NEXT: .LBB5_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB5_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB5_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB5_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB5_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB5_3 -; X64-NEXT: .LBB5_1: # %res_block +; X64-NEXT: .LBB5_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB5_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB10_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB10_2 +; X86-NEXT: # BB#1: # %loadbb1 
+; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB10_3 -; X86-NEXT: .LBB10_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB10_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB10_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB10_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB10_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB10_3 -; X64-NEXT: .LBB10_1: # %res_block +; X64-NEXT: .LBB10_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB10_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx @@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_1: # %res_block +; X86-NEXT: .LBB11_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al @@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: incl %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: incl %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { define i1 @length8_eq_const(i8* %X) nounwind optsize { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534 ; X86-NEXT: je .LBB13_3 -; X86-NEXT: .LBB13_1: # %res_block +; 
X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: incl %eax ; X86-NEXT: .LBB13_3: # %endblock @@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB14_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB14_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB14_3 -; X64-NEXT: .LBB14_1: # %res_block +; X64-NEXT: .LBB14_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB14_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB15_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB15_1: # %res_block +; X64-NEXT: je .LBB15_3 +; X64-NEXT: .LBB15_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: je .LBB16_3 +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB20_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB20_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB20_3 -; X64-SSE2-NEXT: .LBB20_1: # %res_block +; X64-SSE2-NEXT: .LBB20_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB20_3: # %endblock ; X64-SSE2-NEXT: testl 
%eax, %eax @@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB20_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB20_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: movq 16(%rdi), %rcx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX2-NEXT: je .LBB20_3 -; X64-AVX2-NEXT: .LBB20_1: # %res_block +; X64-AVX2-NEXT: .LBB20_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB20_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB21_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB21_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB21_3 -; X64-SSE2-NEXT: .LBB21_1: # %res_block +; X64-SSE2-NEXT: .LBB21_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB21_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length24_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: jne .LBB21_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB21_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX2-NEXT: je .LBB21_3 -; X64-AVX2-NEXT: .LBB21_1: # %res_block +; X64-AVX2-NEXT: .LBB21_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB21_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB23_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB23_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB23_3 -; X86-SSE2-NEXT: .LBB23_1: # 
%res_block +; X86-SSE2-NEXT: .LBB23_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB23_3: # %endblock @@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-NOSSE-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB24_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB24_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB24_3 -; X86-SSE2-NEXT: .LBB24_1: # %res_block +; X86-SSE2-NEXT: .LBB24_2: # %res_block ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: incl %eax ; X86-SSE2-NEXT: .LBB24_3: # %endblock @@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB24_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB24_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB24_3 -; X64-SSE2-NEXT: .LBB24_1: # %res_block +; X64-SSE2-NEXT: .LBB24_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB24_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB26_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB26_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: 
vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB26_3 -; X64-AVX2-NEXT: .LBB26_1: # %res_block +; X64-AVX2-NEXT: .LBB26_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB26_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { ; X64-SSE2-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB27_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB27_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB27_3 -; X64-AVX2-NEXT: .LBB27_1: # %res_block +; X64-AVX2-NEXT: .LBB27_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB27_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll index 393e4c42d8b9..84fd45b0a08c 100644 --- a/test/CodeGen/X86/memcmp.ll +++ b/test/CodeGen/X86/memcmp.ll @@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { define i1 @length3_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length3_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: cmpw (%ecx), %dx -; X86-NEXT: jne .LBB7_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 2(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 2(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: cmpw (%eax), %dx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 2(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 2(%eax), %dl ; X86-NEXT: je .LBB7_3 -; X86-NEXT: .LBB7_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB7_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB7_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length3_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: cmpw (%rsi), %ax -; X64-NEXT: jne .LBB7_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB7_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 2(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 2(%rsi), %cl ; X64-NEXT: je .LBB7_3 -; X64-NEXT: .LBB7_1: # %res_block +; X64-NEXT: .LBB7_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB7_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { define i1 @length5_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length5_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB12_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movb 4(%eax), %dl -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpb 4(%ecx), %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: 
cmpl (%eax), %edx +; X86-NEXT: jne .LBB12_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movb 4(%ecx), %dl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpb 4(%eax), %dl ; X86-NEXT: je .LBB12_3 -; X86-NEXT: .LBB12_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB12_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB12_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: length5_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: cmpl (%rsi), %eax -; X64-NEXT: jne .LBB12_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB12_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movb 4(%rdi), %cl ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpb 4(%rsi), %cl ; X64-NEXT: je .LBB12_3 -; X64-NEXT: .LBB12_1: # %res_block +; X64-NEXT: .LBB12_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB12_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB13_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %ecx ; X86-NEXT: movl 4(%eax), %edx ; X86-NEXT: bswapl %ecx ; X86-NEXT: bswapl %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB13_1 -; X86-NEXT: # BB#3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB13_1: # %res_block +; X86-NEXT: je .LBB13_3 +; X86-NEXT: .LBB13_2: # %res_block ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl %edx, %ecx ; X86-NEXT: setae %al ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB13_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { define i1 @length8_eq(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8_eq: -; X86: # BB#0: # %loadbb -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: cmpl (%ecx), %edx -; X86-NEXT: jne .LBB14_1 -; X86-NEXT: # BB#2: # %loadbb1 -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl 4(%ecx), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB14_2 +; X86-NEXT: # BB#1: # %loadbb1 +; X86-NEXT: movl 4(%ecx), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl 4(%eax), %edx ; X86-NEXT: je .LBB14_3 -; X86-NEXT: .LBB14_1: # %res_block -; X86-NEXT: movl $1, %eax +; X86-NEXT: .LBB14_2: # %res_block +; X86-NEXT: movl $1, %ecx ; X86-NEXT: .LBB14_3: # %endblock -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sete %al ; X86-NEXT: retl ; @@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { define i1 @length8_eq_const(i8* %X) nounwind { ; X86-LABEL: length8_eq_const: -; X86: # BB#0: # %loadbb +; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130 -; X86-NEXT: jne .LBB15_1 -; X86-NEXT: # BB#2: # %loadbb1 +; X86-NEXT: jne .LBB15_2 +; X86-NEXT: # BB#1: # %loadbb1 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $926299444, 4(%ecx) # 
imm = 0x37363534 ; X86-NEXT: je .LBB15_3 -; X86-NEXT: .LBB15_1: # %res_block +; X86-NEXT: .LBB15_2: # %res_block ; X86-NEXT: movl $1, %eax ; X86-NEXT: .LBB15_3: # %endblock ; X86-NEXT: testl %eax, %eax @@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12_eq: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax -; X64-NEXT: jne .LBB16_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB16_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl 8(%rsi), %ecx ; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_1: # %res_block +; X64-NEXT: .LBB16_2: # %res_block ; X64-NEXT: movl $1, %eax ; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: testl %eax, %eax @@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length12: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB17_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movl 8(%rdi), %ecx ; X64-NEXT: movl 8(%rsi), %edx ; X64-NEXT: bswapl %ecx ; X64-NEXT: bswapl %edx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %res_block +; X64-NEXT: je .LBB17_3 +; X64-NEXT: .LBB17_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB17_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-NEXT: retl ; ; X64-LABEL: length16: -; X64: # BB#0: # %loadbb +; X64: # BB#0: ; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: movq (%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#2: # %loadbb1 +; X64-NEXT: jne .LBB18_2 +; X64-NEXT: # BB#1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rcx ; X64-NEXT: movq 8(%rsi), %rdx ; X64-NEXT: bswapq %rcx ; X64-NEXT: bswapq %rdx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # BB#3: # %endblock -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %res_block +; X64-NEXT: je .LBB18_3 +; X64-NEXT: .LBB18_2: # %res_block ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq %rdx, %rcx ; X64-NEXT: setae %al ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: .LBB18_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB22_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB22_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movq 16(%rdi), %rcx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx ; X64-SSE2-NEXT: je .LBB22_3 -; X64-SSE2-NEXT: .LBB22_1: # %res_block +; X64-SSE2-NEXT: .LBB22_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; 
X64-SSE2-NEXT: .LBB22_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB22_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB22_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: movq 16(%rdi), %rcx ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: cmpq 16(%rsi), %rcx ; X64-AVX-NEXT: je .LBB22_3 -; X64-AVX-NEXT: .LBB22_1: # %res_block +; X64-AVX-NEXT: .LBB22_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB22_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X86-NEXT: retl ; ; X64-SSE2-LABEL: length24_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB23_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB23_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi) ; X64-SSE2-NEXT: je .LBB23_3 -; X64-SSE2-NEXT: .LBB23_1: # %res_block +; X64-SSE2-NEXT: .LBB23_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB23_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: length24_eq_const: -; X64-AVX: # BB#0: # %loadbb +; X64-AVX: # BB#0: ; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX-NEXT: jne .LBB23_1 -; X64-AVX-NEXT: # BB#2: # %loadbb1 +; X64-AVX-NEXT: jne .LBB23_2 +; X64-AVX-NEXT: # BB#1: # %loadbb1 ; X64-AVX-NEXT: xorl %eax, %eax ; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736 ; X64-AVX-NEXT: cmpq %rcx, 16(%rdi) ; X64-AVX-NEXT: je .LBB23_3 -; X64-AVX-NEXT: .LBB23_1: # %res_block +; X64-AVX-NEXT: .LBB23_2: # %res_block ; X64-AVX-NEXT: movl $1, %eax ; X64-AVX-NEXT: .LBB23_3: # %endblock ; X64-AVX-NEXT: testl %eax, %eax @@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 @@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %edx ; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB25_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB25_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB25_3 -; X86-SSE2-NEXT: .LBB25_1: # %res_block +; X86-SSE2-NEXT: 
.LBB25_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB25_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu (%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB25_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB25_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 @@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB25_3 -; X64-SSE2-NEXT: .LBB25_1: # %res_block +; X64-SSE2-NEXT: .LBB25_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB25_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB25_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB25_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB25_3 -; X64-AVX1-NEXT: .LBB25_1: # %res_block +; X64-AVX1-NEXT: .LBB25_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB25_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE1-NEXT: retl ; ; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # BB#0: # %loadbb +; X86-SSE2: # BB#0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movdqu (%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; X86-SSE2-NEXT: jne .LBB26_1 -; X86-SSE2-NEXT: # BB#2: # %loadbb1 +; X86-SSE2-NEXT: jne .LBB26_2 +; X86-SSE2-NEXT: # BB#1: # %loadbb1 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X86-SSE2-NEXT: xorl %eax, %eax ; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X86-SSE2-NEXT: je .LBB26_3 -; X86-SSE2-NEXT: .LBB26_1: # %res_block +; X86-SSE2-NEXT: .LBB26_2: # %res_block ; X86-SSE2-NEXT: movl $1, %eax ; X86-SSE2-NEXT: .LBB26_3: # %endblock ; X86-SSE2-NEXT: testl %eax, %eax @@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: length32_eq_const: -; X64-SSE2: # BB#0: # %loadbb +; X64-SSE2: # BB#0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: jne .LBB26_1 -; X64-SSE2-NEXT: # BB#2: # %loadbb1 +; X64-SSE2-NEXT: jne .LBB26_2 +; X64-SSE2-NEXT: # BB#1: # %loadbb1 ; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0 ; X64-SSE2-NEXT: 
pcmpeqb {{.*}}(%rip), %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx ; X64-SSE2-NEXT: xorl %eax, %eax ; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-SSE2-NEXT: je .LBB26_3 -; X64-SSE2-NEXT: .LBB26_1: # %res_block +; X64-SSE2-NEXT: .LBB26_2: # %res_block ; X64-SSE2-NEXT: movl $1, %eax ; X64-SSE2-NEXT: .LBB26_3: # %endblock ; X64-SSE2-NEXT: testl %eax, %eax @@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind { ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: length32_eq_const: -; X64-AVX1: # BB#0: # %loadbb +; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax ; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X64-AVX1-NEXT: jne .LBB26_1 -; X64-AVX1-NEXT: # BB#2: # %loadbb1 +; X64-AVX1-NEXT: jne .LBB26_2 +; X64-AVX1-NEXT: # BB#1: # %loadbb1 ; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 ; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; X64-AVX1-NEXT: je .LBB26_3 -; X64-AVX1-NEXT: .LBB26_1: # %res_block +; X64-AVX1-NEXT: .LBB26_2: # %res_block ; X64-AVX1-NEXT: movl $1, %eax ; X64-AVX1-NEXT: .LBB26_3: # %endblock ; X64-AVX1-NEXT: testl %eax, %eax @@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB28_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB28_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB28_3 -; X64-AVX2-NEXT: .LBB28_1: # %res_block +; X64-AVX2-NEXT: .LBB28_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB28_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax @@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: length64_eq_const: -; X64-AVX2: # BB#0: # %loadbb +; X64-AVX2: # BB#0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax ; X64-AVX2-NEXT: cmpl $-1, %eax -; X64-AVX2-NEXT: jne .LBB29_1 -; X64-AVX2-NEXT: # BB#2: # %loadbb1 +; X64-AVX2-NEXT: jne .LBB29_2 +; X64-AVX2-NEXT: # BB#1: # %loadbb1 ; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0 ; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx ; X64-AVX2-NEXT: xorl %eax, %eax ; X64-AVX2-NEXT: cmpl $-1, %ecx ; X64-AVX2-NEXT: je .LBB29_3 -; X64-AVX2-NEXT: .LBB29_1: # %res_block +; X64-AVX2-NEXT: .LBB29_2: # %res_block ; X64-AVX2-NEXT: movl $1, %eax ; X64-AVX2-NEXT: .LBB29_3: # %endblock ; X64-AVX2-NEXT: testl %eax, %eax diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll index f0a957c9417c..98e09377ddb7 100644 --- a/test/CodeGen/X86/memset-nonzero.ll +++ b/test/CodeGen/X86/memset-nonzero.ll @@ -148,6 +148,7 @@ define void @memset_256_nonzero_bytes(i8* %x) { ; SSE-NEXT: movl $256, %edx # imm = 0x100 ; SSE-NEXT: callq memset ; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 ; SSE-NEXT: retq ; ; SSE2FAST-LABEL: memset_256_nonzero_bytes: diff --git 
a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll index e414f5554deb..b909b7c403bb 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -72,7 +72,9 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp { ; X32-SSE1-NEXT: movl %esi, 4(%eax) ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_2i64_i64_12: @@ -384,6 +386,7 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp { ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movl %ecx, 12(%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_4i32_i32_23u5: @@ -435,7 +438,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movl %ecx, 12(%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2: @@ -490,7 +495,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movl %ecx, 12(%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3: @@ -649,7 +656,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ; X32-SSE1-NEXT: movl $0, 12(%eax) ; X32-SSE1-NEXT: movl $0, 8(%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4: @@ -701,7 +710,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ; X32-SSE1-NEXT: movl $0, 12(%eax) ; X32-SSE1-NEXT: movl $0, 8(%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5: @@ -751,7 +762,9 @@ define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl %esi, 6(%eax) ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9: @@ -897,9 +910,13 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noin ; X32-SSE1-NEXT: movl %esi, 3(%eax) ; X32-SSE1-NEXT: movw %bp, (%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 16 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 ; X32-SSE1-NEXT: popl %ebx +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %ebp +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF: @@ -1129,7 +1146,9 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin ; X32-SSE1-NEXT: movl %esi, 4(%eax) ; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: 
.cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: popl %edi +; X32-SSE1-NEXT: .cfi_def_cfa_offset 4 ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_2i64_i64_12_volatile: diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll index 051c8a710c85..ddcc383b65e4 100644 --- a/test/CodeGen/X86/movtopush.ll +++ b/test/CodeGen/X86/movtopush.ll @@ -382,8 +382,10 @@ entry: ; LINUX: pushl $1 ; LINUX: .cfi_adjust_cfa_offset 4 ; LINUX: calll good -; LINUX: addl $28, %esp +; LINUX: addl $16, %esp ; LINUX: .cfi_adjust_cfa_offset -16 +; LINUX: addl $12, %esp +; LINUX: .cfi_def_cfa_offset 4 ; LINUX-NOT: add ; LINUX: retl define void @pr27140() optsize { diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll index 011b63ce7269..f778397f889a 100644 --- a/test/CodeGen/X86/mul-constant-result.ll +++ b/test/CodeGen/X86/mul-constant-result.ll @@ -34,84 +34,116 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: .LBB0_6: ; X86-NEXT: addl %eax, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_39: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB0_40: ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_7: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_8: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: shll $2, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_9: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_10: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: addl %eax, %eax ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_11: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (,%eax,8), %ecx ; X86-NEXT: jmp .LBB0_12 ; X86-NEXT: .LBB0_13: ; X86-NEXT: shll $3, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_14: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_15: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: addl %eax, %eax ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_16: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_17: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: shll $2, %eax ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_18: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_19: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: jmp .LBB0_20 ; X86-NEXT: .LBB0_21: ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_22: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: shll $4, %eax ; X86-NEXT: popl %esi +; 
X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_23: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shll $4, %ecx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_24: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: addl %eax, %eax ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_25: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: shll $2, %ecx ; X86-NEXT: jmp .LBB0_12 @@ -119,20 +151,26 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: shll $2, %eax ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_27: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_28: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %ecx ; X86-NEXT: .LBB0_20: ; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_29: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,2), %ecx ; X86-NEXT: shll $3, %ecx ; X86-NEXT: jmp .LBB0_12 @@ -140,13 +178,17 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: shll $3, %eax ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_31: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_32: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: jmp .LBB0_12 @@ -154,21 +196,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_34: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_35: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_36: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: shll $5, %ecx ; X86-NEXT: subl %eax, %ecx @@ -180,10 +228,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 { ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; X86-NEXT: .LBB0_38: +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: shll $5, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-HSW-LABEL: mult: @@ -857,8 +908,11 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-HSW-LABEL: foo: 
@@ -1072,10 +1126,15 @@ define i32 @foo() local_unnamed_addr #0 { ; X64-HSW-NEXT: negl %ecx ; X64-HSW-NEXT: movl %ecx, %eax ; X64-HSW-NEXT: addq $8, %rsp +; X64-HSW-NEXT: .cfi_def_cfa_offset 40 ; X64-HSW-NEXT: popq %rbx +; X64-HSW-NEXT: .cfi_def_cfa_offset 32 ; X64-HSW-NEXT: popq %r14 +; X64-HSW-NEXT: .cfi_def_cfa_offset 24 ; X64-HSW-NEXT: popq %r15 +; X64-HSW-NEXT: .cfi_def_cfa_offset 16 ; X64-HSW-NEXT: popq %rbp +; X64-HSW-NEXT: .cfi_def_cfa_offset 8 ; X64-HSW-NEXT: retq %1 = tail call i32 @mult(i32 1, i32 0) %2 = icmp ne i32 %1, 1 diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll index 0a48ae761ec6..1e05b95dda06 100644 --- a/test/CodeGen/X86/mul-i256.ll +++ b/test/CodeGen/X86/mul-i256.ll @@ -349,10 +349,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 { ; X32-NEXT: movl %eax, 24(%ecx) ; X32-NEXT: movl %edx, 28(%ecx) ; X32-NEXT: addl $88, %esp +; X32-NEXT: .cfi_def_cfa_offset 20 ; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 ; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 ; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl ; ; X64-LABEL: test: @@ -421,8 +426,11 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 { ; X64-NEXT: movq %rax, 16(%r9) ; X64-NEXT: movq %rdx, 24(%r9) ; X64-NEXT: popq %rbx +; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: popq %r15 +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: %av = load i256, i256* %a diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll index 70a6173a19ff..0c11f17d8d1d 100644 --- a/test/CodeGen/X86/mul128.ll +++ b/test/CodeGen/X86/mul128.ll @@ -86,10 +86,15 @@ define i128 @foo(i128 %t, i128 %u) { ; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 %k = mul i128 %t, %u ret i128 %k diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll new file mode 100644 index 000000000000..d6383c2d7d14 --- /dev/null +++ b/test/CodeGen/X86/no-plt.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \ +; RUN: | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \ +; RUN: | FileCheck -check-prefix=X64 %s + +define i32 @main() #0 { +; X64: callq *_Z3foov@GOTPCREL(%rip) +; X64: callq _Z3barv +; X64: callq _Z3bazv + +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call1 = call i32 @_Z3foov() + %call2 = call i32 @_Z3barv() + %call3 = call i32 @_Z3bazv() + ret i32 0 +} + +; Function Attrs: nonlazybind +declare i32 @_Z3foov() #1 + +declare i32 @_Z3barv() #2 + +; Function Attrs: nonlazybind +declare hidden i32 @_Z3bazv() #3 + + +attributes #1 = { nonlazybind } +attributes #3 = { nonlazybind } diff --git a/test/CodeGen/X86/pop-stack-cleanup-msvc.ll b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll new file mode 100644 index 000000000000..6330d3de72f1 --- /dev/null +++ b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "i686--windows-msvc" + +declare { i8*, i32 } @param2_ret2(i32, i32) +declare i32 @__CxxFrameHandler3(...) 
+ + +define void @test_reserved_regs() minsize optsize personality i32 (...)* @__CxxFrameHandler3 { +; CHECK-LABEL: test_reserved_regs: +; CHECK: calll _param2_ret2 +; CHECK-NEXT: popl %ecx +; CHECK-NEXT: popl %edi +start: + %s = alloca i64 + store i64 4, i64* %s + %0 = invoke { i8*, i32 } @param2_ret2(i32 0, i32 1) + to label %out unwind label %cleanup + +out: + ret void + +cleanup: + %cp = cleanuppad within none [] + cleanupret from %cp unwind to caller +} diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll index 74f6c5a361ff..54eb1fc7272b 100644 --- a/test/CodeGen/X86/pr21792.ll +++ b/test/CodeGen/X86/pr21792.ll @@ -28,6 +28,7 @@ define void @func(<4 x float> %vx) { ; CHECK-NEXT: leaq stuff+8(%r9), %r9 ; CHECK-NEXT: callq toto ; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: %tmp2 = bitcast <4 x float> %vx to <2 x i64> diff --git a/test/CodeGen/X86/pr29061.ll b/test/CodeGen/X86/pr29061.ll index 0cbe75f9ad5d..b62d082507d6 100644 --- a/test/CodeGen/X86/pr29061.ll +++ b/test/CodeGen/X86/pr29061.ll @@ -15,6 +15,7 @@ define void @t1(i8 signext %c) { ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: popl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: tail call void asm sideeffect "", "{di},~{dirflag},~{fpsr},~{flags}"(i8 %c) @@ -32,6 +33,7 @@ define void @t2(i8 signext %c) { ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: tail call void asm sideeffect "", "{si},~{dirflag},~{fpsr},~{flags}"(i8 %c) diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll index cc670eeb9788..d791936bd53e 100644 --- a/test/CodeGen/X86/pr29112.ll +++ b/test/CodeGen/X86/pr29112.ll @@ -65,6 +65,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17> diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll index 0254c0940b89..06007a3a4cfa 100644 --- a/test/CodeGen/X86/pr30430.ll +++ b/test/CodeGen/X86/pr30430.ll @@ -108,6 +108,7 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float ; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: retq entry: %__A.addr.i = alloca float, align 4 diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll index f48fef5f7fbc..02f3bb122913 100644 --- a/test/CodeGen/X86/pr32241.ll +++ b/test/CodeGen/X86/pr32241.ll @@ -50,7 +50,9 @@ define i32 @_Z3foov() { ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: addl $16, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: %aa = alloca i16, align 2 diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll index f6e254aaad06..5b6126fbc76c 100644 --- a/test/CodeGen/X86/pr32256.ll +++ b/test/CodeGen/X86/pr32256.ll @@ -27,6 +27,7 @@ define void @_Z1av() { ; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) ; CHECK-NEXT: addl $2, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl entry: %b = alloca i8, align 1 diff --git 
a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll index d6e6f6eb107d..67a0332ac537 100644 --- a/test/CodeGen/X86/pr32282.ll +++ b/test/CodeGen/X86/pr32282.ll @@ -43,6 +43,7 @@ define void @foo() { ; X86-NEXT: orl %eax, %edx ; X86-NEXT: setne {{[0-9]+}}(%esp) ; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: foo: diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index 11eb6968709b..59be67f05792 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -71,6 +71,7 @@ define void @foo() { ; 686-O0-NEXT: movzbl %al, %ecx ; 686-O0-NEXT: movl %ecx, (%esp) ; 686-O0-NEXT: addl $8, %esp +; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; ; 686-LABEL: foo: @@ -88,6 +89,7 @@ define void @foo() { ; 686-NEXT: setle %dl ; 686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 686-NEXT: addl $8, %esp +; 686-NEXT: .cfi_def_cfa_offset 4 ; 686-NEXT: retl entry: %a = alloca i8, align 1 @@ -232,10 +234,15 @@ define void @f1() { ; 686-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill ; 686-O0-NEXT: movl %esi, (%esp) # 4-byte Spill ; 686-O0-NEXT: addl $36, %esp +; 686-O0-NEXT: .cfi_def_cfa_offset 20 ; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 16 ; 686-O0-NEXT: popl %edi +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: popl %ebx +; 686-O0-NEXT: .cfi_def_cfa_offset 8 ; 686-O0-NEXT: popl %ebp +; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; ; 686-LABEL: f1: @@ -277,8 +284,11 @@ define void @f1() { ; 686-NEXT: movl %eax, _ZN8struct_210member_2_0E ; 686-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; 686-NEXT: addl $1, %esp +; 686-NEXT: .cfi_def_cfa_offset 12 ; 686-NEXT: popl %esi +; 686-NEXT: .cfi_def_cfa_offset 8 ; 686-NEXT: popl %edi +; 686-NEXT: .cfi_def_cfa_offset 4 ; 686-NEXT: retl entry: %a = alloca i8, align 1 @@ -392,8 +402,11 @@ define void @f2() { ; 686-O0-NEXT: movw %cx, %di ; 686-O0-NEXT: movw %di, (%eax) ; 686-O0-NEXT: addl $2, %esp +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 8 ; 686-O0-NEXT: popl %edi +; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; ; 686-LABEL: f2: @@ -414,6 +427,7 @@ define void @f2() { ; 686-NEXT: sete %dl ; 686-NEXT: movw %dx, (%eax) ; 686-NEXT: addl $2, %esp +; 686-NEXT: .cfi_def_cfa_offset 4 ; 686-NEXT: retl entry: %a = alloca i16, align 2 @@ -532,6 +546,7 @@ define void @f3() #0 { ; 686-O0-NEXT: popl %esi ; 686-O0-NEXT: popl %edi ; 686-O0-NEXT: popl %ebp +; 686-O0-NEXT: .cfi_def_cfa %esp, 4 ; 686-O0-NEXT: retl ; ; 686-LABEL: f3: @@ -558,6 +573,7 @@ define void @f3() #0 { ; 686-NEXT: movl %ecx, var_46 ; 686-NEXT: movl %ebp, %esp ; 686-NEXT: popl %ebp +; 686-NEXT: .cfi_def_cfa %esp, 4 ; 686-NEXT: retl entry: %a = alloca i64, align 8 diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll index f6bdade24c6c..9d1bb90e824e 100644 --- a/test/CodeGen/X86/pr32329.ll +++ b/test/CodeGen/X86/pr32329.ll @@ -57,9 +57,13 @@ define void @foo() local_unnamed_addr { ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: movb %bl, var_218 ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: foo: diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll index f6802887e9e4..2bdeca207312 100644 --- a/test/CodeGen/X86/pr32345.ll +++ b/test/CodeGen/X86/pr32345.ll @@ -84,6 +84,7 @@ define void @foo() { ; 
6860-NEXT: popl %edi ; 6860-NEXT: popl %ebx ; 6860-NEXT: popl %ebp +; 6860-NEXT: .cfi_def_cfa %esp, 4 ; 6860-NEXT: retl ; ; X64-LABEL: foo: @@ -127,6 +128,7 @@ define void @foo() { ; 686-NEXT: movb %dl, (%eax) ; 686-NEXT: movl %ebp, %esp ; 686-NEXT: popl %ebp +; 686-NEXT: .cfi_def_cfa %esp, 4 ; 686-NEXT: retl bb: %tmp = alloca i64, align 8 diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll index 67c0cb39f8c5..5b7d1373d340 100644 --- a/test/CodeGen/X86/pr32451.ll +++ b/test/CodeGen/X86/pr32451.ll @@ -30,7 +30,9 @@ define i8** @japi1_convert_690(i8**, i8***, i32) { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl %eax, (%ecx) ; CHECK-NEXT: addl $16, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: popl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl top: %3 = alloca i8*** diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll index 2049c5507c67..4d85722057f7 100644 --- a/test/CodeGen/X86/pr34088.ll +++ b/test/CodeGen/X86/pr34088.ll @@ -27,6 +27,7 @@ define i32 @pr34088() local_unnamed_addr { ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 ; CHECK-NEXT: retl entry: %foo = alloca %struct.Foo, align 4 diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll new file mode 100644 index 000000000000..129dbcacc95e --- /dev/null +++ b/test/CodeGen/X86/pr34653.ll @@ -0,0 +1,210 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s + +declare fastcc <38 x double> @test() + +define void @pr34653() { +; CHECK-LABEL: pr34653: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00 +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: callq test +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovaps %xmm3, %xmm4 +; CHECK-NEXT: vmovaps %xmm2, %xmm5 +; CHECK-NEXT: vmovaps %xmm5, %xmm6 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10 +; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11 +; CHECK-NEXT: vmovaps %xmm11, %xmm12 +; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13 +; CHECK-NEXT: vmovaps %xmm13, %xmm14 +; CHECK-NEXT: vmovaps %xmm10, %xmm15 +; CHECK-NEXT: vmovaps %xmm15, %xmm2 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: 
vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] +; CHECK-NEXT: # kill: %YMM10<def> %YMM10<kill> %ZMM10<kill> +; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm10, %xmm0 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM9<def> %YMM9<kill> %ZMM9<kill> +; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM8<def> %YMM8<kill> %ZMM8<kill> +; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: # kill: %YMM7<def> %YMM7<kill> %ZMM7<kill> +; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps %xmm7, %xmm0 +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} 
xmm9 = xmm9[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd 
{{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %v = call fastcc <38 x double> @test() + %v.0 = extractelement <38 x double> %v, i32 0 + ret void +} + diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll new file mode 100644 index 000000000000..a63bc2a08dde --- /dev/null +++ b/test/CodeGen/X86/pr34657.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s + +define <112 x i8> @pr34657() local_unnamed_addr { +; CHECK-LABEL: pr34657 +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vmovups (%rax), %xmm0 +; CHECK-NEXT: vmovups (%rax), %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovups (%rax), %zmm2 +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm2, (%rdi) +; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2 + ret <112 x i8> %wide.vec51 +} diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll index 73b3c7f835c5..ac3d45755108 100644 --- a/test/CodeGen/X86/pr9743.ll +++ b/test/CodeGen/X86/pr9743.ll @@ -11,4 +11,5 @@ define void @f() { ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: ret diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll index 7f438e306e4d..01fa12e87d01 100644 --- a/test/CodeGen/X86/push-cfi-debug.ll +++ b/test/CodeGen/X86/push-cfi-debug.ll @@ -23,8 +23,10 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0 ; CHECK: .cfi_adjust_cfa_offset 4 ; CHECK: calll stdfoo ; CHECK: .cfi_adjust_cfa_offset -8 -; CHECK: addl $20, %esp +; CHECK: addl $8, %esp ; CHECK: .cfi_adjust_cfa_offset -8 +; CHECK: addl $12, %esp +; CHECK: .cfi_def_cfa_offset 4 define void @test1() #0 !dbg !4 { entry: tail call void @foo(i32 1, i32 2) #1, !dbg !10 diff --git a/test/CodeGen/X86/push-cfi-obj.ll b/test/CodeGen/X86/push-cfi-obj.ll index 33291ec3318a..2c9ec3340270 100644 --- a/test/CodeGen/X86/push-cfi-obj.ll +++ 
b/test/CodeGen/X86/push-cfi-obj.ll @@ -12,7 +12,7 @@ ; LINUX-NEXT: ] ; LINUX-NEXT: Address: 0x0 ; LINUX-NEXT: Offset: 0x68 -; LINUX-NEXT: Size: 64 +; LINUX-NEXT: Size: 72 ; LINUX-NEXT: Link: 0 ; LINUX-NEXT: Info: 0 ; LINUX-NEXT: AddressAlignment: 4 @@ -22,8 +22,9 @@ ; LINUX-NEXT: SectionData ( ; LINUX-NEXT: 0000: 1C000000 00000000 017A504C 5200017C |.........zPLR..|| ; LINUX-NEXT: 0010: 08070000 00000000 1B0C0404 88010000 |................| -; LINUX-NEXT: 0020: 1C000000 24000000 00000000 1D000000 |....$...........| +; LINUX-NEXT: 0020: 24000000 24000000 00000000 1D000000 |$...$...........| ; LINUX-NEXT: 0030: 04000000 00410E08 8502420D 05432E10 |.....A....B..C..| +; LINUX-NEXT: 0040: 540C0404 410C0508 |T...A...| ; LINUX-NEXT: ) declare i32 @__gxx_personality_v0(...) @@ -35,7 +36,7 @@ entry: to label %continue unwind label %cleanup continue: ret void -cleanup: +cleanup: landingpad { i8*, i32 } cleanup ret void diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll index 91e579a8391b..44f8bf857c4c 100644 --- a/test/CodeGen/X86/push-cfi.ll +++ b/test/CodeGen/X86/push-cfi.ll @@ -74,8 +74,9 @@ cleanup: ; LINUX-NEXT: pushl $1 ; LINUX-NEXT: .cfi_adjust_cfa_offset 4 ; LINUX-NEXT: call -; LINUX-NEXT: addl $28, %esp +; LINUX-NEXT: addl $16, %esp ; LINUX: .cfi_adjust_cfa_offset -16 +; LINUX: addl $12, %esp ; DARWIN-NOT: .cfi_escape ; DARWIN-NOT: pushl define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll index 0e9d149373b1..296d165b3eb5 100644 --- a/test/CodeGen/X86/recip-fastmath.ll +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -144,14 +144,14 @@ define float @f32_one_step(float %x) #1 { ; ; KNL-LABEL: f32_one_step: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: f32_one_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50] ; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -257,7 +257,7 @@ define float @f32_two_step(float %x) #2 { ; ; KNL-LABEL: f32_two_step: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] @@ -268,7 +268,7 @@ define float @f32_two_step(float %x) #2 { ; ; SKX-LABEL: f32_two_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33] @@ -416,7 +416,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; ; SKX-LABEL: v4f32_one_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50] ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: 
[4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -533,7 +533,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; ; SKX-LABEL: v4f32_two_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33] @@ -691,7 +691,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; ; SKX-LABEL: v8f32_one_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50] ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] @@ -821,7 +821,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; ; SKX-LABEL: v8f32_two_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33] diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll index a263e9d3b652..f6eeeec57f11 100644 --- a/test/CodeGen/X86/recip-fastmath2.ll +++ b/test/CodeGen/X86/recip-fastmath2.ll @@ -56,13 +56,13 @@ define float @f32_no_step_2(float %x) #3 { ; ; KNL-LABEL: f32_no_step_2: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] ; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: f32_no_step_2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %div = fdiv fast float 1234.0, %x @@ -144,7 +144,7 @@ define float @f32_one_step_2(float %x) #1 { ; ; KNL-LABEL: f32_one_step_2: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] @@ -152,7 +152,7 @@ define float @f32_one_step_2(float %x) #1 { ; ; SKX-LABEL: f32_one_step_2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50] ; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] @@ -243,7 +243,7 @@ define float @f32_one_step_2_divs(float %x) #1 { ; ; KNL-LABEL: f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] ; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] @@ -252,7 +252,7 @@ define float @f32_one_step_2_divs(float %x) #1 { ; ; SKX-LABEL: f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: 
[4:1.00] ; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50] ; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] @@ -368,7 +368,7 @@ define float @f32_two_step_2(float %x) #2 { ; ; KNL-LABEL: f32_two_step_2: ; KNL: # BB#0: -; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] @@ -380,7 +380,7 @@ define float @f32_two_step_2(float %x) #2 { ; ; SKX-LABEL: f32_two_step_2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33] @@ -478,7 +478,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50] ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50] @@ -580,7 +580,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50] ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33] ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50] @@ -708,7 +708,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33] @@ -814,7 +814,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50] ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] @@ -925,7 +925,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50] ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33] ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50] @@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: 
vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33] @@ -1124,7 +1124,7 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] ; SKX-NEXT: retq # sched: [7:1.00] %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x ret <8 x float> %div @@ -1183,7 +1183,7 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00] +; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll index ef160f43b4aa..c66e518943a0 100644 --- a/test/CodeGen/X86/return-ext.ll +++ b/test/CodeGen/X86/return-ext.ll @@ -106,6 +106,7 @@ entry: ; CHECK: call ; CHECK-NEXT: movzbl ; CHECK-NEXT: {{pop|add}} +; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}} ; CHECK-NEXT: ret } @@ -120,6 +121,7 @@ entry: ; CHECK: call ; CHECK-NEXT: movzbl ; CHECK-NEXT: {{pop|add}} +; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}} ; CHECK-NEXT: ret } @@ -134,5 +136,6 @@ entry: ; CHECK: call ; CHECK-NEXT: movzwl ; CHECK-NEXT: {{pop|add}} +; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}} ; CHECK-NEXT: ret } diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll index bd2d3e544bda..a1feeb5999bb 100644 --- a/test/CodeGen/X86/rtm.ll +++ b/test/CodeGen/X86/rtm.ll @@ -75,6 +75,7 @@ define void @f2(i32 %x) nounwind uwtable { ; X64-NEXT: xabort $1 ; X64-NEXT: callq f1 ; X64-NEXT: popq %rax +; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: retq entry: %x.addr = alloca i32, align 4 diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll new file mode 100644 index 000000000000..5dc06e61cc6e --- /dev/null +++ b/test/CodeGen/X86/schedule-x86_32.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK 
--check-prefix=BTVER2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i8 @test_aaa(i8 %a0) optsize { +; GENERIC-LABEL: test_aaa: +; GENERIC: # BB#0: +; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al +; GENERIC-NEXT: #APP +; GENERIC-NEXT: aaa +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: retl +; +; ATOM-LABEL: test_aaa: +; ATOM: # BB#0: +; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00] +; ATOM-NEXT: #APP +; ATOM-NEXT: aaa +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: retl # sched: [79:39.50] +; +; SLM-LABEL: test_aaa: +; SLM: # BB#0: +; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00] +; SLM-NEXT: #APP +; SLM-NEXT: aaa +; SLM-NEXT: #NO_APP +; SLM-NEXT: retl # sched: [4:1.00] +; +; SANDY-LABEL: test_aaa: +; SANDY: # BB#0: +; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SANDY-NEXT: #APP +; SANDY-NEXT: aaa +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: retl # sched: [5:1.00] +; +; HASWELL-LABEL: test_aaa: +; HASWELL: # BB#0: +; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50] +; HASWELL-NEXT: #APP +; HASWELL-NEXT: aaa +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: retl # sched: [5:0.50] +; +; BROADWELL-LABEL: test_aaa: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: aaa +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: retl # sched: [6:0.50] +; +; SKYLAKE-LABEL: test_aaa: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: aaa +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: retl # sched: [6:0.50] +; +; SKX-LABEL: test_aaa: +; SKX: # BB#0: +; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKX-NEXT: #APP +; SKX-NEXT: aaa +; SKX-NEXT: #NO_APP +; SKX-NEXT: retl # sched: [6:0.50] +; +; BTVER2-LABEL: test_aaa: +; BTVER2: # BB#0: +; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00] +; BTVER2-NEXT: #APP +; BTVER2-NEXT: aaa +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: retl # sched: [4:1.00] +; +; ZNVER1-LABEL: test_aaa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50] +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: aaa +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: retl # sched: [1:0.50] + %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind + ret i8 %1 +} + +define i8 @test_aad(i16 %a0) optsize { +; GENERIC-LABEL: test_aad: +; GENERIC: # BB#0: +; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; GENERIC-NEXT: #APP +; GENERIC-NEXT: aad +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: retl +; +; ATOM-LABEL: test_aad: +; ATOM: # BB#0: +; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00] +; ATOM-NEXT: #APP +; ATOM-NEXT: aad +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: retl # sched: [79:39.50] +; +; SLM-LABEL: test_aad: +; SLM: # BB#0: +; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00] +; SLM-NEXT: #APP +; SLM-NEXT: aad +; SLM-NEXT: #NO_APP +; SLM-NEXT: retl # sched: [4:1.00] +; +; SANDY-LABEL: test_aad: +; SANDY: # BB#0: +; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50] +; SANDY-NEXT: #APP +; SANDY-NEXT: aad +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: retl # sched: [5:1.00] +; +; HASWELL-LABEL: test_aad: +; HASWELL: # BB#0: +; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:0.50] +; HASWELL-NEXT: #APP +; HASWELL-NEXT: aad +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: retl # sched: [5:0.50] +; +; BROADWELL-LABEL: test_aad: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50] +; BROADWELL-NEXT: 
#APP +; BROADWELL-NEXT: aad +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: retl # sched: [6:0.50] +; +; SKYLAKE-LABEL: test_aad: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50] +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: aad +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: retl # sched: [6:0.50] +; +; SKX-LABEL: test_aad: +; SKX: # BB#0: +; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50] +; SKX-NEXT: #APP +; SKX-NEXT: aad +; SKX-NEXT: #NO_APP +; SKX-NEXT: retl # sched: [6:0.50] +; +; BTVER2-LABEL: test_aad: +; BTVER2: # BB#0: +; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00] +; BTVER2-NEXT: #APP +; BTVER2-NEXT: aad +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: retl # sched: [4:1.00] +; +; ZNVER1-LABEL: test_aad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: aad +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: retl # sched: [1:0.50] + %1 = tail call i8 asm "aad", "=r,r"(i16 %a0) nounwind + ret i8 %1 +} + +define i16 @test_aam(i8 %a0) optsize { +; GENERIC-LABEL: test_aam: +; GENERIC: # BB#0: +; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al +; GENERIC-NEXT: #APP +; GENERIC-NEXT: aam +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: retl +; +; ATOM-LABEL: test_aam: +; ATOM: # BB#0: +; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00] +; ATOM-NEXT: #APP +; ATOM-NEXT: aam +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: retl # sched: [79:39.50] +; +; SLM-LABEL: test_aam: +; SLM: # BB#0: +; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00] +; SLM-NEXT: #APP +; SLM-NEXT: aam +; SLM-NEXT: #NO_APP +; SLM-NEXT: retl # sched: [4:1.00] +; +; SANDY-LABEL: test_aam: +; SANDY: # BB#0: +; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SANDY-NEXT: #APP +; SANDY-NEXT: aam +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: retl # sched: [5:1.00] +; +; HASWELL-LABEL: test_aam: +; HASWELL: # BB#0: +; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50] +; HASWELL-NEXT: #APP +; HASWELL-NEXT: aam +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: retl # sched: [5:0.50] +; +; BROADWELL-LABEL: test_aam: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: aam +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: retl # sched: [6:0.50] +; +; SKYLAKE-LABEL: test_aam: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: aam +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: retl # sched: [6:0.50] +; +; SKX-LABEL: test_aam: +; SKX: # BB#0: +; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKX-NEXT: #APP +; SKX-NEXT: aam +; SKX-NEXT: #NO_APP +; SKX-NEXT: retl # sched: [6:0.50] +; +; BTVER2-LABEL: test_aam: +; BTVER2: # BB#0: +; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00] +; BTVER2-NEXT: #APP +; BTVER2-NEXT: aam +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: retl # sched: [4:1.00] +; +; ZNVER1-LABEL: test_aam: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50] +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: aam +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: retl # sched: [1:0.50] + %1 = tail call i16 asm "aam", "=r,r"(i8 %a0) nounwind + ret i16 %1 +} + +define i8 @test_aas(i8 %a0) optsize { +; GENERIC-LABEL: test_aas: +; GENERIC: # BB#0: +; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al +; GENERIC-NEXT: #APP +; GENERIC-NEXT: aas +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: retl +; +; ATOM-LABEL: test_aas: +; ATOM: # BB#0: +; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00] +; 
ATOM-NEXT: #APP +; ATOM-NEXT: aas +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: retl # sched: [79:39.50] +; +; SLM-LABEL: test_aas: +; SLM: # BB#0: +; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00] +; SLM-NEXT: #APP +; SLM-NEXT: aas +; SLM-NEXT: #NO_APP +; SLM-NEXT: retl # sched: [4:1.00] +; +; SANDY-LABEL: test_aas: +; SANDY: # BB#0: +; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SANDY-NEXT: #APP +; SANDY-NEXT: aas +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: retl # sched: [5:1.00] +; +; HASWELL-LABEL: test_aas: +; HASWELL: # BB#0: +; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50] +; HASWELL-NEXT: #APP +; HASWELL-NEXT: aas +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: retl # sched: [5:0.50] +; +; BROADWELL-LABEL: test_aas: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: aas +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: retl # sched: [6:0.50] +; +; SKYLAKE-LABEL: test_aas: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: aas +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: retl # sched: [6:0.50] +; +; SKX-LABEL: test_aas: +; SKX: # BB#0: +; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50] +; SKX-NEXT: #APP +; SKX-NEXT: aas +; SKX-NEXT: #NO_APP +; SKX-NEXT: retl # sched: [6:0.50] +; +; BTVER2-LABEL: test_aas: +; BTVER2: # BB#0: +; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00] +; BTVER2-NEXT: #APP +; BTVER2-NEXT: aas +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: retl # sched: [4:1.00] +; +; ZNVER1-LABEL: test_aas: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50] +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: aas +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: retl # sched: [1:0.50] + %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind + ret i8 %1 +} diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll new file mode 100644 index 000000000000..1db8c8768bda --- /dev/null +++ b/test/CodeGen/X86/schedule-x86_64.ll @@ -0,0 +1,737 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 + +define i16 @test_bsf16(i16 %a0, i16* %a1) optsize { +; GENERIC-LABEL: test_bsf16: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsfw %di, %ax +; GENERIC-NEXT: bsfw (%rsi), %cx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsf16: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsfw %di, %ax +; ATOM-NEXT: bsfw (%rsi), %cx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsf16: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsfw %di, %ax +; SLM-NEXT: bsfw (%rsi), %cx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsf16: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsfw %di, %ax +; SANDY-NEXT: bsfw (%rsi), %cx +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsf16: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsfw %di, %ax +; HASWELL-NEXT: bsfw (%rsi), %cx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsf16: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsfw %di, %ax +; BROADWELL-NEXT: bsfw (%rsi), %cx +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsf16: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsfw %di, %ax +; SKYLAKE-NEXT: bsfw (%rsi), %cx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsf16: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsfw %di, %ax +; SKX-NEXT: bsfw (%rsi), %cx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsf16: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsfw %di, %ax +; BTVER2-NEXT: bsfw (%rsi), %cx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsf16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsfw %di, %ax +; ZNVER1-NEXT: bsfw (%rsi), %cx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i16, i16 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1) + %2 = extractvalue { i16, i16 } %1, 0 + %3 = extractvalue { i16, i16 } %1, 1 + %4 = or i16 %2, %3 + ret i16 %4 +} +define i32 @test_bsf32(i32 %a0, i32* %a1) optsize { +; GENERIC-LABEL: test_bsf32: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsfl 
%edi, %eax +; GENERIC-NEXT: bsfl (%rsi), %ecx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsf32: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsfl %edi, %eax +; ATOM-NEXT: bsfl (%rsi), %ecx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsf32: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsfl %edi, %eax +; SLM-NEXT: bsfl (%rsi), %ecx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsf32: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsfl %edi, %eax +; SANDY-NEXT: bsfl (%rsi), %ecx +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsf32: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsfl %edi, %eax +; HASWELL-NEXT: bsfl (%rsi), %ecx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsf32: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsfl %edi, %eax +; BROADWELL-NEXT: bsfl (%rsi), %ecx +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsf32: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsfl %edi, %eax +; SKYLAKE-NEXT: bsfl (%rsi), %ecx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsf32: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsfl %edi, %eax +; SKX-NEXT: bsfl (%rsi), %ecx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsf32: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsfl %edi, %eax +; BTVER2-NEXT: bsfl (%rsi), %ecx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsf32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsfl %edi, %eax +; ZNVER1-NEXT: bsfl (%rsi), %ecx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i32, i32 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1) + %2 = extractvalue { i32, i32 } %1, 0 + %3 = extractvalue { i32, i32 } %1, 1 + %4 = or i32 %2, %3 + ret i32 %4 +} +define i64 @test_bsf64(i64 %a0, i64* %a1) optsize { +; GENERIC-LABEL: test_bsf64: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsfq %rdi, %rax +; GENERIC-NEXT: bsfq (%rsi), %rcx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsf64: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsfq %rdi, %rax +; ATOM-NEXT: bsfq (%rsi), %rcx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsf64: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsfq %rdi, %rax +; SLM-NEXT: bsfq (%rsi), %rcx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsf64: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsfq %rdi, %rax +; SANDY-NEXT: bsfq (%rsi), %rcx +; SANDY-NEXT: #NO_APP 
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsf64: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsfq %rdi, %rax +; HASWELL-NEXT: bsfq (%rsi), %rcx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsf64: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsfq %rdi, %rax +; BROADWELL-NEXT: bsfq (%rsi), %rcx +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsf64: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsfq %rdi, %rax +; SKYLAKE-NEXT: bsfq (%rsi), %rcx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsf64: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsfq %rdi, %rax +; SKX-NEXT: bsfq (%rsi), %rcx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsf64: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsfq %rdi, %rax +; BTVER2-NEXT: bsfq (%rsi), %rcx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsf64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsfq %rdi, %rax +; ZNVER1-NEXT: bsfq (%rsi), %rcx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i64, i64 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1) + %2 = extractvalue { i64, i64 } %1, 0 + %3 = extractvalue { i64, i64 } %1, 1 + %4 = or i64 %2, %3 + ret i64 %4 +} + +define i16 @test_bsr16(i16 %a0, i16* %a1) optsize { +; GENERIC-LABEL: test_bsr16: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsrw %di, %ax +; GENERIC-NEXT: bsrw (%rsi), %cx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsr16: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsrw %di, %ax +; ATOM-NEXT: bsrw (%rsi), %cx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsr16: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsrw %di, %ax +; SLM-NEXT: bsrw (%rsi), %cx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsr16: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsrw %di, %ax +; SANDY-NEXT: bsrw (%rsi), %cx +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsr16: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsrw %di, %ax +; HASWELL-NEXT: bsrw (%rsi), %cx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsr16: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsrw %di, %ax +; BROADWELL-NEXT: bsrw (%rsi), %cx +; BROADWELL-NEXT: #NO_APP +; 
BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsr16: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsrw %di, %ax +; SKYLAKE-NEXT: bsrw (%rsi), %cx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsr16: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsrw %di, %ax +; SKX-NEXT: bsrw (%rsi), %cx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsr16: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsrw %di, %ax +; BTVER2-NEXT: bsrw (%rsi), %cx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsr16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsrw %di, %ax +; ZNVER1-NEXT: bsrw (%rsi), %cx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i16, i16 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1) + %2 = extractvalue { i16, i16 } %1, 0 + %3 = extractvalue { i16, i16 } %1, 1 + %4 = or i16 %2, %3 + ret i16 %4 +} +define i32 @test_bsr32(i32 %a0, i32* %a1) optsize { +; GENERIC-LABEL: test_bsr32: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsrl %edi, %eax +; GENERIC-NEXT: bsrl (%rsi), %ecx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsr32: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsrl %edi, %eax +; ATOM-NEXT: bsrl (%rsi), %ecx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsr32: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsrl %edi, %eax +; SLM-NEXT: bsrl (%rsi), %ecx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsr32: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsrl %edi, %eax +; SANDY-NEXT: bsrl (%rsi), %ecx +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsr32: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsrl %edi, %eax +; HASWELL-NEXT: bsrl (%rsi), %ecx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsr32: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsrl %edi, %eax +; BROADWELL-NEXT: bsrl (%rsi), %ecx +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsr32: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsrl %edi, %eax +; SKYLAKE-NEXT: bsrl (%rsi), %ecx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsr32: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsrl %edi, %eax +; SKX-NEXT: bsrl (%rsi), %ecx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orl %ecx, %eax 
# sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsr32: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsrl %edi, %eax +; BTVER2-NEXT: bsrl (%rsi), %ecx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsr32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsrl %edi, %eax +; ZNVER1-NEXT: bsrl (%rsi), %ecx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i32, i32 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1) + %2 = extractvalue { i32, i32 } %1, 0 + %3 = extractvalue { i32, i32 } %1, 1 + %4 = or i32 %2, %3 + ret i32 %4 +} +define i64 @test_bsr64(i64 %a0, i64* %a1) optsize { +; GENERIC-LABEL: test_bsr64: +; GENERIC: # BB#0: +; GENERIC-NEXT: #APP +; GENERIC-NEXT: bsrq %rdi, %rax +; GENERIC-NEXT: bsrq (%rsi), %rcx +; GENERIC-NEXT: #NO_APP +; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bsr64: +; ATOM: # BB#0: +; ATOM-NEXT: #APP +; ATOM-NEXT: bsrq %rdi, %rax +; ATOM-NEXT: bsrq (%rsi), %rcx +; ATOM-NEXT: #NO_APP +; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bsr64: +; SLM: # BB#0: +; SLM-NEXT: #APP +; SLM-NEXT: bsrq %rdi, %rax +; SLM-NEXT: bsrq (%rsi), %rcx +; SLM-NEXT: #NO_APP +; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bsr64: +; SANDY: # BB#0: +; SANDY-NEXT: #APP +; SANDY-NEXT: bsrq %rdi, %rax +; SANDY-NEXT: bsrq (%rsi), %rcx +; SANDY-NEXT: #NO_APP +; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bsr64: +; HASWELL: # BB#0: +; HASWELL-NEXT: #APP +; HASWELL-NEXT: bsrq %rdi, %rax +; HASWELL-NEXT: bsrq (%rsi), %rcx +; HASWELL-NEXT: #NO_APP +; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bsr64: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: #APP +; BROADWELL-NEXT: bsrq %rdi, %rax +; BROADWELL-NEXT: bsrq (%rsi), %rcx +; BROADWELL-NEXT: #NO_APP +; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bsr64: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: #APP +; SKYLAKE-NEXT: bsrq %rdi, %rax +; SKYLAKE-NEXT: bsrq (%rsi), %rcx +; SKYLAKE-NEXT: #NO_APP +; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bsr64: +; SKX: # BB#0: +; SKX-NEXT: #APP +; SKX-NEXT: bsrq %rdi, %rax +; SKX-NEXT: bsrq (%rsi), %rcx +; SKX-NEXT: #NO_APP +; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bsr64: +; BTVER2: # BB#0: +; BTVER2-NEXT: #APP +; BTVER2-NEXT: bsrq %rdi, %rax +; BTVER2-NEXT: bsrq (%rsi), %rcx +; BTVER2-NEXT: #NO_APP +; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bsr64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: #APP +; ZNVER1-NEXT: bsrq %rdi, %rax +; ZNVER1-NEXT: bsrq (%rsi), %rcx +; ZNVER1-NEXT: #NO_APP +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = call { i64, i64 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1) + %2 = extractvalue { i64, i64 } %1, 0 + %3 = extractvalue { i64, i64 } %1, 1 + %4 = or i64 
%2, %3 + ret i64 %4 +} + +define i32 @test_bswap32(i32 %a0) optsize { +; GENERIC-LABEL: test_bswap32: +; GENERIC: # BB#0: +; GENERIC-NEXT: bswapl %edi # sched: [2:1.00] +; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bswap32: +; ATOM: # BB#0: +; ATOM-NEXT: bswapl %edi # sched: [1:1.00] +; ATOM-NEXT: movl %edi, %eax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bswap32: +; SLM: # BB#0: +; SLM-NEXT: bswapl %edi # sched: [1:0.50] +; SLM-NEXT: movl %edi, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bswap32: +; SANDY: # BB#0: +; SANDY-NEXT: bswapl %edi # sched: [2:1.00] +; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bswap32: +; HASWELL: # BB#0: +; HASWELL-NEXT: bswapl %edi # sched: [2:0.50] +; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bswap32: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: bswapl %edi # sched: [2:0.50] +; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bswap32: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: bswapl %edi # sched: [2:0.50] +; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bswap32: +; SKX: # BB#0: +; SKX-NEXT: bswapl %edi # sched: [2:0.50] +; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; BTVER2-LABEL: test_bswap32: +; BTVER2: # BB#0: +; BTVER2-NEXT: bswapl %edi # sched: [1:0.50] +; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bswap32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bswapl %edi # sched: [1:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind + ret i32 %1 +} +define i64 @test_bswap64(i64 %a0) optsize { +; GENERIC-LABEL: test_bswap64: +; GENERIC: # BB#0: +; GENERIC-NEXT: bswapq %rdi # sched: [2:1.00] +; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; ATOM-LABEL: test_bswap64: +; ATOM: # BB#0: +; ATOM-NEXT: bswapq %rdi # sched: [1:1.00] +; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; ATOM-NEXT: retq # sched: [79:39.50] +; +; SLM-LABEL: test_bswap64: +; SLM: # BB#0: +; SLM-NEXT: bswapq %rdi # sched: [1:0.50] +; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_bswap64: +; SANDY: # BB#0: +; SANDY-NEXT: bswapq %rdi # sched: [2:1.00] +; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [1:1.00] +; +; HASWELL-LABEL: test_bswap64: +; HASWELL: # BB#0: +; HASWELL-NEXT: bswapq %rdi # sched: [2:0.50] +; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [2:1.00] +; +; BROADWELL-LABEL: test_bswap64: +; BROADWELL: # BB#0: +; BROADWELL-NEXT: bswapq %rdi # sched: [2:0.50] +; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] +; BROADWELL-NEXT: retq # sched: [7:1.00] +; +; SKYLAKE-LABEL: test_bswap64: +; SKYLAKE: # BB#0: +; SKYLAKE-NEXT: bswapq %rdi # sched: [2:0.50] +; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25] +; SKYLAKE-NEXT: retq # sched: [7:1.00] +; +; SKX-LABEL: test_bswap64: +; SKX: # BB#0: +; SKX-NEXT: bswapq %rdi # sched: [2:0.50] +; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; +; 
BTVER2-LABEL: test_bswap64: +; BTVER2: # BB#0: +; BTVER2-NEXT: bswapq %rdi # sched: [1:0.50] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bswap64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bswapq %rdi # sched: [1:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [1:0.50] + %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind + ret i64 %1 +} diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll index 795990e3c325..7ad8b6f1b9c7 100644 --- a/test/CodeGen/X86/select-mmx.ll +++ b/test/CodeGen/X86/select-mmx.ll @@ -48,6 +48,7 @@ define i64 @test47(i64 %arg) { ; I32-NEXT: movl {{[0-9]+}}(%esp), %edx ; I32-NEXT: movl %ebp, %esp ; I32-NEXT: popl %ebp +; I32-NEXT: .cfi_def_cfa %esp, 4 ; I32-NEXT: retl %cond = icmp eq i64 %arg, 0 %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx) @@ -100,6 +101,7 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; I32-NEXT: movl {{[0-9]+}}(%esp), %edx ; I32-NEXT: movl %ebp, %esp ; I32-NEXT: popl %ebp +; I32-NEXT: .cfi_def_cfa %esp, 4 ; I32-NEXT: retl %cond = icmp eq i64 %arg, 0 %xmmx = bitcast i64 %x to x86_mmx diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index 52225397ef0b..c3674639eab9 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -15,7 +15,6 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind { ; CHECK-NEXT: cmovneq %rdi, %rsi ; CHECK-NEXT: movl (%rsi), %eax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test1: ; MCU: # BB#0: @@ -45,7 +44,7 @@ define i32 @test2() nounwind { ; GENERIC-NEXT: callq _return_false ; GENERIC-NEXT: xorl %ecx, %ecx ; GENERIC-NEXT: testb $1, %al -; GENERIC-NEXT: movl $-480, %eax +; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20 ; GENERIC-NEXT: cmovnel %ecx, %eax ; GENERIC-NEXT: shll $3, %eax ; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000 @@ -55,14 +54,13 @@ define i32 @test2() nounwind { ; GENERIC-NEXT: popq %rcx ; GENERIC-NEXT: retq ; GENERIC-NEXT: LBB1_1: ## %bb90 -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test2: ; ATOM: ## BB#0: ## %entry ; ATOM-NEXT: pushq %rax ; ATOM-NEXT: callq _return_false ; ATOM-NEXT: xorl %ecx, %ecx -; ATOM-NEXT: movl $-480, %edx +; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20 ; ATOM-NEXT: testb $1, %al ; ATOM-NEXT: cmovnel %ecx, %edx ; ATOM-NEXT: shll $3, %edx @@ -73,17 +71,16 @@ define i32 @test2() nounwind { ; ATOM-NEXT: popq %rcx ; ATOM-NEXT: retq ; ATOM-NEXT: LBB1_1: ## %bb90 -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test2: ; MCU: # BB#0: # %entry ; MCU-NEXT: calll return_false -; MCU-NEXT: xorl %ecx, %ecx +; MCU-NEXT: xorl %ecx, %ecx ; MCU-NEXT: testb $1, %al ; MCU-NEXT: jne .LBB1_2 ; MCU-NEXT: # BB#1: # %entry ; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20 -; MCU-NEXT: .LBB1_2: +; MCU-NEXT: .LBB1_2: # %entry ; MCU-NEXT: shll $3, %ecx ; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000 ; MCU-NEXT: jge .LBB1_3 @@ -116,7 +113,6 @@ define float @test3(i32 %x) nounwind readnone { ; CHECK-NEXT: leaq {{.*}}(%rip), %rcx ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test3: ; MCU: # BB#0: # %entry @@ -140,7 +136,6 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly { ; CHECK-NEXT: seta %al ; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test4: ; MCU: # BB#0: # %entry @@ -175,7 +170,6 @@ define void 
@test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind { ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: movd %xmm0, (%rsi) ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test5: ; MCU: # BB#0: @@ -211,7 +205,6 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind { ; CHECK-NEXT: mulps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, (%rsi) ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test6: ; MCU: # BB#0: @@ -283,7 +276,6 @@ define x86_fp80 @test7(i32 %tmp8) nounwind { ; CHECK-NEXT: leaq {{.*}}(%rip), %rcx ; CHECK-NEXT: fldt (%rax,%rcx) ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test7: ; MCU: # BB#0: @@ -333,7 +325,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: movq %xmm1, 16(%rsi) ; GENERIC-NEXT: movdqa %xmm0, (%rsi) ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test8: ; ATOM: ## BB#0: @@ -366,7 +357,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movdqa %xmm0, (%rsi) ; ATOM-NEXT: movq %xmm1, 16(%rsi) ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test8: ; MCU: # BB#0: @@ -456,7 +446,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; GENERIC-NEXT: sbbq %rax, %rax ; GENERIC-NEXT: orq %rsi, %rax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test9: ; ATOM: ## BB#0: @@ -466,7 +455,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test9: ; MCU: # BB#0: @@ -493,7 +481,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; GENERIC-NEXT: sbbq %rax, %rax ; GENERIC-NEXT: orq %rsi, %rax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test9a: ; ATOM: ## BB#0: @@ -503,7 +490,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test9a: ; MCU: # BB#0: @@ -528,7 +514,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; GENERIC-NEXT: sbbq %rax, %rax ; GENERIC-NEXT: orq %rsi, %rax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test9b: ; ATOM: ## BB#0: @@ -538,7 +523,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test9b: ; MCU: # BB#0: @@ -566,7 +550,6 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; CHECK-NEXT: setne %al ; CHECK-NEXT: leaq -1(%rax,%rax), %rax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test10: ; MCU: # BB#0: @@ -592,7 +575,6 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; CHECK-NEXT: notq %rax ; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test11: ; MCU: # BB#0: @@ -619,7 +601,6 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone { ; CHECK-NEXT: notq %rax ; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test11a: ; MCU: # BB#0: @@ -649,7 +630,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone { ; GENERIC-NEXT: movq $-1, %rdi ; GENERIC-NEXT: cmovnoq %rax, %rdi ; GENERIC-NEXT: jmp __Znam ## TAILCALL -; GENERIC-NEXT: 
## -- End function ; ; ATOM-LABEL: test12: ; ATOM: ## BB#0: ## %entry @@ -659,7 +639,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone { ; ATOM-NEXT: movq $-1, %rdi ; ATOM-NEXT: cmovnoq %rax, %rdi ; ATOM-NEXT: jmp __Znam ## TAILCALL -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test12: ; MCU: # BB#0: # %entry @@ -710,7 +689,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind { ; GENERIC-NEXT: cmpl %esi, %edi ; GENERIC-NEXT: sbbl %eax, %eax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test13: ; ATOM: ## BB#0: @@ -721,7 +699,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test13: ; MCU: # BB#0: @@ -741,7 +718,6 @@ define i32 @test14(i32 %a, i32 %b) nounwind { ; CHECK-NEXT: setae %al ; CHECK-NEXT: negl %eax ; CHECK-NEXT: retq -; CHECK-NEXT: ## -- End function ; ; MCU-LABEL: test14: ; MCU: # BB#0: @@ -763,7 +739,6 @@ define i32 @test15(i32 %x) nounwind { ; GENERIC-NEXT: negl %edi ; GENERIC-NEXT: sbbl %eax, %eax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test15: ; ATOM: ## BB#0: ## %entry @@ -774,7 +749,6 @@ define i32 @test15(i32 %x) nounwind { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test15: ; MCU: # BB#0: # %entry @@ -826,7 +800,6 @@ define i16 @test17(i16 %x) nounwind { ; GENERIC-NEXT: sbbl %eax, %eax ; GENERIC-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test17: ; ATOM: ## BB#0: ## %entry @@ -838,7 +811,6 @@ define i16 @test17(i16 %x) nounwind { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test17: ; MCU: # BB#0: # %entry @@ -859,7 +831,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind { ; GENERIC-NEXT: cmovgel %edx, %esi ; GENERIC-NEXT: movl %esi, %eax ; GENERIC-NEXT: retq -; GENERIC-NEXT: ## -- End function ; ; ATOM-LABEL: test18: ; ATOM: ## BB#0: @@ -869,7 +840,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind { ; ATOM-NEXT: nop ; ATOM-NEXT: nop ; ATOM-NEXT: retq -; ATOM-NEXT: ## -- End function ; ; MCU-LABEL: test18: ; MCU: # BB#0: diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 20c77a4a5173..5ae2cc5f35c1 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -23,10 +23,9 @@ define <8 x i16> @pr25080(<8 x i32> %a) { ; ; KNL-32-LABEL: pr25080: ; KNL-32: # BB#0: # %entry -; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607] -; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; KNL-32-NEXT: vbroadcastss {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607] +; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0 ; KNL-32-NEXT: movb $15, %al ; KNL-32-NEXT: kmovw %eax, %k1 ; KNL-32-NEXT: korw %k1, %k0, %k1 @@ -90,6 +89,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) { ; KNL-32-NEXT: jne .LBB1_1 ; KNL-32-NEXT: # BB#2: # %for_exit600 ; KNL-32-NEXT: popl %esi +; KNL-32-NEXT: .cfi_def_cfa_offset 4 ; KNL-32-NEXT: retl allocas: br label %for_test11.preheader diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll index 79cf0f2c8f11..a2767205fe29 100644 --- a/test/CodeGen/X86/shrink_vmul.ll +++ 
b/test/CodeGen/X86/shrink_vmul.ll @@ -31,6 +31,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi8: @@ -89,6 +90,7 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_4xi8: @@ -148,6 +150,7 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) ; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_8xi8: @@ -220,6 +223,7 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) ; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_16xi8: @@ -288,6 +292,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi16: @@ -342,6 +347,7 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_4xi16: @@ -399,6 +405,7 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) ; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_8xi16: @@ -469,6 +476,7 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) ; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_16xi16: @@ -541,6 +549,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, ; X86-NEXT: psrad $16, %xmm0 ; X86-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi8_sext: @@ -606,6 +615,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; X86-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi8_sext_zext: @@ -666,6 +676,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi16_sext: @@ -733,6 +744,7 @@ define void @mul_2xi16_sext_zext(i8* 
nocapture readonly %a, i8* nocapture readon ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X86-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_2xi16_sext_zext: @@ -813,6 +825,7 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) ; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X64-LABEL: mul_16xi16_sext: diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll index f178e18a2596..ca74ee5732db 100644 --- a/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -401,15 +401,10 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) { ; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_rcp_ps: -; AVX2: ## BB#0: -; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_rcp_ps: -; SKX: ## BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; VCHECK-LABEL: test_x86_sse_rcp_ps: +; VCHECK: ## BB#0: +; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0] +; VCHECK-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -438,15 +433,10 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) { ; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; AVX2-LABEL: test_x86_sse_rsqrt_ps: -; AVX2: ## BB#0: -; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; SKX-LABEL: test_x86_sse_rsqrt_ps: -; SKX: ## BB#0: -; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0] -; SKX-NEXT: retl ## encoding: [0xc3] +; VCHECK-LABEL: test_x86_sse_rsqrt_ps: +; VCHECK: ## BB#0: +; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0] +; VCHECK-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -475,10 +465,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { ; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_sqrt_ps: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse_sqrt_ps: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse_sqrt_ps: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -491,10 +486,15 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) { ; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse_sqrt_ss: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; 
AVX2-LABEL: test_x86_sse_sqrt_ss: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse_sqrt_ss: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index b5c2bff4b8f9..d3c995197e83 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -2547,8 +2547,8 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { ; ; SKX-LABEL: test_rcpps: ; SKX: # BB#0: -; SKX-NEXT: vrcp14ps %xmm0, %xmm0 # sched: [4:1.00] -; SKX-NEXT: vrcp14ps (%rdi), %xmm1 # sched: [10:1.00] +; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00] +; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; @@ -2719,8 +2719,8 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { ; ; SKX-LABEL: test_rsqrtps: ; SKX: # BB#0: -; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00] -; SKX-NEXT: vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00] +; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00] +; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index d4047faad9bb..72c68c566380 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) { ; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse2_sqrt_pd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse2_sqrt_pd: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_sqrt_pd: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -1608,10 +1613,15 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) { ; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] ; SSE-NEXT: retl ## encoding: [0xc3] ; -; VCHECK-LABEL: test_x86_sse2_sqrt_sd: -; VCHECK: ## BB#0: -; VCHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] -; VCHECK-NEXT: retl ## encoding: [0xc3] +; AVX2-LABEL: test_x86_sse2_sqrt_sd: +; AVX2: ## BB#0: +; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_sqrt_sd: +; SKX: ## BB#0: +; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1] ret <2 x double> %res } @@ -1637,7 +1647,7 @@ define <2 x double> 
@test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) { ; SKX: ## BB#0: ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; SKX-NEXT: vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00] -; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0] ; SKX-NEXT: retl ## encoding: [0xc3] %a1 = load <2 x double>, <2 x double>* %a0, align 16 %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1] diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll index bd2dd53b654a..d80c87b99b64 100644 --- a/test/CodeGen/X86/statepoint-call-lowering.ll +++ b/test/CodeGen/X86/statepoint-call-lowering.ll @@ -83,6 +83,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" { ; CHECK: callq return_i1 ; CHECK-NEXT: .Ltmp5: ; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a) diff --git a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll index b88ca03805f2..90f2002e2d45 100644 --- a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll +++ b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll @@ -69,6 +69,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" { ; CHECK: callq return_i1 ; CHECK-NEXT: .Ltmp4: ; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq entry: %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 1, i32 0, i32 0, i32 addrspace(1)* %a) diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll index 784b932addc8..5aa902546c16 100644 --- a/test/CodeGen/X86/statepoint-invoke.ll +++ b/test/CodeGen/X86/statepoint-invoke.ll @@ -142,6 +142,7 @@ normal_return: ; CHECK-LABEL: %normal_return ; CHECK: xorl %eax, %eax ; CHECK-NEXT: popq + ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13) %undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14) @@ -169,6 +170,7 @@ entry: normal_return: ; CHECK: leaq ; CHECK-NEXT: popq + ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 13, i32 13) %aa.converted = bitcast i32 addrspace(1)* %aa.rel to i64 addrspace(1)* @@ -177,6 +179,7 @@ normal_return: exceptional_return: ; CHECK: movl $15 ; CHECK-NEXT: popq + ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %landing_pad = landingpad token cleanup diff --git a/test/CodeGen/X86/throws-cfi-fp.ll b/test/CodeGen/X86/throws-cfi-fp.ll new file mode 100644 index 000000000000..bacd965054c9 --- /dev/null +++ b/test/CodeGen/X86/throws-cfi-fp.ll @@ -0,0 +1,98 @@ +; RUN: llc %s -o - | FileCheck %s + +; ModuleID = 'throws-cfi-fp.cpp' +source_filename = "throws-cfi-fp.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +$__clang_call_terminate = comdat any + +@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1 +@_ZTIi = external constant i8* +@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00" + +; Function Attrs: uwtable +define void @_Z6throwsv() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + +; CHECK-LABEL: _Z6throwsv: +; CHECK: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: .cfi_def_cfa %rbp, 16 + +entry: + %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1 + br i1 %.b5, label %if.then, label %try.cont + +if.then: ; preds = %entry + %exception = tail call i8* @__cxa_allocate_exception(i64 4) + %0 = bitcast i8* %exception to i32* + store i32 1, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) + to label %unreachable unwind label %lpad + +lpad: ; preds = %if.then + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) + %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0)) + invoke void @__cxa_rethrow() + to label %unreachable unwind label %lpad1 + +lpad1: ; preds = %lpad + %4 = landingpad { i8*, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +try.cont: ; preds = %entry + ret void + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %4 + +terminate.lpad: ; preds = %lpad1 + %5 = landingpad { i8*, i32 } + catch i8* null + %6 = extractvalue { i8*, i32 } %5, 0 + tail call void @__clang_call_terminate(i8* %6) + unreachable + +unreachable: ; preds = %lpad, %if.then + unreachable +} + +declare i8* @__cxa_allocate_exception(i64) + +declare void @__cxa_throw(i8*, i8*, i8*) + +declare i32 @__gxx_personality_v0(...) 
+ +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_rethrow() + +declare void @__cxa_end_catch() + +; Function Attrs: noinline noreturn nounwind +declare void @__clang_call_terminate(i8*) + +declare void @_ZSt9terminatev() + +; Function Attrs: nounwind +declare i32 @puts(i8* nocapture readonly) + +attributes #0 = { "no-frame-pointer-elim"="true" } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!8, !9, !10} + +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "throws-cfi-fp.cpp", directory: "epilogue-dwarf/test") +!4 = !{} +!5 = !{} +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} diff --git a/test/CodeGen/X86/throws-cfi-no-fp.ll b/test/CodeGen/X86/throws-cfi-no-fp.ll new file mode 100644 index 000000000000..1483e6b8483c --- /dev/null +++ b/test/CodeGen/X86/throws-cfi-no-fp.ll @@ -0,0 +1,97 @@ +; RUN: llc %s -o - | FileCheck %s + +; ModuleID = 'throws-cfi-no-fp.cpp' +source_filename = "throws-cfi-no-fp.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +$__clang_call_terminate = comdat any + +@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1 +@_ZTIi = external constant i8* +@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00" + +; Function Attrs: uwtable +define void @_Z6throwsv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + +; CHECK-LABEL: _Z6throwsv: +; CHECK: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: .cfi_def_cfa_offset 16 + +entry: + %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1 + br i1 %.b5, label %if.then, label %try.cont + +if.then: ; preds = %entry + %exception = tail call i8* @__cxa_allocate_exception(i64 4) + %0 = bitcast i8* %exception to i32* + store i32 1, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) + to label %unreachable unwind label %lpad + +lpad: ; preds = %if.then + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) + %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0)) + invoke void @__cxa_rethrow() #4 + to label %unreachable unwind label %lpad1 + +lpad1: ; preds = %lpad + %4 = landingpad { i8*, i32 } + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +try.cont: ; preds = %entry + ret void + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %4 + +terminate.lpad: ; preds = %lpad1 + %5 = landingpad { i8*, i32 } + catch i8* null + %6 = extractvalue { i8*, i32 } %5, 0 + tail call void @__clang_call_terminate(i8* %6) + unreachable + +unreachable: ; preds = %lpad, %if.then + unreachable +} + +declare i8* @__cxa_allocate_exception(i64) + +declare void @__cxa_throw(i8*, i8*, i8*) + +declare i32 @__gxx_personality_v0(...) 
+ +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_rethrow() + +declare void @__cxa_end_catch() + +; Function Attrs: noinline noreturn nounwind +declare void @__clang_call_terminate(i8*) + +declare void @_ZSt9terminatev() + + +; Function Attrs: nounwind +declare i32 @puts(i8* nocapture readonly) + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!8, !9, !10} + +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) +!3 = !DIFile(filename: "throws-cfi-no-fp.cpp", directory: "epilogue-dwarf/test") +!4 = !{} +!5 = !{} +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll index f74343d7f2a8..208fab88b585 100644 --- a/test/CodeGen/X86/var-permute-128.ll +++ b/test/CodeGen/X86/var-permute-128.ll @@ -143,35 +143,40 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind { ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; -; AVX-LABEL: var_shuffle_v8i16: -; AVX: # BB#0: -; AVX-NEXT: vmovd %xmm1, %eax -; AVX-NEXT: vpextrw $1, %xmm1, %r10d -; AVX-NEXT: vpextrw $2, %xmm1, %ecx -; AVX-NEXT: vpextrw $3, %xmm1, %edx -; AVX-NEXT: vpextrw $4, %xmm1, %esi -; AVX-NEXT: vpextrw $5, %xmm1, %edi -; AVX-NEXT: vpextrw $6, %xmm1, %r8d -; AVX-NEXT: vpextrw $7, %xmm1, %r9d -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $7, %eax -; AVX-NEXT: andl $7, %r10d -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: andl $7, %edx -; AVX-NEXT: andl $7, %esi -; AVX-NEXT: andl $7, %edi -; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: andl $7, %r9d -; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVXNOVLBW-LABEL: var_shuffle_v8i16: +; AVXNOVLBW: # BB#0: +; AVXNOVLBW-NEXT: vmovd %xmm1, %eax +; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d +; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx +; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx +; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi +; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi +; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d +; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d +; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVXNOVLBW-NEXT: andl $7, %eax +; AVXNOVLBW-NEXT: andl $7, %r10d +; AVXNOVLBW-NEXT: andl $7, %ecx +; AVXNOVLBW-NEXT: andl $7, %edx +; AVXNOVLBW-NEXT: andl $7, %esi +; AVXNOVLBW-NEXT: andl $7, %edi +; AVXNOVLBW-NEXT: andl $7, %r8d +; AVXNOVLBW-NEXT: andl $7, %r9d +; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVXNOVLBW-NEXT: vmovd %eax, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0 +; 
AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0 +; AVXNOVLBW-NEXT: retq +; +; AVX512VLBW-LABEL: var_shuffle_v8i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0 +; AVX512VLBW-NEXT: retq %index0 = extractelement <8 x i16> %indices, i32 0 %index1 = extractelement <8 x i16> %indices, i32 1 %index2 = extractelement <8 x i16> %indices, i32 2 @@ -202,143 +207,13 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind { define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSSE3-LABEL: var_shuffle_v16i8: ; SSSE3: # BB#0: -; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm8 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm15 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm9 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm10 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm7 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm11 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm6 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm12 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm5 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm13 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm14 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%rax), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: var_shuffle_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vpextrb $0, %xmm1, %eax -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movzbl (%rax,%rcx), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpextrb $1, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $2, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $3, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $4, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $5, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $6, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $7, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $9, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $10, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $11, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $13, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb 
$14, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0 -; AVX-NEXT: vpextrb $15, %xmm1, %eax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %index0 = extractelement <16 x i8> %indices, i32 0 %index1 = extractelement <16 x i8> %indices, i32 1 diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll index dff145314eae..beef4643c131 100644 --- a/test/CodeGen/X86/var-permute-256.ll +++ b/test/CodeGen/X86/var-permute-256.ll @@ -34,32 +34,69 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind { ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; -; INT256-LABEL: var_shuffle_v4i64: -; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vmovq %xmm1, %rax -; INT256-NEXT: andl $3, %eax -; INT256-NEXT: vpextrq $1, %xmm1, %rcx -; INT256-NEXT: andl $3, %ecx -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1 -; INT256-NEXT: vmovq %xmm1, %rdx -; INT256-NEXT: andl $3, %edx -; INT256-NEXT: vpextrq $1, %xmm1, %rsi -; INT256-NEXT: andl $3, %esi -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; INT256-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; INT256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; INT256-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp -; INT256-NEXT: retq +; AVX2-LABEL: var_shuffle_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_shuffle_v4i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: andl $3, %eax +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: andl $3, %ecx +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm1, %rdx +; AVX512F-NEXT: andl $3, %edx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512F-NEXT: andl $3, %esi +; AVX512F-NEXT: vmovaps %ymm0, (%rsp) +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_shuffle_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBW-LABEL: var_shuffle_v4i64: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 %index2 = extractelement <4 x i64> %indices, i32 2 @@ -120,44 +157,7 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { ; ; INT256-LABEL: var_shuffle_v8i32: ; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vpextrq $1, %xmm1, %r8 -; INT256-NEXT: movq %r8, %rcx -; INT256-NEXT: shrq $30, %rcx -; INT256-NEXT: vmovq %xmm1, %r9 -; INT256-NEXT: movq %r9, %rsi -; INT256-NEXT: shrq $30, %rsi -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1 -; INT256-NEXT: vpextrq $1, %xmm1, %r10 -; INT256-NEXT: movq %r10, %rdi -; INT256-NEXT: shrq $30, %rdi -; INT256-NEXT: vmovq %xmm1, %rax -; INT256-NEXT: movq %rax, %rdx -; INT256-NEXT: shrq $30, %rdx -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: andl $7, %r9d -; INT256-NEXT: andl $28, %esi -; INT256-NEXT: andl $7, %r8d -; INT256-NEXT: andl $28, %ecx -; INT256-NEXT: andl $7, %eax -; INT256-NEXT: andl $28, %edx -; INT256-NEXT: andl $7, %r10d -; INT256-NEXT: andl $28, %edi -; INT256-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; INT256-NEXT: movq %rsp, %rax -; INT256-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0 -; INT256-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0 -; INT256-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0 -; INT256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; INT256-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1 -; INT256-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1 -; INT256-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1 -; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp +; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq %index0 = extractelement <8 x i32> %indices, i32 0 %index1 = extractelement <8 x i32> %indices, i32 1 @@ -250,68 +250,199 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; -; INT256-LABEL: var_shuffle_v16i16: -; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; INT256-NEXT: vmovd %xmm2, %eax -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: movzwl (%rsp,%rax,2), %eax -; INT256-NEXT: vmovd %eax, %xmm0 -; INT256-NEXT: vpextrw $1, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $2, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $3, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $4, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $5, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $6, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $6, 
(%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vpextrw $7, %xmm2, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 -; INT256-NEXT: vmovd %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: movzwl (%rsp,%rax,2), %eax -; INT256-NEXT: vmovd %eax, %xmm2 -; INT256-NEXT: vpextrw $1, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $2, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $3, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $4, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $5, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $6, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2 -; INT256-NEXT: vpextrw $7, %xmm1, %eax -; INT256-NEXT: andl $15, %eax -; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1 -; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp -; INT256-NEXT: retq +; AVX2-LABEL: var_shuffle_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vmovd %xmm2, %eax +; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpextrw $1, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $2, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $3, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $4, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $5, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $6, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $7, %xmm2, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1 +; AVX2-NEXT: vinserti128 $1, 
%xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_shuffle_v16i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vmovaps %ymm0, (%rsp) +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpextrw $1, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $2, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $3, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $4, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $5, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $6, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $7, %xmm2, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vpextrw $1, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $2, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $3, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $4, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $5, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $6, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $7, %xmm1, %eax +; AVX512F-NEXT: andl $15, %eax +; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_shuffle_v16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: pushq %rbp +; AVX512VL-NEXT: movq %rsp, %rbp +; AVX512VL-NEXT: andq $-32, %rsp +; AVX512VL-NEXT: subq $64, %rsp +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vmovd %xmm2, %eax +; AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm0 +; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $4, 
(%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: andl $15, %eax +; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: movq %rbp, %rsp +; AVX512VL-NEXT: popq %rbp +; AVX512VL-NEXT: retq +; +; AVX512VLBW-LABEL: var_shuffle_v16i16: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq %index0 = extractelement <16 x i16> %indices, i32 0 %index1 = extractelement <16 x i16> %indices, i32 1 %index2 = extractelement <16 x i16> %indices, i32 2 @@ -492,133 +623,394 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; -; INT256-LABEL: var_shuffle_v32i8: -; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; INT256-NEXT: vpextrb $0, %xmm2, %eax -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movq %rsp, %rcx -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vmovd %eax, %xmm0 -; INT256-NEXT: vpextrb $1, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $2, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $3, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $4, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $5, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $6, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $7, %xmm2, %eax -; INT256-NEXT: andl $31, %eax 
-; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $8, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $9, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $10, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $11, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $12, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $13, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $14, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $15, %xmm2, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; INT256-NEXT: vpextrb $0, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vmovd %eax, %xmm2 -; INT256-NEXT: vpextrb $1, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $2, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $3, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $4, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $5, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $6, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $7, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $8, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $9, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $10, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $11, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $12, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $13, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $14, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 -; INT256-NEXT: vpextrb $15, %xmm1, %eax -; INT256-NEXT: andl $31, %eax -; INT256-NEXT: movzbl (%rax,%rcx), %eax -; INT256-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp -; 
INT256-NEXT: retq +; AVX2-LABEL: var_shuffle_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movq %rsp, %rcx +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, 
%xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: andl $31, %eax +; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_shuffle_v32i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpextrb $0, %xmm2, %eax +; AVX512F-NEXT: vmovaps %ymm0, (%rsp) +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movq %rsp, %rcx +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpextrb $1, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $2, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $3, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $4, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $5, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $6, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $7, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $8, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $9, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $10, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $11, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $12, %xmm2, %eax +; AVX512F-NEXT: 
andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $13, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $14, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $15, %xmm2, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrb $0, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vpextrb $1, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $2, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $3, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $4, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $5, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $6, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $7, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $8, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $9, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $10, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $11, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $12, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $13, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $14, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpextrb $15, %xmm1, %eax +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_shuffle_v32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: pushq %rbp +; AVX512VL-NEXT: movq %rsp, %rbp +; AVX512VL-NEXT: andq $-32, %rsp +; AVX512VL-NEXT: subq $64, %rsp +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax +; AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movq %rsp, %rcx +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm0 +; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax +; AVX512VL-NEXT: andl 
$31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vmovd %eax, %xmm2 +; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax +; AVX512VL-NEXT: andl 
$31, %eax +; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax +; AVX512VL-NEXT: andl $31, %eax +; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: movq %rbp, %rsp +; AVX512VL-NEXT: popq %rbp +; AVX512VL-NEXT: retq +; +; VBMI-LABEL: var_shuffle_v32i8: +; VBMI: # BB#0: +; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; VBMI-NEXT: retq %index0 = extractelement <32 x i8> %indices, i32 0 %index1 = extractelement <32 x i8> %indices, i32 1 %index2 = extractelement <32 x i8> %indices, i32 2 @@ -744,30 +1136,65 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; -; INT256-LABEL: var_shuffle_v4f64: -; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vmovq %xmm1, %rax -; INT256-NEXT: andl $3, %eax -; INT256-NEXT: vpextrq $1, %xmm1, %rcx -; INT256-NEXT: andl $3, %ecx -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1 -; INT256-NEXT: vmovq %xmm1, %rdx -; INT256-NEXT: andl $3, %edx -; INT256-NEXT: vpextrq $1, %xmm1, %rsi -; INT256-NEXT: andl $3, %esi -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; INT256-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; INT256-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp -; INT256-NEXT: retq +; AVX2-LABEL: var_shuffle_v4f64: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: var_shuffle_v4f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: andl $3, %eax +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: andl $3, %ecx +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-NEXT: vmovq %xmm1, %rdx 
+; AVX512F-NEXT: andl $3, %edx +; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi +; AVX512F-NEXT: andl $3, %esi +; AVX512F-NEXT: vmovaps %ymm0, (%rsp) +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: var_shuffle_v4f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBW-LABEL: var_shuffle_v4f64: +; AVX512VLBW: # BB#0: +; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq %index0 = extractelement <4 x i64> %indices, i32 0 %index1 = extractelement <4 x i64> %indices, i32 1 %index2 = extractelement <4 x i64> %indices, i32 2 @@ -828,44 +1255,7 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi ; ; INT256-LABEL: var_shuffle_v8f32: ; INT256: # BB#0: -; INT256-NEXT: pushq %rbp -; INT256-NEXT: movq %rsp, %rbp -; INT256-NEXT: andq $-32, %rsp -; INT256-NEXT: subq $64, %rsp -; INT256-NEXT: vpextrq $1, %xmm1, %r8 -; INT256-NEXT: movq %r8, %rcx -; INT256-NEXT: shrq $30, %rcx -; INT256-NEXT: vmovq %xmm1, %r9 -; INT256-NEXT: movq %r9, %rdx -; INT256-NEXT: shrq $30, %rdx -; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1 -; INT256-NEXT: vpextrq $1, %xmm1, %r10 -; INT256-NEXT: movq %r10, %rdi -; INT256-NEXT: shrq $30, %rdi -; INT256-NEXT: vmovq %xmm1, %rax -; INT256-NEXT: movq %rax, %rsi -; INT256-NEXT: shrq $30, %rsi -; INT256-NEXT: vmovaps %ymm0, (%rsp) -; INT256-NEXT: andl $7, %r9d -; INT256-NEXT: andl $28, %edx -; INT256-NEXT: andl $7, %r8d -; INT256-NEXT: andl $28, %ecx -; INT256-NEXT: andl $7, %eax -; INT256-NEXT: andl $28, %esi -; INT256-NEXT: andl $7, %r10d -; INT256-NEXT: andl $28, %edi -; INT256-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; INT256-NEXT: movq %rsp, %rax -; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; INT256-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; INT256-NEXT: movq %rbp, %rsp -; INT256-NEXT: popq %rbp +; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; INT256-NEXT: retq %index0 = extractelement <8 x i32> %indices, i32 0 %index1 = extractelement <8 x i32> %indices, i32 1 diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll index bd1f220ceb13..15c7a1c8b8bf 100644 --- a/test/CodeGen/X86/var-permute-512.ll +++ b/test/CodeGen/X86/var-permute-512.ll @@ -6,47 +6,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v8i64: ; AVX512: # BB#0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: vmovq %xmm1, %r8 -; AVX512-NEXT: andl $7, %r8d -; AVX512-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512-NEXT: andl $7, %r9d -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: andl $7, %r10d -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: andl $7, %esi -; AVX512-NEXT: 
vextracti32x4 $2, %zmm1, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rdi -; AVX512-NEXT: andl $7, %edi -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: andl $7, %eax -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rcx -; AVX512-NEXT: andl $7, %ecx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: andl $7, %edx -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <8 x i64> %indices, i32 0 %index1 = extractelement <8 x i64> %indices, i32 1 @@ -78,76 +38,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind { define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v16i32: ; AVX512: # BB#0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rax -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: movq %rdx, %rcx -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: andl $15, %edx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: movq %rsp, %rdx -; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm0, %xmm0 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm3, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm0, %xmm0 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm3 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm3, %xmm3 -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm3, %xmm3 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm2 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm2, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: 
shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm2, %xmm2 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm1 -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm1, %xmm1 -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <16 x i32> %indices, i32 0 %index1 = extractelement <16 x i32> %indices, i32 1 @@ -381,136 +272,7 @@ define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi ; ; AVX512BW-LABEL: var_shuffle_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512BW-NEXT: vmovd %xmm4, %eax -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm0 -; AVX512BW-NEXT: vpextrw $1, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $2, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $3, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $4, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $5, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $6, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $7, %xmm4, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; 
AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrw $1, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $2, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $5, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovd %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm4 -; AVX512BW-NEXT: vpextrw $1, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $2, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $3, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $4, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax -; AVX512BW-NEXT: andl $31, %eax -; AVX512BW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq %index0 = extractelement <32 x i16> %indices, i32 0 %index1 = extractelement <32 x i16> %indices, i32 1 @@ -1014,267 +776,10 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { ; NOBW-NEXT: popq %rbp ; NOBW-NEXT: retq ; -; AVX512BW-LABEL: var_shuffle_v64i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: 
andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movq %rsp, %rsi -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vmovd %edx, %xmm0 -; AVX512BW-NEXT: vpextrb $1, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $2, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vmovd %edx, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx 
-; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vmovd %edx, %xmm4 -; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; 
AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm2 -; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm4 -; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rax,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: vpinsrb $2, (%rdx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: vpinsrb $3, (%rcx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rax,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: vpinsrb $5, (%rdx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: vpinsrb $6, (%rcx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rax,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: vpinsrb $8, (%rdx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: vpinsrb $9, (%rcx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rax,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: vpinsrb $11, (%rdx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: vpinsrb $12, (%rcx,%rsi), %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: andl $63, %edx -; AVX512BW-NEXT: andl $63, %ecx -; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx -; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx -; AVX512BW-NEXT: movzbl (%rax,%rsi), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm1 -; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-NEXT: 
vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: retq +; VBMI-LABEL: var_shuffle_v64i8: +; VBMI: # BB#0: +; VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; VBMI-NEXT: retq %index0 = extractelement <64 x i8> %indices, i32 0 %index1 = extractelement <64 x i8> %indices, i32 1 %index2 = extractelement <64 x i8> %indices, i32 2 @@ -1473,43 +978,7 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v8f64: ; AVX512: # BB#0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: vmovq %xmm1, %r8 -; AVX512-NEXT: andl $7, %r8d -; AVX512-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512-NEXT: andl $7, %r9d -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: andl $7, %r10d -; AVX512-NEXT: vpextrq $1, %xmm2, %rsi -; AVX512-NEXT: andl $7, %esi -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rdi -; AVX512-NEXT: andl $7, %edi -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: andl $7, %eax -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rcx -; AVX512-NEXT: andl $7, %ecx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: andl $7, %edx -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <8 x i64> %indices, i32 0 %index1 = extractelement <8 x i64> %indices, i32 1 @@ -1541,76 +1010,7 @@ define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) noun define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind { ; AVX512-LABEL: var_shuffle_v16f32: ; AVX512: # BB#0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %rax -; AVX512-NEXT: vmovq %xmm4, %rdx -; AVX512-NEXT: movq %rdx, %rcx -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: andl $15, %edx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: movq %rsp, %rdx -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX512-NEXT: vmovq %xmm3, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovss 
{{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],mem[0],xmm4[2,3] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],mem[0],xmm4[2,3] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],mem[0],xmm4[2,3] -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andl $15, %eax -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX512-NEXT: shrq $30, %rcx -; AVX512-NEXT: andl $60, %ecx -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %index0 = extractelement <16 x i32> %indices, i32 0 %index1 = extractelement <16 x i32> %indices, i32 1 diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index c6335d751ed1..2f52bab2803c 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -2288,67 +2288,19 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; VEX-NEXT: popq %rax ; VEX-NEXT: retq ; -; AVX512F-LABEL: fptosi_2f16_to_4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: vcvttss2si %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f16_to_4i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vcvttss2si %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VL-NEXT: retq -; -; 
AVX512DQ-LABEL: fptosi_2f16_to_4i32: -; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def> -; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm1 -; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512DQ-NEXT: vmovq %rax, %xmm0 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32: -; AVX512VLDQ: # BB#0: -; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax -; AVX512VLDQ-NEXT: vmovq %rax, %xmm1 -; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax -; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero -; AVX512VLDQ-NEXT: retq +; AVX512-LABEL: fptosi_2f16_to_4i32: +; AVX512: # BB#0: +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vcvttss2si %xmm1, %rax +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vcvttss2si %xmm0, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x i32> %ext diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll index 6e664ba98d9c..9feff88a5761 100644 --- a/test/CodeGen/X86/vector-half-conversions.ll +++ b/test/CodeGen/X86/vector-half-conversions.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL ; @@ -9,35 +9,12 @@ ; define float @cvt_i16_to_f32(i16 %a0) nounwind { -; AVX1-LABEL: cvt_i16_to_f32: -; AVX1: # BB#0: -; AVX1-NEXT: movswl %di, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_i16_to_f32: -; AVX2: # BB#0: -; AVX2-NEXT: movswl %di, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: 
retq -; -; AVX512F-LABEL: cvt_i16_to_f32: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl %di, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_i16_to_f32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl %di, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_i16_to_f32: +; ALL: # BB#0: +; ALL-NEXT: movswl %di, %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: retq %1 = bitcast i16 %a0 to half %2 = fpext half %1 to float ret float %2 @@ -111,19 +88,18 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; AVX512F-NEXT: shrq $48, %rdx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: vmovd %esi, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4i16_to_4f32: @@ -222,19 +198,18 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { ; AVX512F-NEXT: shrq $48, %rdx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: vmovd %esi, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_8i16_to_4f32: @@ -271,201 +246,54 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { } define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_8i16_to_8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r8 -; AVX1-NEXT: movq %rdx, %r10 -; AVX1-NEXT: movswl %dx, %r9d -; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: shrq $32, %r8 -; AVX1-NEXT: shrq $48, %r10 -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: movq %rdi, %rsi -; AVX1-NEXT: movswl %di, %ecx -; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> -; AVX1-NEXT: shrl $16, %edi -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: shrq $48, %rsi -; AVX1-NEXT: movswl %si, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: cwtl -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movswl %di, %eax -; 
AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: movswl %r10w, %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: movswl %r8w, %eax -; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: movswl %dx, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vmovd %r9d, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8i16_to_8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: movq %rdx, %r10 -; AVX2-NEXT: movswl %dx, %r9d -; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> -; AVX2-NEXT: shrl $16, %edx -; AVX2-NEXT: shrq $32, %r8 -; AVX2-NEXT: shrq $48, %r10 -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: movq %rdi, %rsi -; AVX2-NEXT: movswl %di, %ecx -; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> -; AVX2-NEXT: shrl $16, %edi -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: shrq $48, %rsi -; AVX2-NEXT: movswl %si, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movswl %di, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vmovd %ecx, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: movswl %r10w, %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: movswl %r8w, %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: movswl %dx, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vmovd %r9d, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_8i16_to_8f32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512F-NEXT: movq %rdx, %r8 -; AVX512F-NEXT: movq %rdx, %r9 -; AVX512F-NEXT: movswl %dx, %r10d -; AVX512F-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> -; AVX512F-NEXT: shrl $16, %edx -; AVX512F-NEXT: shrq $32, %r8 -; AVX512F-NEXT: shrq $48, %r9 -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: movq %rdi, %rcx -; AVX512F-NEXT: movswl %di, %esi -; AVX512F-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> -; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: shrq $32, %rax -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: 
vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: movswl %di, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: vmovd %esi, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: movswl %r9w, %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 -; AVX512F-NEXT: movswl %r8w, %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 -; AVX512F-NEXT: movswl %dx, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 -; AVX512F-NEXT: vmovd %r10d, %xmm7 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_8i16_to_8f32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512VL-NEXT: movq %rdx, %r8 -; AVX512VL-NEXT: movq %rdx, %r10 -; AVX512VL-NEXT: movswl %dx, %r9d -; AVX512VL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> -; AVX512VL-NEXT: shrl $16, %edx -; AVX512VL-NEXT: shrq $32, %r8 -; AVX512VL-NEXT: shrq $48, %r10 -; AVX512VL-NEXT: vmovq %xmm0, %rdi -; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %rdi, %rsi -; AVX512VL-NEXT: movswl %di, %ecx -; AVX512VL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> -; AVX512VL-NEXT: shrl $16, %edi -; AVX512VL-NEXT: shrq $32, %rax -; AVX512VL-NEXT: shrq $48, %rsi -; AVX512VL-NEXT: movswl %si, %esi -; AVX512VL-NEXT: vmovd %esi, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl %di, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vmovd %ecx, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: movswl %r10w, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: movswl %r8w, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: movswl %dx, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vmovd %r9d, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_8i16_to_8f32: +; ALL: # BB#0: +; ALL-NEXT: vpextrq $1, %xmm0, %rdx +; ALL-NEXT: movq %rdx, %r8 +; ALL-NEXT: movq %rdx, %r10 +; ALL-NEXT: movswl %dx, %r9d +; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> +; ALL-NEXT: shrl $16, %edx +; ALL-NEXT: shrq $32, %r8 +; ALL-NEXT: shrq $48, %r10 +; ALL-NEXT: vmovq %xmm0, %rdi +; ALL-NEXT: movq %rdi, %rax +; ALL-NEXT: movq %rdi, %rsi +; ALL-NEXT: movswl %di, %ecx +; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill> +; ALL-NEXT: shrl $16, 
%edi +; ALL-NEXT: shrq $32, %rax +; ALL-NEXT: shrq $48, %rsi +; ALL-NEXT: movswl %si, %esi +; ALL-NEXT: vmovd %esi, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: cwtl +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: movswl %di, %eax +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: vmovd %ecx, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: movswl %r10w, %eax +; ALL-NEXT: vmovd %eax, %xmm4 +; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 +; ALL-NEXT: movswl %r8w, %eax +; ALL-NEXT: vmovd %eax, %xmm5 +; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 +; ALL-NEXT: movswl %dx, %eax +; ALL-NEXT: vmovd %eax, %xmm6 +; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 +; ALL-NEXT: vmovd %r9d, %xmm7 +; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 +; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; ALL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> ret <8 x float> %2 @@ -664,98 +492,98 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; ; AVX512F-LABEL: cvt_16i16_to_16f32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $48, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: vmovd %ecx, %xmm8 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $32, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: vmovd %ecx, %xmm9 ; AVX512F-NEXT: movswl %ax, %ecx ; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> ; AVX512F-NEXT: shrl $16, %eax ; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vmovd %eax, %xmm11 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vmovd %ecx, %xmm12 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $48, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm5 +; AVX512F-NEXT: vmovd %ecx, %xmm13 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $32, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: vmovd %ecx, %xmm14 ; AVX512F-NEXT: movswl %ax, %ecx ; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> ; AVX512F-NEXT: shrl $16, %eax ; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm8 +; AVX512F-NEXT: vmovd %eax, %xmm15 +; AVX512F-NEXT: vmovq %xmm10, %rax +; AVX512F-NEXT: vmovd %ecx, %xmm2 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $48, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm9 +; AVX512F-NEXT: vmovd %ecx, %xmm3 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $32, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm10 +; AVX512F-NEXT: vmovd %ecx, %xmm1 ; AVX512F-NEXT: movswl %ax, %ecx ; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> ; AVX512F-NEXT: shrl $16, %eax ; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm11 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vmovd %eax, %xmm4 +; 
AVX512F-NEXT: vpextrq $1, %xmm10, %rax +; AVX512F-NEXT: vmovd %ecx, %xmm10 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $48, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm12 +; AVX512F-NEXT: vmovd %ecx, %xmm5 ; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: shrq $32, %rcx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm13 +; AVX512F-NEXT: vmovd %ecx, %xmm6 ; AVX512F-NEXT: movl %eax, %ecx ; AVX512F-NEXT: shrl $16, %ecx ; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm14 +; AVX512F-NEXT: vmovd %ecx, %xmm7 ; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm15 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 -; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8 -; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9 -; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10 -; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12 -; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13 -; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14 -; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11 +; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12 +; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13 +; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14 +; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3] +; 
AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_16i16_to_16f32: @@ -863,35 +691,12 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; define float @load_cvt_i16_to_f32(i16* %a0) nounwind { -; AVX1-LABEL: load_cvt_i16_to_f32: -; AVX1: # BB#0: -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_i16_to_f32: -; AVX2: # BB#0: -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_i16_to_f32: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_i16_to_f32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_i16_to_f32: +; ALL: # BB#0: +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: retq %1 = load i16, i16* %a0 %2 = bitcast i16 %1 to half %3 = fpext half %2 to float @@ -899,82 +704,24 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind { } define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { -; AVX1-LABEL: load_cvt_4i16_to_4f32: -; AVX1: # BB#0: -; AVX1-NEXT: movswl 6(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movswl 4(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movswl 2(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_4i16_to_4f32: -; AVX2: # BB#0: -; AVX2-NEXT: movswl 6(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movswl 4(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movswl 2(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_4i16_to_4f32: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl 6(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; 
AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_4i16_to_4f32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl 6(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 4(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_4i16_to_4f32: +; ALL: # BB#0: +; ALL-NEXT: movswl 6(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movswl 4(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: movswl 2(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> %3 = fpext <4 x half> %2 to <4 x float> @@ -1046,19 +793,18 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { ; AVX512F-NEXT: shrq $48, %rdx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: vmovd %esi, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_8i16_to_4f32: @@ -1096,145 +842,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { } define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { -; AVX1-LABEL: load_cvt_8i16_to_8f32: -; AVX1: # BB#0: -; AVX1-NEXT: movswl 6(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movswl 4(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movswl 2(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: movswl 
14(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: movswl 12(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: movswl 8(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: movswl 10(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_8i16_to_8f32: -; AVX2: # BB#0: -; AVX2-NEXT: movswl 6(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movswl 4(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movswl 2(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: movswl 14(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: movswl 12(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: movswl 8(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: movswl 10(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_8i16_to_8f32: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl 6(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: movswl 14(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 -; AVX512F-NEXT: movswl 12(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 -; AVX512F-NEXT: movswl 8(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 -; AVX512F-NEXT: movswl 10(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_8i16_to_8f32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl 6(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 4(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: movswl 14(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: movswl 12(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: movswl 8(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: movswl 10(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_8i16_to_8f32: +; ALL: # BB#0: +; ALL-NEXT: movswl 6(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movswl 4(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: movswl 2(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: movswl 14(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm4 +; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 +; ALL-NEXT: movswl 12(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm5 +; ALL-NEXT: vcvtph2ps %xmm5, %xmm5 +; ALL-NEXT: movswl 8(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm6 +; ALL-NEXT: vcvtph2ps %xmm6, %xmm6 +; ALL-NEXT: movswl 10(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm7 +; ALL-NEXT: vcvtph2ps %xmm7, %xmm7 +; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] +; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> %3 = fpext <8 x half> %2 to <8 x float> @@ -1378,65 +1019,65 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; AVX512F: # BB#0: ; AVX512F-NEXT: movswl 6(%rdi), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8 ; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9 ; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps 
%ymm2, %zmm2 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10 ; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11 ; AVX512F-NEXT: movswl 14(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12 ; AVX512F-NEXT: movswl 12(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13 ; AVX512F-NEXT: movswl 8(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14 ; AVX512F-NEXT: movswl 10(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15 ; AVX512F-NEXT: movswl 22(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm8 -; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8 +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: movswl 20(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm9 -; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: movswl 16(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm10 -; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10 +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: movswl 18(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm11 -; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11 +; AVX512F-NEXT: vmovd %eax, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: movswl 30(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm12 -; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12 +; AVX512F-NEXT: vmovd %eax, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 ; AVX512F-NEXT: movswl 28(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm13 -; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13 +; AVX512F-NEXT: vmovd %eax, %xmm5 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 ; AVX512F-NEXT: movswl 24(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm14 -; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14 +; AVX512F-NEXT: vmovd %eax, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX512F-NEXT: movswl 26(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm15 -; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = 
xmm2[0,1],xmm17[0],xmm2[3] -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq @@ -1518,38 +1159,13 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; define double @cvt_i16_to_f64(i16 %a0) nounwind { -; AVX1-LABEL: cvt_i16_to_f64: -; AVX1: # BB#0: -; AVX1-NEXT: movswl %di, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_i16_to_f64: -; AVX2: # BB#0: -; AVX2-NEXT: movswl %di, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_i16_to_f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl %di, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_i16_to_f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl %di, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_i16_to_f64: +; ALL: # BB#0: +; ALL-NEXT: movswl %di, %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: retq %1 = bitcast i16 %a0 to half %2 = fpext half %1 to double ret double %2 @@ -1599,13 +1215,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { ; AVX512F-NEXT: shrl $16, %eax ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_2i16_to_2f64: @@ -1701,15 +1316,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; AVX512F-NEXT: shrl $16, %edx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %esi, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vmovlhps 
{{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -1791,13 +1406,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { ; AVX512F-NEXT: shrl $16, %eax ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %ecx, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_8i16_to_2f64: @@ -1892,15 +1506,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { ; AVX512F-NEXT: shrl $16, %edx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %esi, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -1950,25 +1564,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_8i16_to_8f64: ; AVX1: # BB#0: ; AVX1-NEXT: vmovq %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movq %rdx, %r9 ; AVX1-NEXT: movl %edx, %r10d -; AVX1-NEXT: movswl %dx, %r9d +; AVX1-NEXT: movswl %dx, %r8d ; AVX1-NEXT: shrq $48, %rdx -; AVX1-NEXT: shrq $32, %r8 +; AVX1-NEXT: shrq $32, %r9 ; AVX1-NEXT: shrl $16, %r10d ; AVX1-NEXT: vpextrq $1, %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: movq %rdi, %rsi +; AVX1-NEXT: movl %edi, %eax ; AVX1-NEXT: movswl %di, %ecx ; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: shrl $16, %esi -; AVX1-NEXT: movswl %si, %esi -; AVX1-NEXT: vmovd %esi, %xmm0 +; AVX1-NEXT: shrq $32, %rsi +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: cwtl +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1 ; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2 -; AVX1-NEXT: cwtl +; AVX1-NEXT: movswl %si, %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3 ; AVX1-NEXT: movswl %di, %eax @@ -1977,9 +1591,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX1-NEXT: movswl %r10w, %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vmovd %r9d, %xmm5 +; AVX1-NEXT: vmovd %r8d, %xmm5 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: movswl %r8w, %eax +; AVX1-NEXT: movswl %r9w, %eax ; AVX1-NEXT: vmovd %eax, %xmm6 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX1-NEXT: movswl %dx, %eax @@ -2004,25 +1618,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX2-LABEL: cvt_8i16_to_8f64: ; AVX2: # BB#0: ; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdx, %r9 ; AVX2-NEXT: movl %edx, %r10d -; AVX2-NEXT: movswl %dx, %r9d +; AVX2-NEXT: movswl %dx, %r8d ; AVX2-NEXT: shrq $48, %rdx -; AVX2-NEXT: shrq $32, %r8 +; AVX2-NEXT: shrq $32, %r9 ; AVX2-NEXT: shrl $16, %r10d ; AVX2-NEXT: vpextrq $1, %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: movl %edi, %esi +; 
AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: movl %edi, %eax ; AVX2-NEXT: movswl %di, %ecx ; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: shrl $16, %esi -; AVX2-NEXT: movswl %si, %esi -; AVX2-NEXT: vmovd %esi, %xmm0 +; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: cwtl +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1 ; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2 -; AVX2-NEXT: cwtl +; AVX2-NEXT: movswl %si, %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3 ; AVX2-NEXT: movswl %di, %eax @@ -2031,9 +1645,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX2-NEXT: movswl %r10w, %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vmovd %r9d, %xmm5 +; AVX2-NEXT: vmovd %r8d, %xmm5 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: movswl %r8w, %eax +; AVX2-NEXT: movswl %r9w, %eax ; AVX2-NEXT: vmovd %eax, %xmm6 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 ; AVX2-NEXT: movswl %dx, %eax @@ -2055,115 +1669,60 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_8i16_to_8f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512F-NEXT: movq %rdx, %r8 -; AVX512F-NEXT: movl %edx, %r9d -; AVX512F-NEXT: movswl %dx, %r10d -; AVX512F-NEXT: shrq $48, %rdx -; AVX512F-NEXT: shrq $32, %r8 -; AVX512F-NEXT: shrl $16, %r9d -; AVX512F-NEXT: vmovq %xmm0, %rdi -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: movl %edi, %ecx -; AVX512F-NEXT: movswl %di, %esi -; AVX512F-NEXT: shrq $48, %rdi -; AVX512F-NEXT: shrq $32, %rax -; AVX512F-NEXT: shrl $16, %ecx -; AVX512F-NEXT: movswl %cx, %ecx -; AVX512F-NEXT: vmovd %ecx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: vmovd %esi, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: cwtl -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: movswl %di, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: movswl %r9w, %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 -; AVX512F-NEXT: vmovd %r10d, %xmm5 -; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 -; AVX512F-NEXT: movswl %r8w, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 -; AVX512F-NEXT: movswl %dx, %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 -; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_8i16_to_8f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512VL-NEXT: movq %rdx, %r8 -; AVX512VL-NEXT: movl %edx, %r10d -; AVX512VL-NEXT: movswl %dx, %r9d -; AVX512VL-NEXT: shrq $48, %rdx -; 
AVX512VL-NEXT: shrq $32, %r8 -; AVX512VL-NEXT: shrl $16, %r10d -; AVX512VL-NEXT: vmovq %xmm0, %rdi -; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movl %edi, %esi -; AVX512VL-NEXT: movswl %di, %ecx -; AVX512VL-NEXT: shrq $48, %rdi -; AVX512VL-NEXT: shrq $32, %rax -; AVX512VL-NEXT: shrl $16, %esi -; AVX512VL-NEXT: movswl %si, %esi -; AVX512VL-NEXT: vmovd %esi, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %ecx, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: cwtl -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl %di, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: movswl %r10w, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vmovd %r9d, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: movswl %r8w, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: movswl %dx, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 -; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 -; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: cvt_8i16_to_8f64: +; AVX512: # BB#0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512-NEXT: movq %rdx, %r9 +; AVX512-NEXT: movl %edx, %r10d +; AVX512-NEXT: movswl %dx, %r8d +; AVX512-NEXT: shrq $48, %rdx +; AVX512-NEXT: shrq $32, %r9 +; AVX512-NEXT: shrl $16, %r10d +; AVX512-NEXT: vmovq %xmm0, %rdi +; AVX512-NEXT: movq %rdi, %rsi +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: movswl %di, %ecx +; AVX512-NEXT: shrq $48, %rdi +; AVX512-NEXT: shrq $32, %rsi +; AVX512-NEXT: shrl $16, %eax +; AVX512-NEXT: cwtl +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: movswl %si, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: movswl %di, %eax +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: movswl %r10w, %eax +; AVX512-NEXT: vmovd %eax, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovd %r8d, %xmm5 +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: movswl %r9w, %eax +; AVX512-NEXT: vmovd %eax, %xmm6 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512-NEXT: movswl %dx, %eax +; AVX512-NEXT: vmovd %eax, %xmm7 +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 +; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 +; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] +; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x double> ret <8 x double> %2 @@ -2174,38 +1733,13 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; define double @load_cvt_i16_to_f64(i16* %a0) nounwind { -; AVX1-LABEL: load_cvt_i16_to_f64: -; AVX1: # BB#0: -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_i16_to_f64: -; AVX2: # BB#0: -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_i16_to_f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_i16_to_f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_i16_to_f64: +; ALL: # BB#0: +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: retq %1 = load i16, i16* %a0 %2 = bitcast i16 %1 to half %3 = fpext half %2 to double @@ -2213,58 +1747,18 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind { } define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind { -; AVX1-LABEL: load_cvt_2i16_to_2f64: -; AVX1: # BB#0: -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movswl 2(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_2i16_to_2f64: -; AVX2: # BB#0: -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movswl 2(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_2i16_to_2f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_2i16_to_2f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, 
%xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_2i16_to_2f64: +; ALL: # BB#0: +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movswl 2(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: retq %1 = load <2 x i16>, <2 x i16>* %a0 %2 = bitcast <2 x i16> %1 to <2 x half> %3 = fpext <2 x half> %2 to <2 x double> @@ -2272,97 +1766,28 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind { } define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { -; AVX1-LABEL: load_cvt_4i16_to_4f64: -; AVX1: # BB#0: -; AVX1-NEXT: movswl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movswl 2(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movswl 4(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movswl 6(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_4i16_to_4f64: -; AVX2: # BB#0: -; AVX2-NEXT: movswl (%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movswl 2(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movswl 4(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movswl 6(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_cvt_4i16_to_4f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: movswl 6(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, 
%ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_4i16_to_4f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl 4(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl 6(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: load_cvt_4i16_to_4f64: +; ALL: # BB#0: +; ALL-NEXT: movswl (%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 +; ALL-NEXT: movswl 2(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm1 +; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 +; ALL-NEXT: movswl 4(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm2 +; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 +; ALL-NEXT: movswl 6(%rdi), %eax +; ALL-NEXT: vmovd %eax, %xmm3 +; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 +; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> %3 = fpext <4 x half> %2 to <4 x double> @@ -2439,15 +1864,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { ; AVX512F-NEXT: shrl $16, %edx ; AVX512F-NEXT: movswl %dx, %edx ; AVX512F-NEXT: vmovd %edx, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %esi, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: movswl %cx, %ecx ; AVX512F-NEXT: vmovd %ecx, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 ; AVX512F-NEXT: cwtl ; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -2579,91 +2004,48 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cvt_8i16_to_8f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: movswl (%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 -; AVX512F-NEXT: movswl 2(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1 -; AVX512F-NEXT: movswl 4(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2 -; AVX512F-NEXT: movswl 6(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3 -; AVX512F-NEXT: movswl 8(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4 -; AVX512F-NEXT: movswl 10(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; 
AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5 -; AVX512F-NEXT: movswl 12(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6 -; AVX512F-NEXT: movswl 14(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7 -; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_8i16_to_8f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: movswl (%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: movswl 2(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: movswl 4(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: movswl 6(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: movswl 8(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: movswl 10(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: movswl 12(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: movswl 14(%rdi), %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 -; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 -; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: load_cvt_8i16_to_8f64: +; AVX512: # BB#0: +; AVX512-NEXT: movswl (%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movswl 2(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: movswl 4(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: movswl 6(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: movswl 8(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: movswl 10(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm5 +; AVX512-NEXT: 
vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: movswl 12(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm6 +; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512-NEXT: movswl 14(%rdi), %eax +; AVX512-NEXT: vmovd %eax, %xmm7 +; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7 +; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5 +; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> %3 = fpext <8 x half> %2 to <8 x double> @@ -2675,138 +2057,41 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { ; define i16 @cvt_f32_to_i16(float %a0) nounwind { -; AVX1-LABEL: cvt_f32_to_i16: -; AVX1: # BB#0: -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_f32_to_i16: -; AVX2: # BB#0: -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_f32_to_i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_f32_to_i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_f32_to_i16: +; ALL: # BB#0: +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ALL-NEXT: retq %1 = fptrunc float %a0 to half %2 = bitcast half %1 to i16 ret i16 %2 } define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind { -; AVX1-LABEL: cvt_4f32_to_4i16: -; AVX1: # BB#0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %ecx -; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: movzwl %dx, %edx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: shlq $32, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f32_to_4i16: -; AVX2: # BB#0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; 
AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_4f32_to_4i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx -; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: shll $16, %edx -; AVX512F-NEXT: orl %eax, %edx -; AVX512F-NEXT: shlq $32, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_4f32_to_4i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: orl %eax, %ecx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %edx -; AVX512VL-NEXT: movzwl %dx, %edx -; AVX512VL-NEXT: orl %eax, %edx -; AVX512VL-NEXT: shlq $32, %rdx -; AVX512VL-NEXT: orq %rcx, %rdx -; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_4f32_to_4i16: +; ALL: # BB#0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %eax, %edx +; ALL-NEXT: shlq $32, %rdx +; ALL-NEXT: orq %rcx, %rdx +; ALL-NEXT: vmovq %rdx, %xmm0 +; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> ret <4 x i16> %2 @@ -2865,29 +2150,27 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { ; ; AVX512F-LABEL: cvt_4f32_to_8i16_undef: ; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, 
%eax ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx +; AVX512F-NEXT: movzwl %cx, %ecx ; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: shll $16, %edx +; AVX512F-NEXT: movzwl %dx, %edx ; AVX512F-NEXT: orl %eax, %edx ; AVX512F-NEXT: shlq $32, %rdx ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4f32_to_8i16_undef: @@ -2974,29 +2257,27 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { ; ; AVX512F-LABEL: cvt_4f32_to_8i16_zero: ; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx +; AVX512F-NEXT: movzwl %cx, %ecx ; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: shll $16, %edx +; AVX512F-NEXT: movzwl %dx, %edx ; AVX512F-NEXT: orl %eax, %edx ; AVX512F-NEXT: shlq $32, %rdx ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: @@ -3033,194 +2314,52 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { } define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind { -; AVX1-LABEL: cvt_8f32_to_8i16: -; AVX1: # BB#0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %ecx -; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %edx -; AVX1-NEXT: shll $16, %edx -; 
AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %edx, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %edx -; AVX1-NEXT: movzwl %dx, %edx -; AVX1-NEXT: orl %ecx, %edx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: shll $16, %ecx -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: movzwl %si, %esi -; AVX1-NEXT: orl %ecx, %esi -; AVX1-NEXT: shlq $32, %rsi -; AVX1-NEXT: orq %rdx, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8f32_to_8i16: -; AVX2: # BB#0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %edx -; AVX2-NEXT: shll $16, %edx -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %edx, %eax -; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: shll $16, %ecx -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %ecx, %edx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: shll $16, %ecx -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: movzwl %si, %esi -; AVX2-NEXT: orl %ecx, %esi -; AVX2-NEXT: shlq $32, %rsi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm0 -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_8f32_to_8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx -; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %edx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: orl 
%edx, %eax -; AVX512F-NEXT: shlq $32, %rax -; AVX512F-NEXT: orq %rcx, %rax -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: movzwl %cx, %ecx -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %edx -; AVX512F-NEXT: shll $16, %edx -; AVX512F-NEXT: orl %ecx, %edx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: movzwl %cx, %ecx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %esi -; AVX512F-NEXT: shll $16, %esi -; AVX512F-NEXT: orl %ecx, %esi -; AVX512F-NEXT: shlq $32, %rsi -; AVX512F-NEXT: orq %rdx, %rsi -; AVX512F-NEXT: vmovq %rsi, %xmm0 -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_8f32_to_8i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: orl %eax, %ecx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %edx -; AVX512VL-NEXT: shll $16, %edx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: orl %edx, %eax -; AVX512VL-NEXT: shlq $32, %rax -; AVX512VL-NEXT: orq %rcx, %rax -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: shll $16, %ecx -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %edx -; AVX512VL-NEXT: movzwl %dx, %edx -; AVX512VL-NEXT: orl %ecx, %edx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: shll $16, %ecx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %esi -; AVX512VL-NEXT: movzwl %si, %esi -; AVX512VL-NEXT: orl %ecx, %esi -; AVX512VL-NEXT: shlq $32, %rsi -; AVX512VL-NEXT: orq %rdx, %rsi -; AVX512VL-NEXT: vmovq %rsi, %xmm0 -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_8f32_to_8i16: +; ALL: # BB#0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %edx +; ALL-NEXT: shll $16, %edx +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: movzwl %ax, %eax +; ALL-NEXT: orl %edx, %eax +; ALL-NEXT: shlq $32, %rax +; 
ALL-NEXT: orq %rcx, %rax +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: shll $16, %ecx +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %ecx, %edx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: shll $16, %ecx +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %esi +; ALL-NEXT: movzwl %si, %esi +; ALL-NEXT: orl %ecx, %esi +; ALL-NEXT: shlq $32, %rsi +; ALL-NEXT: orq %rdx, %rsi +; ALL-NEXT: vmovq %rsi, %xmm0 +; ALL-NEXT: vmovq %rax, %xmm1 +; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %1 = fptrunc <8 x float> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> ret <8 x i16> %2 @@ -3361,141 +2500,73 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_16f32_to_16i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1 -; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0 -; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: 
vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_16f32_to_16i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2 -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1 -; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; 
AVX512VL-NEXT: retq +; AVX512-LABEL: cvt_16f32_to_16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0 +; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %1 = fptrunc <16 x float> %a0 to <16 x half> %2 = bitcast <16 x half> %1 to <16 x i16> ret <16 x i16> %2 @@ -3506,35 +2577,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind { -; AVX1-LABEL: store_cvt_f32_to_i16: -; AVX1: # BB#0: -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: movw %ax, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_f32_to_i16: -; AVX2: # BB#0: -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movw %ax, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: 
store_cvt_f32_to_i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: movw %ax, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_f32_to_i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: movw %ax, (%rdi) -; AVX512VL-NEXT: retq +; ALL-LABEL: store_cvt_f32_to_i16: +; ALL: # BB#0: +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, (%rdi) +; ALL-NEXT: retq %1 = fptrunc float %a0 to half %2 = bitcast half %1 to i16 store i16 %2, i16* %a1 @@ -3542,83 +2590,24 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind { } define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f32_to_4i16: -; AVX1: # BB#0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %edx -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: movw %si, (%rdi) -; AVX1-NEXT: movw %dx, 6(%rdi) -; AVX1-NEXT: movw %cx, 4(%rdi) -; AVX1-NEXT: movw %ax, 2(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f32_to_4i16: -; AVX2: # BB#0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %edx -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: movw %si, (%rdi) -; AVX2-NEXT: movw %dx, 6(%rdi) -; AVX2-NEXT: movw %cx, 4(%rdi) -; AVX2-NEXT: movw %ax, 2(%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_cvt_4f32_to_4i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %edx -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %esi -; AVX512F-NEXT: movw %si, (%rdi) -; AVX512F-NEXT: movw %dx, 6(%rdi) -; AVX512F-NEXT: movw %cx, 4(%rdi) -; AVX512F-NEXT: movw %ax, 2(%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_4f32_to_4i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %edx -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, 
%esi -; AVX512VL-NEXT: movw %si, (%rdi) -; AVX512VL-NEXT: movw %dx, 6(%rdi) -; AVX512VL-NEXT: movw %cx, 4(%rdi) -; AVX512VL-NEXT: movw %ax, 2(%rdi) -; AVX512VL-NEXT: retq +; ALL-LABEL: store_cvt_4f32_to_4i16: +; ALL: # BB#0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %edx +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %esi +; ALL-NEXT: movw %si, (%rdi) +; ALL-NEXT: movw %dx, 6(%rdi) +; ALL-NEXT: movw %cx, 4(%rdi) +; ALL-NEXT: movw %ax, 2(%rdi) +; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> store <4 x i16> %2, <4 x i16>* %a1 @@ -3680,30 +2669,28 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw ; ; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef: ; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx +; AVX512F-NEXT: movzwl %cx, %ecx ; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: shll $16, %edx +; AVX512F-NEXT: movzwl %dx, %edx ; AVX512F-NEXT: orl %eax, %edx ; AVX512F-NEXT: shlq $32, %rdx ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef: @@ -3794,30 +2781,28 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi ; ; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: ; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: shll $16, %ecx +; AVX512F-NEXT: movzwl %cx, %ecx ; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm1, %eax -; 
AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512F-NEXT: shll $16, %eax +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: shll $16, %edx +; AVX512F-NEXT: movzwl %dx, %edx ; AVX512F-NEXT: orl %eax, %edx ; AVX512F-NEXT: shlq $32, %rdx ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: @@ -3856,150 +2841,41 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi } define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_8f32_to_8i16: -; AVX1: # BB#0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %r8d -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %r9d -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %r10d -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %r11d -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %eax -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %xmm2, %ecx -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %esi -; AVX1-NEXT: movw %si, 8(%rdi) -; AVX1-NEXT: movw %dx, (%rdi) -; AVX1-NEXT: movw %cx, 14(%rdi) -; AVX1-NEXT: movw %ax, 12(%rdi) -; AVX1-NEXT: movw %r11w, 10(%rdi) -; AVX1-NEXT: movw %r10w, 6(%rdi) -; AVX1-NEXT: movw %r9w, 4(%rdi) -; AVX1-NEXT: movw %r8w, 2(%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_8f32_to_8i16: -; AVX2: # BB#0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %r8d -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %r9d -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %r10d -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %r11d -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %eax -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %xmm2, %ecx -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %esi -; AVX2-NEXT: movw %si, 8(%rdi) -; AVX2-NEXT: movw %dx, (%rdi) -; AVX2-NEXT: movw %cx, 14(%rdi) -; AVX2-NEXT: movw %ax, 12(%rdi) -; AVX2-NEXT: movw %r11w, 10(%rdi) -; AVX2-NEXT: movw %r10w, 6(%rdi) -; AVX2-NEXT: movw %r9w, 4(%rdi) -; AVX2-NEXT: movw %r8w, 2(%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; 
AVX512F-LABEL: store_cvt_8f32_to_8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %r8d -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %r9d -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: vmovd %xmm1, %r10d -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vmovd %xmm2, %r11d -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vmovd %xmm2, %ecx -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %esi -; AVX512F-NEXT: movw %si, 8(%rdi) -; AVX512F-NEXT: movw %dx, (%rdi) -; AVX512F-NEXT: movw %cx, 14(%rdi) -; AVX512F-NEXT: movw %ax, 12(%rdi) -; AVX512F-NEXT: movw %r11w, 10(%rdi) -; AVX512F-NEXT: movw %r10w, 6(%rdi) -; AVX512F-NEXT: movw %r9w, 4(%rdi) -; AVX512F-NEXT: movw %r8w, 2(%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_8f32_to_8i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %r8d -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %r9d -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %r10d -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovd %xmm2, %r11d -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovd %xmm2, %ecx -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %edx -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %esi -; AVX512VL-NEXT: movw %si, 8(%rdi) -; AVX512VL-NEXT: movw %dx, (%rdi) -; AVX512VL-NEXT: movw %cx, 14(%rdi) -; AVX512VL-NEXT: movw %ax, 12(%rdi) -; AVX512VL-NEXT: movw %r11w, 10(%rdi) -; AVX512VL-NEXT: movw %r10w, 6(%rdi) -; AVX512VL-NEXT: movw %r9w, 4(%rdi) -; AVX512VL-NEXT: movw %r8w, 2(%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; ALL-LABEL: store_cvt_8f32_to_8i16: +; ALL: # BB#0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %r8d +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %r9d +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %r10d +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %r11d +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm1[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %eax +; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; ALL-NEXT: vmovd %xmm2, %ecx +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0 +; ALL-NEXT: vmovd %xmm0, %esi +; ALL-NEXT: movw %si, 8(%rdi) +; ALL-NEXT: movw %dx, (%rdi) +; ALL-NEXT: movw %cx, 14(%rdi) +; ALL-NEXT: movw %ax, 12(%rdi) +; ALL-NEXT: movw %r11w, 10(%rdi) +; ALL-NEXT: movw %r10w, 6(%rdi) +; ALL-NEXT: movw %r9w, 4(%rdi) +; ALL-NEXT: movw %r8w, 2(%rdi) +; ALL-NEXT: vzeroupper +; ALL-NEXT: retq %1 = fptrunc <8 x float> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> store <8 x i16> %2, <8 x i16>* %a1 @@ -4141,141 +3017,73 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: store_cvt_16f32_to_16i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4 -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4 -; AVX512F-NEXT: movw %ax, 24(%rdi) -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4 -; AVX512F-NEXT: movw %ax, 16(%rdi) -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4 -; AVX512F-NEXT: movw %ax, 8(%rdi) -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4 -; AVX512F-NEXT: movw %ax, (%rdi) -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4 -; AVX512F-NEXT: movw %ax, 30(%rdi) -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3 -; AVX512F-NEXT: movw %ax, 28(%rdi) -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3 -; AVX512F-NEXT: movw %ax, 26(%rdi) -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3 -; AVX512F-NEXT: movw %ax, 22(%rdi) -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: movw %ax, 20(%rdi) -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: movw %ax, 18(%rdi) -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1 -; AVX512F-NEXT: movw %ax, 14(%rdi) -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movw %ax, 12(%rdi) -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: movw %ax, 10(%rdi) -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: movw %ax, 6(%rdi) -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: movw %ax, 4(%rdi) -; 
AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: movw %ax, 2(%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_16f32_to_16i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4 -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4 -; AVX512VL-NEXT: movw %ax, 24(%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4 -; AVX512VL-NEXT: movw %ax, 16(%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4 -; AVX512VL-NEXT: movw %ax, 8(%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512VL-NEXT: movw %ax, (%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512VL-NEXT: movw %ax, 30(%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512VL-NEXT: movw %ax, 28(%rdi) -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512VL-NEXT: movw %ax, 26(%rdi) -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512VL-NEXT: movw %ax, 22(%rdi) -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: movw %ax, 20(%rdi) -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: movw %ax, 18(%rdi) -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: movw %ax, 14(%rdi) -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movw %ax, 12(%rdi) -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: movw %ax, 10(%rdi) -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: movw %ax, 6(%rdi) -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: movw %ax, 4(%rdi) -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: movw %ax, 2(%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: store_cvt_16f32_to_16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4 +; AVX512-NEXT: movw %ax, 24(%rdi) +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4 +; AVX512-NEXT: movw %ax, 16(%rdi) +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4 +; AVX512-NEXT: movw %ax, 8(%rdi) +; AVX512-NEXT: vmovd %xmm4, 
%eax +; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: movw %ax, (%rdi) +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: movw %ax, 30(%rdi) +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: movw %ax, 28(%rdi) +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: movw %ax, 26(%rdi) +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: movw %ax, 22(%rdi) +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: movw %ax, 20(%rdi) +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: movw %ax, 18(%rdi) +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512-NEXT: movw %ax, 14(%rdi) +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movw %ax, 12(%rdi) +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: movw %ax, 10(%rdi) +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: movw %ax, 6(%rdi) +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: movw %ax, 4(%rdi) +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: movw %ax, 2(%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = fptrunc <16 x float> %a0 to <16 x half> %2 = bitcast <16 x half> %1 to <16 x i16> store <16 x i16> %2, <16 x i16>* %a1 diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index cd4b237735f1..25377f267996 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -3333,11 +3333,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 48 ; AVX1-NEXT: popq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 40 ; AVX1-NEXT: popq %r13 +; AVX1-NEXT: .cfi_def_cfa_offset 32 ; AVX1-NEXT: popq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 24 ; AVX1-NEXT: popq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 16 ; AVX1-NEXT: popq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_16i1_to_16i16: @@ -3424,11 +3430,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 48 ; AVX2-NEXT: popq %r12 +; AVX2-NEXT: .cfi_def_cfa_offset 40 ; AVX2-NEXT: popq %r13 +; AVX2-NEXT: .cfi_def_cfa_offset 32 ; AVX2-NEXT: popq %r14 +; AVX2-NEXT: .cfi_def_cfa_offset 24 ; AVX2-NEXT: popq %r15 +; AVX2-NEXT: .cfi_def_cfa_offset 16 ; AVX2-NEXT: popq %rbp +; AVX2-NEXT: .cfi_def_cfa_offset 8 ; AVX2-NEXT: retq 
 ;
 ; AVX512F-LABEL: load_sext_16i1_to_16i16:
@@ -4824,6 +4836,7 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
 ; X32-SSE41-NEXT: movd %xmm0, %eax
 ; X32-SSE41-NEXT: popl %ecx
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
 ; X32-SSE41-NEXT: retl
 entry:
 %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dd329d21dc97..7ef5bee54204 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i
 }
 define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
 %ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cf1aaca4ee20..56567c7e794e 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1053,8 +1053,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_3254:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
 ; AVX512VL-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
 ret <4 x i64> %shuffle
@@ -1075,8 +1075,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_3276:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
 ; AVX512VL-NEXT: retq
 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
 ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b95e7cf008aa..e4234c058453 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1789,21 +1789,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
 }
 define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_7654fedc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_7654fedc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
 ret <8 x i32> %shuffle
 }
 define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_fedc7654:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_fedc7654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
 ret <8 x i32> %shuffle
 }
@@ -2177,10 +2189,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
 }
 define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
-; ALL-LABEL: concat_v8i32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
 %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 %bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6c9805597215..1d17ef109d26 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F-LABEL: shuffle_v8i64_01014545:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_01014545:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: retl
+
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
 ; AVX512F-32-NEXT: retl
+ %a = load <8 x i64>, <8 x i64>* %ptr
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
 ret <8 x i64> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index efbe5586747f..b107b60cd6d2 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -619,6 +619,7 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
 ; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
 ; KNL32-NEXT: movl %ebp, %esp
 ; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
 ; KNL32-NEXT: retl
 entry:
 %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
@@ -659,6 +660,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
 ; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15]
 ; KNL32-NEXT: movl %ebp, %esp
 ; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
 ; KNL32-NEXT: retl
 entry:
 %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 8d057290085c..0e690347a543 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -630,6 +630,7 @@ define i64 @shuf64i1_zero(i64 %a) {
 ; AVX512F-NEXT: orq %rcx, %rax
 ; AVX512F-NEXT: movq %rbp, %rsp
 ; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -662,6 +663,7 @@ define i64 @shuf64i1_zero(i64 %a) {
 ; AVX512VL-NEXT: orq %rcx, %rax
 ; AVX512VL-NEXT: movq %rbp, %rsp
 ; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index dc08d88074d2..ac1083ad4478 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc16i32_16i16_lshr:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -947,28 +944,52 @@ entry:
 }
 define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_lshr:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psrld $24, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psrld $24, %xmm3
-; SSE-NEXT: psrld $24, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: trunc16i32_16i8_lshr:
 ; AVX1: # BB#0: # %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
 ; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT: vzeroupper
@@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc16i32_16i8_lshr:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqu %xmm0, (%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index 97460b36a749..9bd53c6fbd35 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -105,10 +105,13 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK-NEXT: # BB#1: # %bb1
 ; CHECK-NEXT: movl $1, %eax
 ; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
 ; CHECK-NEXT: retl
 ; CHECK-NEXT: .LBB4_2: # %bb2
+; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: movl $2, %eax
 ; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
 ; CHECK-NEXT: retl
 entry:
 %cmp = icmp slt i128 %a, %b
diff --git a/test/CodeGen/X86/x86-framelowering-trap.ll b/test/CodeGen/X86/x86-framelowering-trap.ll
index f1590abcae8b..89f4528fb06d 100644
--- a/test/CodeGen/X86/x86-framelowering-trap.ll
+++ b/test/CodeGen/X86/x86-framelowering-trap.ll
@@ -6,6 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: pushq
 ; CHECK: ud2
 ; CHECK-NEXT: popq
+; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
 define void @bar() {
 entry:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index acad9f771fc7..bc6a6ea205c1 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1816,6 +1816,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
 ; AVX1-NEXT: vmovaps %ymm8, (%rdi)
 ; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 763d764698dd..929dafbfc21d 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -20,6 +20,7 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
 ; CHECK-NEXT: movl $4, %eax
 ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
 ; CHECK-NEXT: retq
 call void asm sideeffect "", "~{rax},~{rdx},~{xmm1},~{rdi},~{rsi},~{xmm0}"()
 ret i32 4