diff options
author | Stefan Pintilie <stefanp@ca.ibm.com> | 2021-11-15 15:26:30 -0600 |
---|---|---|
committer | Stefan Pintilie <stefanp@ca.ibm.com> | 2021-11-19 15:03:01 -0600 |
commit | e9d12c248013b2d2b9880436727857e0ec8a7085 (patch) | |
tree | 15acf94ea645815639667645f520b51efe611bd8 | |
parent | e059329b835aac1b93d764811b23c3cfd8b856c7 (diff) | |
download | llvm-e9d12c248013b2d2b9880436727857e0ec8a7085.tar.gz |
[PowerPC][NFC] Add a series of codegen tests for vector reductions.
This patch only adds tests for PowerPC. The purpose of these tests
is to track what code is generated for various vector reductions.
Reviewed By: nemanjai, #powerpc
Differential Revision: https://reviews.llvm.org/D113801
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-add.ll | 1808 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-and.ll | 390 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll | 4247 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll | 1169 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll | 1169 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll | 1717 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll | 204 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-or.ll | 392 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll | 796 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll | 796 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll | 796 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll | 796 | ||||
-rw-r--r-- | llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll | 392 |
13 files changed, 14672 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll new file mode 100644 index 000000000000..2ba113f59da3 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll @@ -0,0 +1,1808 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of i8 +;; +define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) + ret i8 %0 +} + +define dso_local signext i8 @v16i8_sign(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8_sign: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: extsb r3, r3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8_sign: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: extsb r3, r3 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8_sign: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: extsb r3, r3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8_sign: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: extsb r3, r3 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) + ret i8 %0 +} + +define dso_local zeroext i8 @v16i8_zero(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8_zero: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: clrldi r3, r3, 56 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8_zero: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: clrldi r3, r3, 56 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8_zero: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: clrldi r3, r3, 56 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8_zero: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: clrldi r3, r3, 56 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v32i8(<32 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vaddubm v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vaddubm v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vaddubm v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vaddubm v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a) + ret i8 %0 +} + +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) #0 +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) #0 +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) #0 +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #0 +declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) #0 + +;; +;; Vectors of i16 +;; +define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) + ret i16 %0 +} + +define dso_local zeroext i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: clrldi r3, r3, 48 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: clrldi r3, r3, 48 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: clrldi r3, r3, 48 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: clrldi r3, r3, 48 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a) + ret i16 %0 +} + +define dso_local signext i16 @v16i8tov16i16_sign(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i16_sign: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmrghb v3, v2, v2 +; PWR9LE-NEXT: vspltish v4, 8 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmrglb v2, v2, v2 +; PWR9LE-NEXT: vslh v3, v3, v4 +; PWR9LE-NEXT: vslh v2, v2, v4 +; PWR9LE-NEXT: vsrah v3, v3, v4 +; PWR9LE-NEXT: vsrah v2, v2, v4 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: extsh r3, r3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i16_sign: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmrglb v3, v2, v2 +; PWR9BE-NEXT: vspltish v4, 8 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmrghb v2, v2, v2 +; PWR9BE-NEXT: vslh v3, v3, v4 +; PWR9BE-NEXT: vslh v2, v2, v4 +; PWR9BE-NEXT: vsrah v3, v3, v4 +; PWR9BE-NEXT: vsrah v2, v2, v4 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: extsh r3, r3 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i16_sign: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmrghb v3, v2, v2 +; PWR10LE-NEXT: xxspltiw v4, 524296 +; PWR10LE-NEXT: vmrglb v2, v2, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vslh v3, v3, v4 +; PWR10LE-NEXT: vslh v2, v2, v4 +; PWR10LE-NEXT: vsrah v3, v3, v4 +; PWR10LE-NEXT: vsrah v2, v2, v4 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: extsh r3, r3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i16_sign: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmrglb v3, v2, v2 +; PWR10BE-NEXT: xxspltiw v4, 524296 +; PWR10BE-NEXT: vmrghb v2, v2, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vslh v3, v3, v4 +; PWR10BE-NEXT: vslh v2, v2, v4 +; PWR10BE-NEXT: vsrah v3, v3, v4 +; PWR10BE-NEXT: vsrah v2, v2, v4 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: extsh r3, r3 +; PWR10BE-NEXT: blr +entry: + %0 = sext <16 x i8> %a to <16 x i16> + %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0) + ret i16 %1 +} + +define dso_local zeroext i16 @v16i8tov16i16_zero(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i16_zero: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor v3, v3, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmrghb v4, v3, v2 +; PWR9LE-NEXT: vmrglb v2, v3, v2 +; PWR9LE-NEXT: vadduhm v2, v2, v4 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: clrldi r3, r3, 48 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i16_zero: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor v3, v3, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmrglb v4, v3, v2 +; PWR9BE-NEXT: vmrghb v2, v3, v2 +; PWR9BE-NEXT: vadduhm v2, v2, v4 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: clrldi r3, r3, 48 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i16_zero: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor v3, v3, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmrghb v4, v3, v2 +; PWR10LE-NEXT: vmrglb v2, v3, v2 +; PWR10LE-NEXT: vadduhm v2, v2, v4 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: clrldi r3, r3, 48 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i16_zero: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor v3, v3, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmrglb v4, v3, v2 +; PWR10BE-NEXT: vmrghb v2, v3, v2 +; PWR10BE-NEXT: vadduhm v2, v2, v4 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: clrldi r3, r3, 48 +; PWR10BE-NEXT: blr +entry: + %0 = zext <16 x i8> %a to <16 x i16> + %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0) + ret i16 %1 +} + +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) #0 +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) #0 +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #0 +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #0 + +;; +;; Vectors of i32 +;; +define dso_local zeroext i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local zeroext i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local zeroext i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local zeroext i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vadduwm v3, v3, v5 +; PWR9LE-NEXT: vadduwm v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vadduwm v3, v3, v5 +; PWR9BE-NEXT: vadduwm v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vadduwm v3, v3, v5 +; PWR10LE-NEXT: vadduwm v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vadduwm v3, v3, v5 +; PWR10BE-NEXT: vadduwm v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a) + ret i32 %0 +} + +define dso_local zeroext i32 @v32i32(<32 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vadduwm v4, v4, v8 +; PWR9LE-NEXT: vadduwm v2, v2, v6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vadduwm v5, v5, v9 +; PWR9LE-NEXT: vadduwm v3, v3, v7 +; PWR9LE-NEXT: vadduwm v3, v3, v5 +; PWR9LE-NEXT: vadduwm v2, v2, v4 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vadduwm v4, v4, v8 +; PWR9BE-NEXT: vadduwm v2, v2, v6 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vadduwm v5, v5, v9 +; PWR9BE-NEXT: vadduwm v3, v3, v7 +; PWR9BE-NEXT: vadduwm v3, v3, v5 +; PWR9BE-NEXT: vadduwm v2, v2, v4 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vadduwm v4, v4, v8 +; PWR10LE-NEXT: vadduwm v2, v2, v6 +; PWR10LE-NEXT: vadduwm v5, v5, v9 +; PWR10LE-NEXT: vadduwm v3, v3, v7 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vadduwm v3, v3, v5 +; PWR10LE-NEXT: vadduwm v2, v2, v4 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vadduwm v4, v4, v8 +; PWR10BE-NEXT: vadduwm v2, v2, v6 +; PWR10BE-NEXT: vadduwm v5, v5, v9 +; PWR10BE-NEXT: vadduwm v3, v3, v7 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduwm v3, v3, v5 +; PWR10BE-NEXT: vadduwm v2, v2, v4 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %a) + ret i32 %0 +} + +define dso_local signext i32 @v16i8tov16i32_sign(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i32_sign: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI17_0@toc@l +; PWR9LE-NEXT: lxv v3, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI17_1@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI17_1@toc@l +; PWR9LE-NEXT: lxv v4, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI17_2@toc@ha +; PWR9LE-NEXT: vperm v3, v2, v2, v3 +; PWR9LE-NEXT: addi r3, r3, .LCPI17_2@toc@l +; PWR9LE-NEXT: lxv v5, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI17_3@toc@ha +; PWR9LE-NEXT: vextsb2w v3, v3 +; PWR9LE-NEXT: vperm v4, v2, v2, v4 +; PWR9LE-NEXT: addi r3, r3, .LCPI17_3@toc@l +; PWR9LE-NEXT: lxv v0, 0(r3) +; PWR9LE-NEXT: vextsb2w v4, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vperm v5, v2, v2, v5 +; PWR9LE-NEXT: vadduwm v3, v4, v3 +; PWR9LE-NEXT: vextsb2w v5, v5 +; PWR9LE-NEXT: vperm v2, v2, v2, v0 +; PWR9LE-NEXT: vextsb2w v2, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v5 +; PWR9LE-NEXT: vadduwm v2, v3, v2 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: extsw r3, r3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i32_sign: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI17_0@toc@l +; PWR9BE-NEXT: lxv v3, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI17_1@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI17_1@toc@l +; PWR9BE-NEXT: lxv v4, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI17_2@toc@ha +; PWR9BE-NEXT: vperm v3, v2, v2, v3 +; PWR9BE-NEXT: addi r3, r3, .LCPI17_2@toc@l +; PWR9BE-NEXT: lxv v5, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI17_3@toc@ha +; PWR9BE-NEXT: vextsb2w v3, v3 +; PWR9BE-NEXT: vperm v4, v2, v2, v4 +; PWR9BE-NEXT: addi r3, r3, .LCPI17_3@toc@l +; PWR9BE-NEXT: lxv v0, 0(r3) +; PWR9BE-NEXT: vextsb2w v4, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vperm v5, v2, v2, v5 +; PWR9BE-NEXT: vadduwm v3, v4, v3 +; PWR9BE-NEXT: vextsb2w v5, v5 +; PWR9BE-NEXT: vperm v2, v2, v2, v0 +; PWR9BE-NEXT: vextsb2w v2, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v5 +; PWR9BE-NEXT: vadduwm v2, v3, v2 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: extsw r3, r3 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i32_sign: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: plxv v3, .LCPI17_0@PCREL(0), 1 +; PWR10LE-NEXT: plxv v4, .LCPI17_1@PCREL(0), 1 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vperm v3, v2, v2, v3 +; PWR10LE-NEXT: plxv v5, .LCPI17_2@PCREL(0), 1 +; PWR10LE-NEXT: plxv v0, .LCPI17_3@PCREL(0), 1 +; PWR10LE-NEXT: vperm v4, v2, v2, v4 +; PWR10LE-NEXT: vperm v5, v2, v2, v5 +; PWR10LE-NEXT: vperm v2, v2, v2, v0 +; PWR10LE-NEXT: vextsb2w v3, v3 +; PWR10LE-NEXT: vextsb2w v4, v4 +; PWR10LE-NEXT: vextsb2w v5, v5 +; PWR10LE-NEXT: vextsb2w v2, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v5 +; PWR10LE-NEXT: vadduwm v3, v4, v3 +; PWR10LE-NEXT: vadduwm v2, v3, v2 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: extsw r3, r3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i32_sign: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI17_0@toc@l +; PWR10BE-NEXT: lxv v3, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI17_1@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI17_1@toc@l +; PWR10BE-NEXT: lxv v4, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI17_2@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI17_2@toc@l +; PWR10BE-NEXT: vperm v3, v2, v2, v3 +; PWR10BE-NEXT: lxv v5, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI17_3@toc@ha +; PWR10BE-NEXT: vextsb2w v3, v3 +; PWR10BE-NEXT: addi r3, r3, .LCPI17_3@toc@l +; PWR10BE-NEXT: vperm v4, v2, v2, v4 +; PWR10BE-NEXT: lxv v0, 0(r3) +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vextsb2w v4, v4 +; PWR10BE-NEXT: vperm v5, v2, v2, v5 +; PWR10BE-NEXT: vadduwm v3, v4, v3 +; PWR10BE-NEXT: vextsb2w v5, v5 +; PWR10BE-NEXT: vperm v2, v2, v2, v0 +; PWR10BE-NEXT: vextsb2w v2, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v5 +; PWR10BE-NEXT: vadduwm v2, v3, v2 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: extsw r3, r3 +; PWR10BE-NEXT: blr +entry: + %0 = sext <16 x i8> %a to <16 x i32> + %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) + ret i32 %1 +} + +define dso_local zeroext i32 @v16i8tov16i32_zero(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i32_zero: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; PWR9LE-NEXT: xxlxor v4, v4, v4 +; PWR9LE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; PWR9LE-NEXT: lxv v3, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI18_1@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI18_1@toc@l +; PWR9LE-NEXT: lxv v5, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI18_2@toc@ha +; PWR9LE-NEXT: vperm v3, v4, v2, v3 +; PWR9LE-NEXT: addi r3, r3, .LCPI18_2@toc@l +; PWR9LE-NEXT: lxv v0, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI18_3@toc@ha +; PWR9LE-NEXT: vperm v5, v4, v2, v5 +; PWR9LE-NEXT: addi r3, r3, .LCPI18_3@toc@l +; PWR9LE-NEXT: lxv v1, 0(r3) +; PWR9LE-NEXT: vadduwm v3, v5, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vperm v0, v4, v2, v0 +; PWR9LE-NEXT: vperm v2, v4, v2, v1 +; PWR9LE-NEXT: vadduwm v2, v2, v0 +; PWR9LE-NEXT: vadduwm v2, v3, v2 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i32_zero: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; PWR9BE-NEXT: xxlxor v4, v4, v4 +; PWR9BE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; PWR9BE-NEXT: lxv v3, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI18_1@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI18_1@toc@l +; PWR9BE-NEXT: lxv v5, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI18_2@toc@ha +; PWR9BE-NEXT: vperm v3, v4, v2, v3 +; PWR9BE-NEXT: addi r3, r3, .LCPI18_2@toc@l +; PWR9BE-NEXT: lxv v0, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI18_3@toc@ha +; PWR9BE-NEXT: vperm v5, v4, v2, v5 +; PWR9BE-NEXT: addi r3, r3, .LCPI18_3@toc@l +; PWR9BE-NEXT: lxv v1, 0(r3) +; PWR9BE-NEXT: vadduwm v3, v5, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vperm v0, v4, v2, v0 +; PWR9BE-NEXT: vperm v2, v4, v2, v1 +; PWR9BE-NEXT: vadduwm v2, v2, v0 +; PWR9BE-NEXT: vadduwm v2, v3, v2 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i32_zero: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: plxv v3, .LCPI18_0@PCREL(0), 1 +; PWR10LE-NEXT: plxv v5, .LCPI18_1@PCREL(0), 1 +; PWR10LE-NEXT: xxlxor v4, v4, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vperm v3, v4, v2, v3 +; PWR10LE-NEXT: plxv v0, .LCPI18_2@PCREL(0), 1 +; PWR10LE-NEXT: plxv v1, .LCPI18_3@PCREL(0), 1 +; PWR10LE-NEXT: vperm v5, v4, v2, v5 +; PWR10LE-NEXT: vperm v0, v4, v2, v0 +; PWR10LE-NEXT: vperm v2, v4, v2, v1 +; PWR10LE-NEXT: vadduwm v2, v2, v0 +; PWR10LE-NEXT: vadduwm v3, v5, v3 +; PWR10LE-NEXT: vadduwm v2, v3, v2 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i32_zero: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha +; PWR10BE-NEXT: xxlxor v4, v4, v4 +; PWR10BE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; PWR10BE-NEXT: lxv v3, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI18_1@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI18_1@toc@l +; PWR10BE-NEXT: lxv v5, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI18_2@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI18_2@toc@l +; PWR10BE-NEXT: vperm v3, v4, v2, v3 +; PWR10BE-NEXT: lxv v0, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI18_3@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI18_3@toc@l +; PWR10BE-NEXT: vperm v5, v4, v2, v5 +; PWR10BE-NEXT: lxv v1, 0(r3) +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vadduwm v3, v5, v3 +; PWR10BE-NEXT: vperm v0, v4, v2, v0 +; PWR10BE-NEXT: vperm v2, v4, v2, v1 +; PWR10BE-NEXT: vadduwm v2, v2, v0 +; PWR10BE-NEXT: vadduwm v2, v3, v2 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = zext <16 x i8> %a to <16 x i32> + %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) + ret i32 %1 +} + +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #0 +declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) #0 + +;; +;; Vectors of i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vaddudm v3, v3, v5 +; PWR9LE-NEXT: vaddudm v2, v2, v4 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vaddudm v3, v3, v5 +; PWR9BE-NEXT: vaddudm v2, v2, v4 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vaddudm v3, v3, v5 +; PWR10LE-NEXT: vaddudm v2, v2, v4 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vaddudm v3, v3, v5 +; PWR10BE-NEXT: vaddudm v2, v2, v4 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vaddudm v4, v4, v8 +; PWR9LE-NEXT: vaddudm v2, v2, v6 +; PWR9LE-NEXT: vaddudm v5, v5, v9 +; PWR9LE-NEXT: vaddudm v3, v3, v7 +; PWR9LE-NEXT: vaddudm v3, v3, v5 +; PWR9LE-NEXT: vaddudm v2, v2, v4 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vaddudm v4, v4, v8 +; PWR9BE-NEXT: vaddudm v2, v2, v6 +; PWR9BE-NEXT: vaddudm v5, v5, v9 +; PWR9BE-NEXT: vaddudm v3, v3, v7 +; PWR9BE-NEXT: vaddudm v3, v3, v5 +; PWR9BE-NEXT: vaddudm v2, v2, v4 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vaddudm v4, v4, v8 +; PWR10LE-NEXT: vaddudm v5, v5, v9 +; PWR10LE-NEXT: vaddudm v3, v3, v7 +; PWR10LE-NEXT: vaddudm v3, v3, v5 +; PWR10LE-NEXT: vaddudm v2, v2, v6 +; PWR10LE-NEXT: vaddudm v2, v2, v4 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vaddudm v4, v4, v8 +; PWR10BE-NEXT: vaddudm v5, v5, v9 +; PWR10BE-NEXT: vaddudm v3, v3, v7 +; PWR10BE-NEXT: vaddudm v3, v3, v5 +; PWR10BE-NEXT: vaddudm v2, v2, v6 +; PWR10BE-NEXT: vaddudm v2, v2, v4 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i8tov16i64_sign(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i64_sign: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: addis r3, r2, .LCPI23_0@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI23_0@toc@l +; PWR9LE-NEXT: lxv v3, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_1@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI23_1@toc@l +; PWR9LE-NEXT: lxv v4, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_2@toc@ha +; PWR9LE-NEXT: vperm v3, v2, v2, v3 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_2@toc@l +; PWR9LE-NEXT: lxv v5, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_3@toc@ha +; PWR9LE-NEXT: vextsb2d v3, v3 +; PWR9LE-NEXT: vperm v4, v2, v2, v4 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_3@toc@l +; PWR9LE-NEXT: lxv v0, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_4@toc@ha +; PWR9LE-NEXT: vextsb2d v4, v4 +; PWR9LE-NEXT: vperm v5, v2, v2, v5 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_4@toc@l +; PWR9LE-NEXT: vaddudm v3, v4, v3 +; PWR9LE-NEXT: lxv v1, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_5@toc@ha +; PWR9LE-NEXT: vextsb2d v5, v5 +; PWR9LE-NEXT: vperm v0, v2, v2, v0 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_5@toc@l +; PWR9LE-NEXT: lxv v6, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_6@toc@ha +; PWR9LE-NEXT: vperm v1, v2, v2, v1 +; PWR9LE-NEXT: vextsb2d v0, v0 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_6@toc@l +; PWR9LE-NEXT: vaddudm v5, v0, v5 +; PWR9LE-NEXT: lxv v7, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI23_7@toc@ha +; PWR9LE-NEXT: vperm v6, v2, v2, v6 +; PWR9LE-NEXT: vextsb2d v1, v1 +; PWR9LE-NEXT: vaddudm v3, v3, v5 +; PWR9LE-NEXT: addi r3, r3, .LCPI23_7@toc@l +; PWR9LE-NEXT: lxv v8, 0(r3) +; PWR9LE-NEXT: vextsb2d v6, v6 +; PWR9LE-NEXT: vperm v7, v2, v2, v7 +; PWR9LE-NEXT: vaddudm v1, v6, v1 +; PWR9LE-NEXT: vextsb2d v7, v7 +; PWR9LE-NEXT: vperm v2, v2, v2, v8 +; PWR9LE-NEXT: vextsb2d v2, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v7 +; PWR9LE-NEXT: vaddudm v2, v1, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i64_sign: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: addis r3, r2, .LCPI23_0@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI23_0@toc@l +; PWR9BE-NEXT: lxv v3, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_1@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI23_1@toc@l +; PWR9BE-NEXT: lxv v4, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_2@toc@ha +; PWR9BE-NEXT: vperm v3, v2, v2, v3 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_2@toc@l +; PWR9BE-NEXT: lxv v5, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_3@toc@ha +; PWR9BE-NEXT: vextsb2d v3, v3 +; PWR9BE-NEXT: vperm v4, v2, v2, v4 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_3@toc@l +; PWR9BE-NEXT: lxv v0, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_4@toc@ha +; PWR9BE-NEXT: vextsb2d v4, v4 +; PWR9BE-NEXT: vperm v5, v2, v2, v5 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_4@toc@l +; PWR9BE-NEXT: vaddudm v3, v4, v3 +; PWR9BE-NEXT: lxv v1, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_5@toc@ha +; PWR9BE-NEXT: vextsb2d v5, v5 +; PWR9BE-NEXT: vperm v0, v2, v2, v0 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_5@toc@l +; PWR9BE-NEXT: lxv v6, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_6@toc@ha +; PWR9BE-NEXT: vperm v1, v2, v2, v1 +; PWR9BE-NEXT: vextsb2d v0, v0 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_6@toc@l +; PWR9BE-NEXT: vaddudm v5, v0, v5 +; PWR9BE-NEXT: lxv v7, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI23_7@toc@ha +; PWR9BE-NEXT: vperm v6, v2, v2, v6 +; PWR9BE-NEXT: vextsb2d v1, v1 +; PWR9BE-NEXT: vaddudm v3, v3, v5 +; PWR9BE-NEXT: addi r3, r3, .LCPI23_7@toc@l +; PWR9BE-NEXT: lxv v8, 0(r3) +; PWR9BE-NEXT: vextsb2d v6, v6 +; PWR9BE-NEXT: vperm v7, v2, v2, v7 +; PWR9BE-NEXT: vaddudm v1, v6, v1 +; PWR9BE-NEXT: vextsb2d v7, v7 +; PWR9BE-NEXT: vperm v2, v2, v2, v8 +; PWR9BE-NEXT: vextsb2d v2, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v7 +; PWR9BE-NEXT: vaddudm v2, v1, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i64_sign: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: plxv v3, .LCPI23_0@PCREL(0), 1 +; PWR10LE-NEXT: plxv v4, .LCPI23_1@PCREL(0), 1 +; PWR10LE-NEXT: vperm v3, v2, v2, v3 +; PWR10LE-NEXT: plxv v5, .LCPI23_2@PCREL(0), 1 +; PWR10LE-NEXT: plxv v0, .LCPI23_3@PCREL(0), 1 +; PWR10LE-NEXT: plxv v1, .LCPI23_4@PCREL(0), 1 +; PWR10LE-NEXT: plxv v6, .LCPI23_5@PCREL(0), 1 +; PWR10LE-NEXT: plxv v7, .LCPI23_6@PCREL(0), 1 +; PWR10LE-NEXT: plxv v8, .LCPI23_7@PCREL(0), 1 +; PWR10LE-NEXT: vperm v4, v2, v2, v4 +; PWR10LE-NEXT: vperm v5, v2, v2, v5 +; PWR10LE-NEXT: vperm v0, v2, v2, v0 +; PWR10LE-NEXT: vperm v1, v2, v2, v1 +; PWR10LE-NEXT: vperm v6, v2, v2, v6 +; PWR10LE-NEXT: vperm v7, v2, v2, v7 +; PWR10LE-NEXT: vperm v2, v2, v2, v8 +; PWR10LE-NEXT: vextsb2d v5, v5 +; PWR10LE-NEXT: vextsb2d v0, v0 +; PWR10LE-NEXT: vextsb2d v7, v7 +; PWR10LE-NEXT: vextsb2d v2, v2 +; PWR10LE-NEXT: vextsb2d v3, v3 +; PWR10LE-NEXT: vextsb2d v4, v4 +; PWR10LE-NEXT: vextsb2d v1, v1 +; PWR10LE-NEXT: vextsb2d v6, v6 +; PWR10LE-NEXT: vaddudm v2, v2, v7 +; PWR10LE-NEXT: vaddudm v5, v0, v5 +; PWR10LE-NEXT: vaddudm v3, v4, v3 +; PWR10LE-NEXT: vaddudm v3, v3, v5 +; PWR10LE-NEXT: vaddudm v4, v6, v1 +; PWR10LE-NEXT: vaddudm v2, v4, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i64_sign: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: addis r3, r2, .LCPI23_0@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI23_0@toc@l +; PWR10BE-NEXT: lxv v3, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_1@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI23_1@toc@l +; PWR10BE-NEXT: lxv v4, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_2@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI23_2@toc@l +; PWR10BE-NEXT: vperm v3, v2, v2, v3 +; PWR10BE-NEXT: lxv v5, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_3@toc@ha +; PWR10BE-NEXT: vextsb2d v3, v3 +; PWR10BE-NEXT: addi r3, r3, .LCPI23_3@toc@l +; PWR10BE-NEXT: vperm v4, v2, v2, v4 +; PWR10BE-NEXT: lxv v0, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_4@toc@ha +; PWR10BE-NEXT: vextsb2d v4, v4 +; PWR10BE-NEXT: addi r3, r3, .LCPI23_4@toc@l +; PWR10BE-NEXT: vperm v5, v2, v2, v5 +; PWR10BE-NEXT: lxv v1, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_5@toc@ha +; PWR10BE-NEXT: vextsb2d v5, v5 +; PWR10BE-NEXT: addi r3, r3, .LCPI23_5@toc@l +; PWR10BE-NEXT: vperm v0, v2, v2, v0 +; PWR10BE-NEXT: lxv v6, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_6@toc@ha +; PWR10BE-NEXT: vextsb2d v0, v0 +; PWR10BE-NEXT: addi r3, r3, .LCPI23_6@toc@l +; PWR10BE-NEXT: vperm v1, v2, v2, v1 +; PWR10BE-NEXT: vaddudm v5, v0, v5 +; PWR10BE-NEXT: vaddudm v3, v4, v3 +; PWR10BE-NEXT: vaddudm v3, v3, v5 +; PWR10BE-NEXT: lxv v7, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI23_7@toc@ha +; PWR10BE-NEXT: vextsb2d v1, v1 +; PWR10BE-NEXT: addi r3, r3, .LCPI23_7@toc@l +; PWR10BE-NEXT: vperm v6, v2, v2, v6 +; PWR10BE-NEXT: lxv v8, 0(r3) +; PWR10BE-NEXT: vextsb2d v6, v6 +; PWR10BE-NEXT: vperm v7, v2, v2, v7 +; PWR10BE-NEXT: vextsb2d v7, v7 +; PWR10BE-NEXT: vperm v2, v2, v2, v8 +; PWR10BE-NEXT: vextsb2d v2, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v7 +; PWR10BE-NEXT: vaddudm v4, v6, v1 +; PWR10BE-NEXT: vaddudm v2, v4, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = sext <16 x i8> %a to <16 x i64> + %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %0) + ret i64 %1 +} + +define dso_local i64 @v16i8tov16i64_zero(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8tov16i64_zero: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; PWR9LE-NEXT: xxlxor v4, v4, v4 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_0@toc@l +; PWR9LE-NEXT: lxv v3, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_1@toc@ha +; PWR9LE-NEXT: addi r3, r3, .LCPI24_1@toc@l +; PWR9LE-NEXT: lxv v5, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_2@toc@ha +; PWR9LE-NEXT: vperm v3, v4, v2, v3 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_2@toc@l +; PWR9LE-NEXT: lxv v0, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_3@toc@ha +; PWR9LE-NEXT: vperm v5, v4, v2, v5 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_3@toc@l +; PWR9LE-NEXT: lxv v1, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_4@toc@ha +; PWR9LE-NEXT: vaddudm v3, v5, v3 +; PWR9LE-NEXT: vperm v0, v4, v2, v0 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_4@toc@l +; PWR9LE-NEXT: lxv v6, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_5@toc@ha +; PWR9LE-NEXT: vperm v1, v4, v2, v1 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_5@toc@l +; PWR9LE-NEXT: lxv v7, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_6@toc@ha +; PWR9LE-NEXT: vaddudm v0, v1, v0 +; PWR9LE-NEXT: vperm v6, v4, v2, v6 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_6@toc@l +; PWR9LE-NEXT: lxv v8, 0(r3) +; PWR9LE-NEXT: addis r3, r2, .LCPI24_7@toc@ha +; PWR9LE-NEXT: vaddudm v3, v3, v0 +; PWR9LE-NEXT: vperm v7, v4, v2, v7 +; PWR9LE-NEXT: addi r3, r3, .LCPI24_7@toc@l +; PWR9LE-NEXT: lxv v9, 0(r3) +; PWR9LE-NEXT: vperm v8, v4, v2, v8 +; PWR9LE-NEXT: vperm v2, v4, v2, v9 +; PWR9LE-NEXT: vaddudm v4, v7, v6 +; PWR9LE-NEXT: vaddudm v2, v2, v8 +; PWR9LE-NEXT: vaddudm v2, v4, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vaddudm v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8tov16i64_zero: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; PWR9BE-NEXT: xxlxor v4, v4, v4 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_0@toc@l +; PWR9BE-NEXT: lxv v3, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_1@toc@ha +; PWR9BE-NEXT: addi r3, r3, .LCPI24_1@toc@l +; PWR9BE-NEXT: lxv v5, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_2@toc@ha +; PWR9BE-NEXT: vperm v3, v4, v2, v3 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_2@toc@l +; PWR9BE-NEXT: lxv v0, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_3@toc@ha +; PWR9BE-NEXT: vperm v5, v4, v2, v5 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_3@toc@l +; PWR9BE-NEXT: lxv v1, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_4@toc@ha +; PWR9BE-NEXT: vaddudm v3, v5, v3 +; PWR9BE-NEXT: vperm v0, v4, v2, v0 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_4@toc@l +; PWR9BE-NEXT: lxv v6, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_5@toc@ha +; PWR9BE-NEXT: vperm v1, v4, v2, v1 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_5@toc@l +; PWR9BE-NEXT: lxv v7, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_6@toc@ha +; PWR9BE-NEXT: vaddudm v0, v1, v0 +; PWR9BE-NEXT: vperm v6, v4, v2, v6 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_6@toc@l +; PWR9BE-NEXT: lxv v8, 0(r3) +; PWR9BE-NEXT: addis r3, r2, .LCPI24_7@toc@ha +; PWR9BE-NEXT: vaddudm v3, v3, v0 +; PWR9BE-NEXT: vperm v7, v4, v2, v7 +; PWR9BE-NEXT: addi r3, r3, .LCPI24_7@toc@l +; PWR9BE-NEXT: lxv v9, 0(r3) +; PWR9BE-NEXT: vperm v8, v4, v2, v8 +; PWR9BE-NEXT: vperm v2, v4, v2, v9 +; PWR9BE-NEXT: vaddudm v4, v7, v6 +; PWR9BE-NEXT: vaddudm v2, v2, v8 +; PWR9BE-NEXT: vaddudm v2, v4, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vaddudm v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8tov16i64_zero: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: plxv v3, .LCPI24_0@PCREL(0), 1 +; PWR10LE-NEXT: plxv v5, .LCPI24_1@PCREL(0), 1 +; PWR10LE-NEXT: xxlxor v4, v4, v4 +; PWR10LE-NEXT: vperm v3, v4, v2, v3 +; PWR10LE-NEXT: plxv v0, .LCPI24_2@PCREL(0), 1 +; PWR10LE-NEXT: plxv v1, .LCPI24_3@PCREL(0), 1 +; PWR10LE-NEXT: plxv v6, .LCPI24_4@PCREL(0), 1 +; PWR10LE-NEXT: plxv v7, .LCPI24_5@PCREL(0), 1 +; PWR10LE-NEXT: plxv v8, .LCPI24_6@PCREL(0), 1 +; PWR10LE-NEXT: plxv v9, .LCPI24_7@PCREL(0), 1 +; PWR10LE-NEXT: vperm v5, v4, v2, v5 +; PWR10LE-NEXT: vperm v0, v4, v2, v0 +; PWR10LE-NEXT: vperm v1, v4, v2, v1 +; PWR10LE-NEXT: vperm v6, v4, v2, v6 +; PWR10LE-NEXT: vperm v7, v4, v2, v7 +; PWR10LE-NEXT: vperm v8, v4, v2, v8 +; PWR10LE-NEXT: vperm v2, v4, v2, v9 +; PWR10LE-NEXT: vaddudm v2, v2, v8 +; PWR10LE-NEXT: vaddudm v4, v1, v0 +; PWR10LE-NEXT: vaddudm v3, v5, v3 +; PWR10LE-NEXT: vaddudm v3, v3, v4 +; PWR10LE-NEXT: vaddudm v4, v7, v6 +; PWR10LE-NEXT: vaddudm v2, v4, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vaddudm v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8tov16i64_zero: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; PWR10BE-NEXT: xxlxor v4, v4, v4 +; PWR10BE-NEXT: addi r3, r3, .LCPI24_0@toc@l +; PWR10BE-NEXT: lxv v3, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_1@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_1@toc@l +; PWR10BE-NEXT: lxv v5, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_2@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_2@toc@l +; PWR10BE-NEXT: vperm v3, v4, v2, v3 +; PWR10BE-NEXT: lxv v0, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_3@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_3@toc@l +; PWR10BE-NEXT: vperm v5, v4, v2, v5 +; PWR10BE-NEXT: lxv v1, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_4@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_4@toc@l +; PWR10BE-NEXT: vperm v0, v4, v2, v0 +; PWR10BE-NEXT: lxv v6, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_5@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_5@toc@l +; PWR10BE-NEXT: vperm v1, v4, v2, v1 +; PWR10BE-NEXT: lxv v7, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_6@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_6@toc@l +; PWR10BE-NEXT: vperm v6, v4, v2, v6 +; PWR10BE-NEXT: lxv v8, 0(r3) +; PWR10BE-NEXT: addis r3, r2, .LCPI24_7@toc@ha +; PWR10BE-NEXT: addi r3, r3, .LCPI24_7@toc@l +; PWR10BE-NEXT: vperm v7, v4, v2, v7 +; PWR10BE-NEXT: lxv v9, 0(r3) +; PWR10BE-NEXT: vperm v8, v4, v2, v8 +; PWR10BE-NEXT: vperm v2, v4, v2, v9 +; PWR10BE-NEXT: vaddudm v4, v1, v0 +; PWR10BE-NEXT: vaddudm v3, v5, v3 +; PWR10BE-NEXT: vaddudm v3, v3, v4 +; PWR10BE-NEXT: vaddudm v2, v2, v8 +; PWR10BE-NEXT: vaddudm v4, v7, v6 +; PWR10BE-NEXT: vaddudm v2, v4, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vaddudm v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = zext <16 x i8> %a to <16 x i64> + %1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %0) + ret i64 %1 +} + +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll new file mode 100644 index 000000000000..ce872e272019 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-and.ll @@ -0,0 +1,390 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxland v2, v2, vs0 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxland v2, v2, vs0 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxland v2, v2, vs0 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxland v2, v2, vs0 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxland vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxland v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxland vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxland v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxland vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs0, vs0, 2 +; PWR10LE-NEXT: xxeval v2, v2, v3, vs0, 1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxland vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs0, vs0, 1 +; PWR10BE-NEXT: xxeval v2, v2, v3, vs0, 1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxland vs0, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxland v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxland vs0, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxland v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxland vs0, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v4, vs0 +; PWR10LE-NEXT: xxeval vs1, v2, v3, v4, 1 +; PWR10LE-NEXT: xxspltw vs1, vs1, 2 +; PWR10LE-NEXT: xxeval v2, vs0, v4, vs1, 1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxland vs0, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v4, vs0 +; PWR10BE-NEXT: xxeval vs1, v2, v3, v4, 1 +; PWR10BE-NEXT: xxspltw vs1, vs1, 1 +; PWR10BE-NEXT: xxeval v2, vs0, v4, vs1, 1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxland vs0, v3, v5 +; PWR9LE-NEXT: xxland vs1, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxland vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxland v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxland vs0, v3, v5 +; PWR9BE-NEXT: xxland vs1, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxland vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxland v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxland vs1, v2, v4 +; PWR10LE-NEXT: xxland vs0, v3, v5 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxeval vs2, vs1, v3, v5, 1 +; PWR10LE-NEXT: xxswapd v2, vs2 +; PWR10LE-NEXT: xxeval vs0, vs1, vs0, v2, 1 +; PWR10LE-NEXT: xxspltw vs0, vs0, 2 +; PWR10LE-NEXT: xxeval v2, vs2, v2, vs0, 1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxland vs1, v2, v4 +; PWR10BE-NEXT: xxland vs0, v3, v5 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxeval vs2, vs1, v3, v5, 1 +; PWR10BE-NEXT: xxswapd v2, vs2 +; PWR10BE-NEXT: xxeval vs0, vs1, vs0, v2, 1 +; PWR10BE-NEXT: xxspltw vs0, vs0, 1 +; PWR10BE-NEXT: xxeval v2, vs2, v2, vs0, 1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xxland vs0, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xxland vs0, v2, v3 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xxland vs0, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xxland vs0, v2, v3 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxland vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxland vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxland vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v4, vs0 +; PWR10LE-NEXT: xxeval vs0, v2, v3, v4, 1 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxland vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v4, vs0 +; PWR10BE-NEXT: xxeval vs0, v2, v3, v4, 1 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxland vs0, v3, v5 +; PWR9LE-NEXT: xxland vs1, v2, v4 +; PWR9LE-NEXT: xxland vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxland vs0, v3, v5 +; PWR9BE-NEXT: xxland vs1, v2, v4 +; PWR9BE-NEXT: xxland vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxland vs1, v2, v4 +; PWR10LE-NEXT: xxland vs0, v3, v5 +; PWR10LE-NEXT: xxeval vs2, vs1, v3, v5, 1 +; PWR10LE-NEXT: xxswapd v2, vs2 +; PWR10LE-NEXT: xxeval vs0, vs1, vs0, v2, 1 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxland vs1, v2, v4 +; PWR10BE-NEXT: xxland vs0, v3, v5 +; PWR10BE-NEXT: xxeval vs2, vs1, v3, v5, 1 +; PWR10BE-NEXT: xxswapd v2, vs2 +; PWR10BE-NEXT: xxeval vs0, vs1, vs0, v2, 1 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxland vs0, v4, v8 +; PWR9LE-NEXT: xxland vs1, v2, v6 +; PWR9LE-NEXT: xxland vs2, v5, v9 +; PWR9LE-NEXT: xxland vs3, v3, v7 +; PWR9LE-NEXT: xxland vs2, vs3, vs2 +; PWR9LE-NEXT: xxland vs0, vs1, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxland vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxland vs0, v4, v8 +; PWR9BE-NEXT: xxland vs1, v2, v6 +; PWR9BE-NEXT: xxland vs2, v5, v9 +; PWR9BE-NEXT: xxland vs3, v3, v7 +; PWR9BE-NEXT: xxland vs2, vs3, vs2 +; PWR9BE-NEXT: xxland vs0, vs1, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxland vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxland vs1, v2, v6 +; PWR10LE-NEXT: xxland vs0, v5, v9 +; PWR10LE-NEXT: xxland vs2, v3, v7 +; PWR10LE-NEXT: xxeval vs1, vs1, v4, v8, 1 +; PWR10LE-NEXT: xxeval vs3, vs2, v5, v9, 1 +; PWR10LE-NEXT: xxeval vs0, vs1, vs2, vs0, 1 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxeval vs0, vs1, vs3, v2, 1 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxland vs1, v2, v6 +; PWR10BE-NEXT: xxland vs0, v5, v9 +; PWR10BE-NEXT: xxland vs2, v3, v7 +; PWR10BE-NEXT: xxeval vs1, vs1, v4, v8, 1 +; PWR10BE-NEXT: xxeval vs3, vs2, v5, v9, 1 +; PWR10BE-NEXT: xxeval vs0, vs1, vs2, vs0, 1 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxeval vs0, vs1, vs3, v2, 1 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll new file mode 100644 index 000000000000..20bfcfcccd39 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll @@ -0,0 +1,4247 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mattr=-paired-vector-memops -mcpu=pwr10 -mtriple=powerpc64le < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mattr=-paired-vector-memops -mcpu=pwr10 -mtriple=powerpc64 < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of f32 +;; +define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_b(<2 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsaddsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsaddsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsaddsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsaddsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v2f32(float %b, <2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: xvaddsp vs0, v2, vs0 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: xvaddsp vs0, v2, vs0 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: xvaddsp vs0, v2, vs0 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: xvaddsp vs0, v2, vs0 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %a) + ret float %0 +} + +define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_b(<4 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsaddsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsaddsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsaddsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsaddsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v4f32(float %b, <4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xvaddsp vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xvaddsp vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xvaddsp vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xvaddsp vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %a) + ret float %0 +} + +define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_b(<8 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsaddsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsaddsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsaddsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsaddsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v8f32(float %b, <8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvaddsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvaddsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvaddsp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvaddsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvaddsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvaddsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvaddsp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvaddsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %a) + ret float %0 +} + +define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v4 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v5 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v4 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v5 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v4 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v5 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v4 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v5 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_b(<16 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsaddsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v4 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsaddsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v5 +; PWR9LE-NEXT: xsaddsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsaddsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v4 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v5 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsaddsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsaddsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v4 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsaddsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v5 +; PWR10LE-NEXT: xsaddsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsaddsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v4 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v5 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsaddsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fadd.v16f32(float %b, <16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvaddsp vs0, v3, v5 +; PWR9LE-NEXT: xvaddsp vs1, v2, v4 +; PWR9LE-NEXT: xvaddsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvaddsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvaddsp vs0, v3, v5 +; PWR9BE-NEXT: xvaddsp vs1, v2, v4 +; PWR9BE-NEXT: xvaddsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvaddsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvaddsp vs0, v3, v5 +; PWR10LE-NEXT: xvaddsp vs1, v2, v4 +; PWR10LE-NEXT: xvaddsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvaddsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvaddsp vs0, v3, v5 +; PWR10BE-NEXT: xvaddsp vs1, v2, v4 +; PWR10BE-NEXT: xvaddsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvaddsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvaddsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %a) + ret float %0 +} + +declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0 +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) #0 +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) #0 +declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>) #0 + +;; +;; Vectors of f64 +;; +define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsadddp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xsadddp f1, v2, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsadddp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xsadddp f1, v2, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_b(<2 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsadddp f0, f1, f0 +; PWR9LE-NEXT: xsadddp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsadddp f0, f1, f0 +; PWR10LE-NEXT: xsadddp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v2f64(double %b, <2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xvadddp vs0, v2, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xvadddp vs1, v2, vs0 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xvadddp vs0, v2, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xvadddp vs1, v2, vs0 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a) + ret double %0 +} + +define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, v2, f0 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, v2, f0 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_b(<4 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsadddp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsadddp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v4f64(double %b, <4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvadddp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvadddp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvadddp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvadddp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a) + ret double %0 +} + +define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsadddp f0, f0, v3 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsadddp f0, f0, v4 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v5 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, v2, f0 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsadddp f0, f0, v4 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsadddp f0, f0, v5 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsadddp f0, f0, v3 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsadddp f0, f0, v4 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v5 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, v2, f0 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsadddp f0, f0, v4 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsadddp f0, f0, v5 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_b(<8 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsadddp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsadddp f0, f0, v3 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsadddp f0, f0, v4 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v5 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsadddp f0, f0, v4 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsadddp f0, f0, v5 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsadddp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsadddp f0, f0, v3 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsadddp f0, f0, v4 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v5 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsadddp f0, f0, v4 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsadddp f0, f0, v5 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v8f64(double %b, <8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvadddp vs0, v3, v5 +; PWR9LE-NEXT: xvadddp vs1, v2, v4 +; PWR9LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvadddp vs0, v3, v5 +; PWR9BE-NEXT: xvadddp vs1, v2, v4 +; PWR9BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvadddp vs0, v3, v5 +; PWR10LE-NEXT: xvadddp vs1, v2, v4 +; PWR10LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvadddp vs0, v3, v5 +; PWR10BE-NEXT: xvadddp vs1, v2, v4 +; PWR10BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a) + ret double %0 +} + +define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsadddp f0, f0, v3 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsadddp f0, f0, v4 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v6 +; PWR9LE-NEXT: xsadddp f0, f0, v5 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v7 +; PWR9LE-NEXT: xsadddp f0, f0, v6 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v8 +; PWR9LE-NEXT: xsadddp f0, f0, v7 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v9 +; PWR9LE-NEXT: xsadddp f0, f0, v8 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v9 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, v2, f0 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsadddp f0, f0, v4 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsadddp f0, f0, v5 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v6 +; PWR9BE-NEXT: xsadddp f0, f0, v6 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v7 +; PWR9BE-NEXT: xsadddp f0, f0, v7 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v8 +; PWR9BE-NEXT: xsadddp f0, f0, v8 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v9 +; PWR9BE-NEXT: xsadddp f0, f0, v9 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsadddp f0, f0, v3 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsadddp f0, f0, v4 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v6 +; PWR10LE-NEXT: xsadddp f0, f0, v5 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v7 +; PWR10LE-NEXT: xsadddp f0, f0, v6 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v8 +; PWR10LE-NEXT: xsadddp f0, f0, v7 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v9 +; PWR10LE-NEXT: xsadddp f0, f0, v8 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v9 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, v2, f0 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsadddp f0, f0, v4 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsadddp f0, f0, v5 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v6 +; PWR10BE-NEXT: xsadddp f0, f0, v6 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v7 +; PWR10BE-NEXT: xsadddp f0, f0, v7 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v8 +; PWR10BE-NEXT: xsadddp f0, f0, v8 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v9 +; PWR10BE-NEXT: xsadddp f0, f0, v9 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_b(<16 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsadddp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsadddp f0, f0, v2 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsadddp f0, f0, v3 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsadddp f0, f0, v4 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v6 +; PWR9LE-NEXT: xsadddp f0, f0, v5 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v7 +; PWR9LE-NEXT: xsadddp f0, f0, v6 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v8 +; PWR9LE-NEXT: xsadddp f0, f0, v7 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v9 +; PWR9LE-NEXT: xsadddp f0, f0, v8 +; PWR9LE-NEXT: xsadddp f0, f0, f1 +; PWR9LE-NEXT: xsadddp f1, f0, v9 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsadddp f0, f0, v3 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsadddp f0, f0, v4 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsadddp f0, f0, v5 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v6 +; PWR9BE-NEXT: xsadddp f0, f0, v6 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v7 +; PWR9BE-NEXT: xsadddp f0, f0, v7 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v8 +; PWR9BE-NEXT: xsadddp f0, f0, v8 +; PWR9BE-NEXT: xsadddp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v9 +; PWR9BE-NEXT: xsadddp f0, f0, v9 +; PWR9BE-NEXT: xsadddp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsadddp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsadddp f0, f0, v2 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsadddp f0, f0, v3 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsadddp f0, f0, v4 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v6 +; PWR10LE-NEXT: xsadddp f0, f0, v5 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v7 +; PWR10LE-NEXT: xsadddp f0, f0, v6 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v8 +; PWR10LE-NEXT: xsadddp f0, f0, v7 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v9 +; PWR10LE-NEXT: xsadddp f0, f0, v8 +; PWR10LE-NEXT: xsadddp f0, f0, f1 +; PWR10LE-NEXT: xsadddp f1, f0, v9 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsadddp f0, f0, v3 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsadddp f0, f0, v4 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsadddp f0, f0, v5 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v6 +; PWR10BE-NEXT: xsadddp f0, f0, v6 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v7 +; PWR10BE-NEXT: xsadddp f0, f0, v7 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v8 +; PWR10BE-NEXT: xsadddp f0, f0, v8 +; PWR10BE-NEXT: xsadddp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v9 +; PWR10BE-NEXT: xsadddp f0, f0, v9 +; PWR10BE-NEXT: xsadddp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v16f64(double %b, <16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvadddp vs0, v4, v8 +; PWR9LE-NEXT: xvadddp vs1, v2, v6 +; PWR9LE-NEXT: xvadddp vs2, v5, v9 +; PWR9LE-NEXT: xvadddp vs3, v3, v7 +; PWR9LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvadddp vs0, v4, v8 +; PWR9BE-NEXT: xvadddp vs1, v2, v6 +; PWR9BE-NEXT: xvadddp vs2, v5, v9 +; PWR9BE-NEXT: xvadddp vs3, v3, v7 +; PWR9BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvadddp vs0, v4, v8 +; PWR10LE-NEXT: xvadddp vs1, v2, v6 +; PWR10LE-NEXT: xvadddp vs2, v5, v9 +; PWR10LE-NEXT: xvadddp vs3, v3, v7 +; PWR10LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvadddp vs0, v4, v8 +; PWR10BE-NEXT: xvadddp vs1, v2, v6 +; PWR10BE-NEXT: xvadddp vs2, v5, v9 +; PWR10BE-NEXT: xvadddp vs3, v3, v7 +; PWR10BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a) + ret double %0 +} + +define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs4, v2 +; PWR9LE-NEXT: xxswapd vs5, v3 +; PWR9LE-NEXT: lxv vs3, 224(r1) +; PWR9LE-NEXT: lxv vs2, 240(r1) +; PWR9LE-NEXT: lxv vs1, 256(r1) +; PWR9LE-NEXT: lxv vs0, 272(r1) +; PWR9LE-NEXT: xsadddp f4, f4, v2 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v4 +; PWR9LE-NEXT: xsadddp f4, f4, v3 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v5 +; PWR9LE-NEXT: xsadddp f4, f4, v4 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v6 +; PWR9LE-NEXT: xsadddp f4, f4, v5 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v7 +; PWR9LE-NEXT: xsadddp f4, f4, v6 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v8 +; PWR9LE-NEXT: xsadddp f4, f4, v7 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v9 +; PWR9LE-NEXT: xsadddp f4, f4, v8 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v10 +; PWR9LE-NEXT: xsadddp f4, f4, v9 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v11 +; PWR9LE-NEXT: xsadddp f4, f4, v10 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v12 +; PWR9LE-NEXT: xsadddp f4, f4, v11 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, v13 +; PWR9LE-NEXT: xsadddp f4, f4, v12 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xxswapd vs5, vs3 +; PWR9LE-NEXT: xsadddp f4, f4, v13 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xsadddp f3, f4, f3 +; PWR9LE-NEXT: xxswapd vs4, vs2 +; PWR9LE-NEXT: xsadddp f3, f3, f4 +; PWR9LE-NEXT: xsadddp f2, f3, f2 +; PWR9LE-NEXT: xxswapd vs3, vs1 +; PWR9LE-NEXT: xsadddp f2, f2, f3 +; PWR9LE-NEXT: xsadddp f1, f2, f1 +; PWR9LE-NEXT: xxswapd vs2, vs0 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xsadddp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs4, v2 +; PWR9BE-NEXT: xxswapd vs5, v3 +; PWR9BE-NEXT: lxv vs3, 240(r1) +; PWR9BE-NEXT: lxv vs2, 256(r1) +; PWR9BE-NEXT: lxv vs1, 272(r1) +; PWR9BE-NEXT: lxv vs0, 288(r1) +; PWR9BE-NEXT: xsadddp f4, v2, f4 +; PWR9BE-NEXT: xsadddp f4, f4, v3 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v4 +; PWR9BE-NEXT: xsadddp f4, f4, v4 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v5 +; PWR9BE-NEXT: xsadddp f4, f4, v5 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v6 +; PWR9BE-NEXT: xsadddp f4, f4, v6 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v7 +; PWR9BE-NEXT: xsadddp f4, f4, v7 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v8 +; PWR9BE-NEXT: xsadddp f4, f4, v8 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v9 +; PWR9BE-NEXT: xsadddp f4, f4, v9 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v10 +; PWR9BE-NEXT: xsadddp f4, f4, v10 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v11 +; PWR9BE-NEXT: xsadddp f4, f4, v11 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v12 +; PWR9BE-NEXT: xsadddp f4, f4, v12 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xxswapd vs5, v13 +; PWR9BE-NEXT: xsadddp f4, f4, v13 +; PWR9BE-NEXT: xsadddp f4, f4, f5 +; PWR9BE-NEXT: xsadddp f4, f4, f3 +; PWR9BE-NEXT: xxswapd vs3, vs3 +; PWR9BE-NEXT: xsadddp f3, f4, f3 +; PWR9BE-NEXT: xsadddp f3, f3, f2 +; PWR9BE-NEXT: xxswapd vs2, vs2 +; PWR9BE-NEXT: xsadddp f2, f3, f2 +; PWR9BE-NEXT: xsadddp f2, f2, f1 +; PWR9BE-NEXT: xxswapd vs1, vs1 +; PWR9BE-NEXT: xsadddp f1, f2, f1 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: xxswapd vs0, vs0 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs4, v2 +; PWR10LE-NEXT: xxswapd vs5, v3 +; PWR10LE-NEXT: lxv vs3, 224(r1) +; PWR10LE-NEXT: lxv vs2, 240(r1) +; PWR10LE-NEXT: xsadddp f4, f4, v2 +; PWR10LE-NEXT: lxv vs1, 256(r1) +; PWR10LE-NEXT: lxv vs0, 272(r1) +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v4 +; PWR10LE-NEXT: xsadddp f4, f4, v3 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v5 +; PWR10LE-NEXT: xsadddp f4, f4, v4 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v6 +; PWR10LE-NEXT: xsadddp f4, f4, v5 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v7 +; PWR10LE-NEXT: xsadddp f4, f4, v6 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v8 +; PWR10LE-NEXT: xsadddp f4, f4, v7 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v9 +; PWR10LE-NEXT: xsadddp f4, f4, v8 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v10 +; PWR10LE-NEXT: xsadddp f4, f4, v9 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v11 +; PWR10LE-NEXT: xsadddp f4, f4, v10 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v12 +; PWR10LE-NEXT: xsadddp f4, f4, v11 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, v13 +; PWR10LE-NEXT: xsadddp f4, f4, v12 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xxswapd vs5, vs3 +; PWR10LE-NEXT: xsadddp f4, f4, v13 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xsadddp f3, f4, f3 +; PWR10LE-NEXT: xxswapd vs4, vs2 +; PWR10LE-NEXT: xsadddp f3, f3, f4 +; PWR10LE-NEXT: xsadddp f2, f3, f2 +; PWR10LE-NEXT: xxswapd vs3, vs1 +; PWR10LE-NEXT: xsadddp f2, f2, f3 +; PWR10LE-NEXT: xsadddp f1, f2, f1 +; PWR10LE-NEXT: xxswapd vs2, vs0 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xsadddp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs4, v2 +; PWR10BE-NEXT: xxswapd vs5, v3 +; PWR10BE-NEXT: lxv vs3, 240(r1) +; PWR10BE-NEXT: lxv vs2, 256(r1) +; PWR10BE-NEXT: xsadddp f4, v2, f4 +; PWR10BE-NEXT: lxv vs1, 272(r1) +; PWR10BE-NEXT: lxv vs0, 288(r1) +; PWR10BE-NEXT: xsadddp f4, f4, v3 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v4 +; PWR10BE-NEXT: xsadddp f4, f4, v4 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v5 +; PWR10BE-NEXT: xsadddp f4, f4, v5 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v6 +; PWR10BE-NEXT: xsadddp f4, f4, v6 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v7 +; PWR10BE-NEXT: xsadddp f4, f4, v7 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v8 +; PWR10BE-NEXT: xsadddp f4, f4, v8 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v9 +; PWR10BE-NEXT: xsadddp f4, f4, v9 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v10 +; PWR10BE-NEXT: xsadddp f4, f4, v10 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v11 +; PWR10BE-NEXT: xsadddp f4, f4, v11 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v12 +; PWR10BE-NEXT: xsadddp f4, f4, v12 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xxswapd vs5, v13 +; PWR10BE-NEXT: xsadddp f4, f4, v13 +; PWR10BE-NEXT: xsadddp f4, f4, f5 +; PWR10BE-NEXT: xsadddp f4, f4, f3 +; PWR10BE-NEXT: xxswapd vs3, vs3 +; PWR10BE-NEXT: xsadddp f3, f4, f3 +; PWR10BE-NEXT: xsadddp f3, f3, f2 +; PWR10BE-NEXT: xxswapd vs2, vs2 +; PWR10BE-NEXT: xsadddp f2, f3, f2 +; PWR10BE-NEXT: xsadddp f2, f2, f1 +; PWR10BE-NEXT: xxswapd vs1, vs1 +; PWR10BE-NEXT: xsadddp f1, f2, f1 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: xxswapd vs0, vs0 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a) + ret double %0 +} + +define dso_local double @v32f64_b(<32 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs5, v2 +; PWR9LE-NEXT: lxv vs4, 224(r1) +; PWR9LE-NEXT: lxv vs3, 240(r1) +; PWR9LE-NEXT: lxv vs2, 256(r1) +; PWR9LE-NEXT: lxv vs0, 272(r1) +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v3 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v4 +; PWR9LE-NEXT: xsadddp f1, f1, v3 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v5 +; PWR9LE-NEXT: xsadddp f1, f1, v4 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v6 +; PWR9LE-NEXT: xsadddp f1, f1, v5 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v7 +; PWR9LE-NEXT: xsadddp f1, f1, v6 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v8 +; PWR9LE-NEXT: xsadddp f1, f1, v7 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v9 +; PWR9LE-NEXT: xsadddp f1, f1, v8 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v10 +; PWR9LE-NEXT: xsadddp f1, f1, v9 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v11 +; PWR9LE-NEXT: xsadddp f1, f1, v10 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v12 +; PWR9LE-NEXT: xsadddp f1, f1, v11 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, v13 +; PWR9LE-NEXT: xsadddp f1, f1, v12 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, vs4 +; PWR9LE-NEXT: xsadddp f1, f1, v13 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xsadddp f1, f1, f4 +; PWR9LE-NEXT: xxswapd vs4, vs3 +; PWR9LE-NEXT: xsadddp f1, f1, f4 +; PWR9LE-NEXT: xsadddp f1, f1, f3 +; PWR9LE-NEXT: xxswapd vs3, vs2 +; PWR9LE-NEXT: xsadddp f1, f1, f3 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xxswapd vs2, vs0 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xsadddp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd vs5, v2 +; PWR9BE-NEXT: lxv vs4, 240(r1) +; PWR9BE-NEXT: lxv vs3, 256(r1) +; PWR9BE-NEXT: lxv vs2, 272(r1) +; PWR9BE-NEXT: lxv vs0, 288(r1) +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v3 +; PWR9BE-NEXT: xsadddp f1, f1, v3 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v4 +; PWR9BE-NEXT: xsadddp f1, f1, v4 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v5 +; PWR9BE-NEXT: xsadddp f1, f1, v5 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v6 +; PWR9BE-NEXT: xsadddp f1, f1, v6 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v7 +; PWR9BE-NEXT: xsadddp f1, f1, v7 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v8 +; PWR9BE-NEXT: xsadddp f1, f1, v8 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v9 +; PWR9BE-NEXT: xsadddp f1, f1, v9 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v10 +; PWR9BE-NEXT: xsadddp f1, f1, v10 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v11 +; PWR9BE-NEXT: xsadddp f1, f1, v11 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v12 +; PWR9BE-NEXT: xsadddp f1, f1, v12 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, v13 +; PWR9BE-NEXT: xsadddp f1, f1, v13 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xsadddp f1, f1, f4 +; PWR9BE-NEXT: xxswapd vs4, vs4 +; PWR9BE-NEXT: xsadddp f1, f1, f4 +; PWR9BE-NEXT: xsadddp f1, f1, f3 +; PWR9BE-NEXT: xxswapd vs3, vs3 +; PWR9BE-NEXT: xsadddp f1, f1, f3 +; PWR9BE-NEXT: xsadddp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs2 +; PWR9BE-NEXT: xsadddp f1, f1, f2 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: xxswapd vs0, vs0 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs5, v2 +; PWR10LE-NEXT: lxv vs4, 224(r1) +; PWR10LE-NEXT: lxv vs3, 240(r1) +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v3 +; PWR10LE-NEXT: lxv vs2, 256(r1) +; PWR10LE-NEXT: lxv vs0, 272(r1) +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v4 +; PWR10LE-NEXT: xsadddp f1, f1, v3 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v5 +; PWR10LE-NEXT: xsadddp f1, f1, v4 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v6 +; PWR10LE-NEXT: xsadddp f1, f1, v5 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v7 +; PWR10LE-NEXT: xsadddp f1, f1, v6 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v8 +; PWR10LE-NEXT: xsadddp f1, f1, v7 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v9 +; PWR10LE-NEXT: xsadddp f1, f1, v8 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v10 +; PWR10LE-NEXT: xsadddp f1, f1, v9 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v11 +; PWR10LE-NEXT: xsadddp f1, f1, v10 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v12 +; PWR10LE-NEXT: xsadddp f1, f1, v11 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, v13 +; PWR10LE-NEXT: xsadddp f1, f1, v12 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, vs4 +; PWR10LE-NEXT: xsadddp f1, f1, v13 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xsadddp f1, f1, f4 +; PWR10LE-NEXT: xxswapd vs4, vs3 +; PWR10LE-NEXT: xsadddp f1, f1, f4 +; PWR10LE-NEXT: xsadddp f1, f1, f3 +; PWR10LE-NEXT: xxswapd vs3, vs2 +; PWR10LE-NEXT: xsadddp f1, f1, f3 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xxswapd vs2, vs0 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xsadddp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd vs5, v2 +; PWR10BE-NEXT: lxv vs4, 240(r1) +; PWR10BE-NEXT: lxv vs3, 256(r1) +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v3 +; PWR10BE-NEXT: lxv vs2, 272(r1) +; PWR10BE-NEXT: lxv vs0, 288(r1) +; PWR10BE-NEXT: xsadddp f1, f1, v3 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v4 +; PWR10BE-NEXT: xsadddp f1, f1, v4 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v5 +; PWR10BE-NEXT: xsadddp f1, f1, v5 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v6 +; PWR10BE-NEXT: xsadddp f1, f1, v6 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v7 +; PWR10BE-NEXT: xsadddp f1, f1, v7 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v8 +; PWR10BE-NEXT: xsadddp f1, f1, v8 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v9 +; PWR10BE-NEXT: xsadddp f1, f1, v9 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v10 +; PWR10BE-NEXT: xsadddp f1, f1, v10 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v11 +; PWR10BE-NEXT: xsadddp f1, f1, v11 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v12 +; PWR10BE-NEXT: xsadddp f1, f1, v12 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, v13 +; PWR10BE-NEXT: xsadddp f1, f1, v13 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xsadddp f1, f1, f4 +; PWR10BE-NEXT: xxswapd vs4, vs4 +; PWR10BE-NEXT: xsadddp f1, f1, f4 +; PWR10BE-NEXT: xsadddp f1, f1, f3 +; PWR10BE-NEXT: xxswapd vs3, vs3 +; PWR10BE-NEXT: xsadddp f1, f1, f3 +; PWR10BE-NEXT: xsadddp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs2 +; PWR10BE-NEXT: xsadddp f1, f1, f2 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: xxswapd vs0, vs0 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v32f64(double %b, <32 x double> %a) + ret double %0 +} + +define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs0, 256(r1) +; PWR9LE-NEXT: lxv vs1, 224(r1) +; PWR9LE-NEXT: lxv vs2, 272(r1) +; PWR9LE-NEXT: lxv vs3, 240(r1) +; PWR9LE-NEXT: xvadddp vs4, v3, v11 +; PWR9LE-NEXT: xvadddp vs5, v5, v13 +; PWR9LE-NEXT: xvadddp vs6, v2, v10 +; PWR9LE-NEXT: xvadddp vs7, v4, v12 +; PWR9LE-NEXT: xvadddp vs3, v7, vs3 +; PWR9LE-NEXT: xvadddp vs2, v9, vs2 +; PWR9LE-NEXT: xvadddp vs1, v6, vs1 +; PWR9LE-NEXT: xvadddp vs0, v8, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs7, vs0 +; PWR9LE-NEXT: xvadddp vs1, vs6, vs1 +; PWR9LE-NEXT: xvadddp vs2, vs5, vs2 +; PWR9LE-NEXT: xvadddp vs3, vs4, vs3 +; PWR9LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs0, 272(r1) +; PWR9BE-NEXT: lxv vs1, 240(r1) +; PWR9BE-NEXT: lxv vs2, 288(r1) +; PWR9BE-NEXT: lxv vs3, 256(r1) +; PWR9BE-NEXT: xvadddp vs4, v3, v11 +; PWR9BE-NEXT: xvadddp vs5, v5, v13 +; PWR9BE-NEXT: xvadddp vs6, v2, v10 +; PWR9BE-NEXT: xvadddp vs7, v4, v12 +; PWR9BE-NEXT: xvadddp vs3, v7, vs3 +; PWR9BE-NEXT: xvadddp vs2, v9, vs2 +; PWR9BE-NEXT: xvadddp vs1, v6, vs1 +; PWR9BE-NEXT: xvadddp vs0, v8, vs0 +; PWR9BE-NEXT: xvadddp vs0, vs7, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs6, vs1 +; PWR9BE-NEXT: xvadddp vs2, vs5, vs2 +; PWR9BE-NEXT: xvadddp vs3, vs4, vs3 +; PWR9BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs0, 256(r1) +; PWR10LE-NEXT: lxv vs1, 224(r1) +; PWR10LE-NEXT: xvadddp vs4, v3, v11 +; PWR10LE-NEXT: xvadddp vs5, v5, v13 +; PWR10LE-NEXT: xvadddp vs6, v2, v10 +; PWR10LE-NEXT: xvadddp vs7, v4, v12 +; PWR10LE-NEXT: xvadddp vs1, v6, vs1 +; PWR10LE-NEXT: lxv vs2, 272(r1) +; PWR10LE-NEXT: lxv vs3, 240(r1) +; PWR10LE-NEXT: xvadddp vs3, v7, vs3 +; PWR10LE-NEXT: xvadddp vs2, v9, vs2 +; PWR10LE-NEXT: xvadddp vs0, v8, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs7, vs0 +; PWR10LE-NEXT: xvadddp vs1, vs6, vs1 +; PWR10LE-NEXT: xvadddp vs2, vs5, vs2 +; PWR10LE-NEXT: xvadddp vs3, vs4, vs3 +; PWR10LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs0, 272(r1) +; PWR10BE-NEXT: lxv vs1, 240(r1) +; PWR10BE-NEXT: xvadddp vs4, v3, v11 +; PWR10BE-NEXT: xvadddp vs5, v5, v13 +; PWR10BE-NEXT: xvadddp vs6, v2, v10 +; PWR10BE-NEXT: xvadddp vs7, v4, v12 +; PWR10BE-NEXT: xvadddp vs1, v6, vs1 +; PWR10BE-NEXT: lxv vs2, 288(r1) +; PWR10BE-NEXT: lxv vs3, 256(r1) +; PWR10BE-NEXT: xvadddp vs3, v7, vs3 +; PWR10BE-NEXT: xvadddp vs2, v9, vs2 +; PWR10BE-NEXT: xvadddp vs0, v8, vs0 +; PWR10BE-NEXT: xvadddp vs0, vs7, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs6, vs1 +; PWR10BE-NEXT: xvadddp vs2, vs5, vs2 +; PWR10BE-NEXT: xvadddp vs3, vs4, vs3 +; PWR10BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a) + ret double %0 +} + +define dso_local double @v64f64(<64 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v64f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v18, v2 +; PWR9LE-NEXT: lxv v17, 224(r1) +; PWR9LE-NEXT: lxv v16, 240(r1) +; PWR9LE-NEXT: lxv v15, 256(r1) +; PWR9LE-NEXT: lxv v14, 272(r1) +; PWR9LE-NEXT: xsadddp v2, v18, v2 +; PWR9LE-NEXT: xxswapd v18, v3 +; PWR9LE-NEXT: lxv v1, 288(r1) +; PWR9LE-NEXT: lxv v0, 304(r1) +; PWR9LE-NEXT: lxv vs13, 320(r1) +; PWR9LE-NEXT: lxv vs12, 336(r1) +; PWR9LE-NEXT: lxv vs11, 352(r1) +; PWR9LE-NEXT: lxv vs10, 368(r1) +; PWR9LE-NEXT: lxv vs9, 384(r1) +; PWR9LE-NEXT: lxv vs8, 400(r1) +; PWR9LE-NEXT: lxv vs7, 416(r1) +; PWR9LE-NEXT: lxv vs6, 432(r1) +; PWR9LE-NEXT: lxv vs5, 448(r1) +; PWR9LE-NEXT: lxv vs4, 464(r1) +; PWR9LE-NEXT: xsadddp v2, v2, v18 +; PWR9LE-NEXT: lxv vs3, 480(r1) +; PWR9LE-NEXT: lxv vs2, 496(r1) +; PWR9LE-NEXT: lxv vs1, 512(r1) +; PWR9LE-NEXT: lxv vs0, 528(r1) +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v4 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v5 +; PWR9LE-NEXT: xsadddp v2, v2, v4 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v6 +; PWR9LE-NEXT: xsadddp v2, v2, v5 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v7 +; PWR9LE-NEXT: xsadddp v2, v2, v6 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v8 +; PWR9LE-NEXT: xsadddp v2, v2, v7 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v9 +; PWR9LE-NEXT: xsadddp v2, v2, v8 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v10 +; PWR9LE-NEXT: xsadddp v2, v2, v9 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v11 +; PWR9LE-NEXT: xsadddp v2, v2, v10 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v12 +; PWR9LE-NEXT: xsadddp v2, v2, v11 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v13 +; PWR9LE-NEXT: xsadddp v2, v2, v12 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v17 +; PWR9LE-NEXT: xsadddp v2, v2, v13 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v16 +; PWR9LE-NEXT: xsadddp v2, v2, v17 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v15 +; PWR9LE-NEXT: xsadddp v2, v2, v16 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v14 +; PWR9LE-NEXT: xsadddp v2, v2, v15 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v1 +; PWR9LE-NEXT: xsadddp v2, v2, v14 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v0 +; PWR9LE-NEXT: xsadddp v2, v2, v1 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, vs13 +; PWR9LE-NEXT: xsadddp v2, v2, v0 +; PWR9LE-NEXT: xsadddp v2, v2, v3 +; PWR9LE-NEXT: xsadddp f13, v2, f13 +; PWR9LE-NEXT: xxswapd v2, vs12 +; PWR9LE-NEXT: xsadddp f13, f13, v2 +; PWR9LE-NEXT: xsadddp f12, f13, f12 +; PWR9LE-NEXT: xxswapd vs13, vs11 +; PWR9LE-NEXT: xsadddp f12, f12, f13 +; PWR9LE-NEXT: xsadddp f11, f12, f11 +; PWR9LE-NEXT: xxswapd vs12, vs10 +; PWR9LE-NEXT: xsadddp f11, f11, f12 +; PWR9LE-NEXT: xsadddp f10, f11, f10 +; PWR9LE-NEXT: xxswapd vs11, vs9 +; PWR9LE-NEXT: xsadddp f10, f10, f11 +; PWR9LE-NEXT: xsadddp f9, f10, f9 +; PWR9LE-NEXT: xxswapd vs10, vs8 +; PWR9LE-NEXT: xsadddp f9, f9, f10 +; PWR9LE-NEXT: xsadddp f8, f9, f8 +; PWR9LE-NEXT: xxswapd vs9, vs7 +; PWR9LE-NEXT: xsadddp f8, f8, f9 +; PWR9LE-NEXT: xsadddp f7, f8, f7 +; PWR9LE-NEXT: xxswapd vs8, vs6 +; PWR9LE-NEXT: xsadddp f7, f7, f8 +; PWR9LE-NEXT: xsadddp f6, f7, f6 +; PWR9LE-NEXT: xxswapd vs7, vs5 +; PWR9LE-NEXT: xsadddp f6, f6, f7 +; PWR9LE-NEXT: xsadddp f5, f6, f5 +; PWR9LE-NEXT: xxswapd vs6, vs4 +; PWR9LE-NEXT: xsadddp f5, f5, f6 +; PWR9LE-NEXT: xsadddp f4, f5, f4 +; PWR9LE-NEXT: xxswapd vs5, vs3 +; PWR9LE-NEXT: xsadddp f4, f4, f5 +; PWR9LE-NEXT: xsadddp f3, f4, f3 +; PWR9LE-NEXT: xxswapd vs4, vs2 +; PWR9LE-NEXT: xsadddp f3, f3, f4 +; PWR9LE-NEXT: xsadddp f2, f3, f2 +; PWR9LE-NEXT: xxswapd vs3, vs1 +; PWR9LE-NEXT: xsadddp f2, f2, f3 +; PWR9LE-NEXT: xsadddp f1, f2, f1 +; PWR9LE-NEXT: xxswapd vs2, vs0 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xsadddp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v64f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v18, v2 +; PWR9BE-NEXT: lxv v17, 240(r1) +; PWR9BE-NEXT: lxv v16, 256(r1) +; PWR9BE-NEXT: lxv v15, 272(r1) +; PWR9BE-NEXT: lxv v14, 288(r1) +; PWR9BE-NEXT: xsadddp v2, v2, v18 +; PWR9BE-NEXT: lxv v1, 304(r1) +; PWR9BE-NEXT: lxv v0, 320(r1) +; PWR9BE-NEXT: lxv vs13, 336(r1) +; PWR9BE-NEXT: lxv vs12, 352(r1) +; PWR9BE-NEXT: lxv vs11, 368(r1) +; PWR9BE-NEXT: lxv vs10, 384(r1) +; PWR9BE-NEXT: lxv vs9, 400(r1) +; PWR9BE-NEXT: lxv vs8, 416(r1) +; PWR9BE-NEXT: lxv vs7, 432(r1) +; PWR9BE-NEXT: lxv vs6, 448(r1) +; PWR9BE-NEXT: lxv vs5, 464(r1) +; PWR9BE-NEXT: lxv vs4, 480(r1) +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v3 +; PWR9BE-NEXT: lxv vs3, 496(r1) +; PWR9BE-NEXT: lxv vs2, 512(r1) +; PWR9BE-NEXT: lxv vs1, 528(r1) +; PWR9BE-NEXT: lxv vs0, 544(r1) +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v4 +; PWR9BE-NEXT: xsadddp v2, v2, v4 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v5 +; PWR9BE-NEXT: xsadddp v2, v2, v5 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v6 +; PWR9BE-NEXT: xsadddp v2, v2, v6 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v7 +; PWR9BE-NEXT: xsadddp v2, v2, v7 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v8 +; PWR9BE-NEXT: xsadddp v2, v2, v8 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v9 +; PWR9BE-NEXT: xsadddp v2, v2, v9 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v10 +; PWR9BE-NEXT: xsadddp v2, v2, v10 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v11 +; PWR9BE-NEXT: xsadddp v2, v2, v11 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v12 +; PWR9BE-NEXT: xsadddp v2, v2, v12 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v13 +; PWR9BE-NEXT: xsadddp v2, v2, v13 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v17 +; PWR9BE-NEXT: xsadddp v2, v2, v17 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v16 +; PWR9BE-NEXT: xsadddp v2, v2, v16 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v15 +; PWR9BE-NEXT: xsadddp v2, v2, v15 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v14 +; PWR9BE-NEXT: xsadddp v2, v2, v14 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v1 +; PWR9BE-NEXT: xsadddp v2, v2, v1 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v0 +; PWR9BE-NEXT: xsadddp v2, v2, v0 +; PWR9BE-NEXT: xsadddp v2, v2, v3 +; PWR9BE-NEXT: xsadddp v2, v2, f13 +; PWR9BE-NEXT: xxswapd vs13, vs13 +; PWR9BE-NEXT: xsadddp f13, v2, f13 +; PWR9BE-NEXT: xsadddp f13, f13, f12 +; PWR9BE-NEXT: xxswapd vs12, vs12 +; PWR9BE-NEXT: xsadddp f12, f13, f12 +; PWR9BE-NEXT: xsadddp f12, f12, f11 +; PWR9BE-NEXT: xxswapd vs11, vs11 +; PWR9BE-NEXT: xsadddp f11, f12, f11 +; PWR9BE-NEXT: xsadddp f11, f11, f10 +; PWR9BE-NEXT: xxswapd vs10, vs10 +; PWR9BE-NEXT: xsadddp f10, f11, f10 +; PWR9BE-NEXT: xsadddp f10, f10, f9 +; PWR9BE-NEXT: xxswapd vs9, vs9 +; PWR9BE-NEXT: xsadddp f9, f10, f9 +; PWR9BE-NEXT: xsadddp f9, f9, f8 +; PWR9BE-NEXT: xxswapd vs8, vs8 +; PWR9BE-NEXT: xsadddp f8, f9, f8 +; PWR9BE-NEXT: xsadddp f8, f8, f7 +; PWR9BE-NEXT: xxswapd vs7, vs7 +; PWR9BE-NEXT: xsadddp f7, f8, f7 +; PWR9BE-NEXT: xsadddp f7, f7, f6 +; PWR9BE-NEXT: xxswapd vs6, vs6 +; PWR9BE-NEXT: xsadddp f6, f7, f6 +; PWR9BE-NEXT: xsadddp f6, f6, f5 +; PWR9BE-NEXT: xxswapd vs5, vs5 +; PWR9BE-NEXT: xsadddp f5, f6, f5 +; PWR9BE-NEXT: xsadddp f5, f5, f4 +; PWR9BE-NEXT: xxswapd vs4, vs4 +; PWR9BE-NEXT: xsadddp f4, f5, f4 +; PWR9BE-NEXT: xsadddp f4, f4, f3 +; PWR9BE-NEXT: xxswapd vs3, vs3 +; PWR9BE-NEXT: xsadddp f3, f4, f3 +; PWR9BE-NEXT: xsadddp f3, f3, f2 +; PWR9BE-NEXT: xxswapd vs2, vs2 +; PWR9BE-NEXT: xsadddp f2, f3, f2 +; PWR9BE-NEXT: xsadddp f2, f2, f1 +; PWR9BE-NEXT: xxswapd vs1, vs1 +; PWR9BE-NEXT: xsadddp f1, f2, f1 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: xxswapd vs0, vs0 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v64f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v18, v2 +; PWR10LE-NEXT: lxv v17, 224(r1) +; PWR10LE-NEXT: lxv v16, 240(r1) +; PWR10LE-NEXT: xsadddp v2, v18, v2 +; PWR10LE-NEXT: xxswapd v18, v3 +; PWR10LE-NEXT: lxv v15, 256(r1) +; PWR10LE-NEXT: lxv v14, 272(r1) +; PWR10LE-NEXT: lxv v1, 288(r1) +; PWR10LE-NEXT: lxv v0, 304(r1) +; PWR10LE-NEXT: lxv vs13, 320(r1) +; PWR10LE-NEXT: lxv vs12, 336(r1) +; PWR10LE-NEXT: lxv vs11, 352(r1) +; PWR10LE-NEXT: lxv vs10, 368(r1) +; PWR10LE-NEXT: xsadddp v2, v2, v18 +; PWR10LE-NEXT: lxv vs9, 384(r1) +; PWR10LE-NEXT: lxv vs8, 400(r1) +; PWR10LE-NEXT: lxv vs7, 416(r1) +; PWR10LE-NEXT: lxv vs6, 432(r1) +; PWR10LE-NEXT: lxv vs5, 448(r1) +; PWR10LE-NEXT: lxv vs4, 464(r1) +; PWR10LE-NEXT: lxv vs3, 480(r1) +; PWR10LE-NEXT: lxv vs2, 496(r1) +; PWR10LE-NEXT: lxv vs1, 512(r1) +; PWR10LE-NEXT: lxv vs0, 528(r1) +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v4 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v5 +; PWR10LE-NEXT: xsadddp v2, v2, v4 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v6 +; PWR10LE-NEXT: xsadddp v2, v2, v5 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v7 +; PWR10LE-NEXT: xsadddp v2, v2, v6 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v8 +; PWR10LE-NEXT: xsadddp v2, v2, v7 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v9 +; PWR10LE-NEXT: xsadddp v2, v2, v8 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v10 +; PWR10LE-NEXT: xsadddp v2, v2, v9 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v11 +; PWR10LE-NEXT: xsadddp v2, v2, v10 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v12 +; PWR10LE-NEXT: xsadddp v2, v2, v11 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v13 +; PWR10LE-NEXT: xsadddp v2, v2, v12 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v17 +; PWR10LE-NEXT: xsadddp v2, v2, v13 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v16 +; PWR10LE-NEXT: xsadddp v2, v2, v17 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v15 +; PWR10LE-NEXT: xsadddp v2, v2, v16 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v14 +; PWR10LE-NEXT: xsadddp v2, v2, v15 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v1 +; PWR10LE-NEXT: xsadddp v2, v2, v14 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v0 +; PWR10LE-NEXT: xsadddp v2, v2, v1 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, vs13 +; PWR10LE-NEXT: xsadddp v2, v2, v0 +; PWR10LE-NEXT: xsadddp v2, v2, v3 +; PWR10LE-NEXT: xsadddp f13, v2, f13 +; PWR10LE-NEXT: xxswapd v2, vs12 +; PWR10LE-NEXT: xsadddp f13, f13, v2 +; PWR10LE-NEXT: xsadddp f12, f13, f12 +; PWR10LE-NEXT: xxswapd vs13, vs11 +; PWR10LE-NEXT: xsadddp f12, f12, f13 +; PWR10LE-NEXT: xsadddp f11, f12, f11 +; PWR10LE-NEXT: xxswapd vs12, vs10 +; PWR10LE-NEXT: xsadddp f11, f11, f12 +; PWR10LE-NEXT: xsadddp f10, f11, f10 +; PWR10LE-NEXT: xxswapd vs11, vs9 +; PWR10LE-NEXT: xsadddp f10, f10, f11 +; PWR10LE-NEXT: xsadddp f9, f10, f9 +; PWR10LE-NEXT: xxswapd vs10, vs8 +; PWR10LE-NEXT: xsadddp f9, f9, f10 +; PWR10LE-NEXT: xsadddp f8, f9, f8 +; PWR10LE-NEXT: xxswapd vs9, vs7 +; PWR10LE-NEXT: xsadddp f8, f8, f9 +; PWR10LE-NEXT: xsadddp f7, f8, f7 +; PWR10LE-NEXT: xxswapd vs8, vs6 +; PWR10LE-NEXT: xsadddp f7, f7, f8 +; PWR10LE-NEXT: xsadddp f6, f7, f6 +; PWR10LE-NEXT: xxswapd vs7, vs5 +; PWR10LE-NEXT: xsadddp f6, f6, f7 +; PWR10LE-NEXT: xsadddp f5, f6, f5 +; PWR10LE-NEXT: xxswapd vs6, vs4 +; PWR10LE-NEXT: xsadddp f5, f5, f6 +; PWR10LE-NEXT: xsadddp f4, f5, f4 +; PWR10LE-NEXT: xxswapd vs5, vs3 +; PWR10LE-NEXT: xsadddp f4, f4, f5 +; PWR10LE-NEXT: xsadddp f3, f4, f3 +; PWR10LE-NEXT: xxswapd vs4, vs2 +; PWR10LE-NEXT: xsadddp f3, f3, f4 +; PWR10LE-NEXT: xsadddp f2, f3, f2 +; PWR10LE-NEXT: xxswapd vs3, vs1 +; PWR10LE-NEXT: xsadddp f2, f2, f3 +; PWR10LE-NEXT: xsadddp f1, f2, f1 +; PWR10LE-NEXT: xxswapd vs2, vs0 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xsadddp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v64f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v18, v2 +; PWR10BE-NEXT: lxv v17, 240(r1) +; PWR10BE-NEXT: lxv v16, 256(r1) +; PWR10BE-NEXT: xsadddp v2, v2, v18 +; PWR10BE-NEXT: lxv v15, 272(r1) +; PWR10BE-NEXT: lxv v14, 288(r1) +; PWR10BE-NEXT: lxv v1, 304(r1) +; PWR10BE-NEXT: lxv v0, 320(r1) +; PWR10BE-NEXT: lxv vs13, 336(r1) +; PWR10BE-NEXT: lxv vs12, 352(r1) +; PWR10BE-NEXT: lxv vs11, 368(r1) +; PWR10BE-NEXT: lxv vs10, 384(r1) +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v3 +; PWR10BE-NEXT: lxv vs9, 400(r1) +; PWR10BE-NEXT: lxv vs8, 416(r1) +; PWR10BE-NEXT: lxv vs7, 432(r1) +; PWR10BE-NEXT: lxv vs6, 448(r1) +; PWR10BE-NEXT: lxv vs5, 464(r1) +; PWR10BE-NEXT: lxv vs4, 480(r1) +; PWR10BE-NEXT: lxv vs3, 496(r1) +; PWR10BE-NEXT: lxv vs2, 512(r1) +; PWR10BE-NEXT: lxv vs1, 528(r1) +; PWR10BE-NEXT: lxv vs0, 544(r1) +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v4 +; PWR10BE-NEXT: xsadddp v2, v2, v4 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v5 +; PWR10BE-NEXT: xsadddp v2, v2, v5 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v6 +; PWR10BE-NEXT: xsadddp v2, v2, v6 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v7 +; PWR10BE-NEXT: xsadddp v2, v2, v7 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v8 +; PWR10BE-NEXT: xsadddp v2, v2, v8 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v9 +; PWR10BE-NEXT: xsadddp v2, v2, v9 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v10 +; PWR10BE-NEXT: xsadddp v2, v2, v10 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v11 +; PWR10BE-NEXT: xsadddp v2, v2, v11 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v12 +; PWR10BE-NEXT: xsadddp v2, v2, v12 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v13 +; PWR10BE-NEXT: xsadddp v2, v2, v13 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v17 +; PWR10BE-NEXT: xsadddp v2, v2, v17 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v16 +; PWR10BE-NEXT: xsadddp v2, v2, v16 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v15 +; PWR10BE-NEXT: xsadddp v2, v2, v15 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v14 +; PWR10BE-NEXT: xsadddp v2, v2, v14 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v1 +; PWR10BE-NEXT: xsadddp v2, v2, v1 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v0 +; PWR10BE-NEXT: xsadddp v2, v2, v0 +; PWR10BE-NEXT: xsadddp v2, v2, v3 +; PWR10BE-NEXT: xsadddp v2, v2, f13 +; PWR10BE-NEXT: xxswapd vs13, vs13 +; PWR10BE-NEXT: xsadddp f13, v2, f13 +; PWR10BE-NEXT: xsadddp f13, f13, f12 +; PWR10BE-NEXT: xxswapd vs12, vs12 +; PWR10BE-NEXT: xsadddp f12, f13, f12 +; PWR10BE-NEXT: xsadddp f12, f12, f11 +; PWR10BE-NEXT: xxswapd vs11, vs11 +; PWR10BE-NEXT: xsadddp f11, f12, f11 +; PWR10BE-NEXT: xsadddp f11, f11, f10 +; PWR10BE-NEXT: xxswapd vs10, vs10 +; PWR10BE-NEXT: xsadddp f10, f11, f10 +; PWR10BE-NEXT: xsadddp f10, f10, f9 +; PWR10BE-NEXT: xxswapd vs9, vs9 +; PWR10BE-NEXT: xsadddp f9, f10, f9 +; PWR10BE-NEXT: xsadddp f9, f9, f8 +; PWR10BE-NEXT: xxswapd vs8, vs8 +; PWR10BE-NEXT: xsadddp f8, f9, f8 +; PWR10BE-NEXT: xsadddp f8, f8, f7 +; PWR10BE-NEXT: xxswapd vs7, vs7 +; PWR10BE-NEXT: xsadddp f7, f8, f7 +; PWR10BE-NEXT: xsadddp f7, f7, f6 +; PWR10BE-NEXT: xxswapd vs6, vs6 +; PWR10BE-NEXT: xsadddp f6, f7, f6 +; PWR10BE-NEXT: xsadddp f6, f6, f5 +; PWR10BE-NEXT: xxswapd vs5, vs5 +; PWR10BE-NEXT: xsadddp f5, f6, f5 +; PWR10BE-NEXT: xsadddp f5, f5, f4 +; PWR10BE-NEXT: xxswapd vs4, vs4 +; PWR10BE-NEXT: xsadddp f4, f5, f4 +; PWR10BE-NEXT: xsadddp f4, f4, f3 +; PWR10BE-NEXT: xxswapd vs3, vs3 +; PWR10BE-NEXT: xsadddp f3, f4, f3 +; PWR10BE-NEXT: xsadddp f3, f3, f2 +; PWR10BE-NEXT: xxswapd vs2, vs2 +; PWR10BE-NEXT: xsadddp f2, f3, f2 +; PWR10BE-NEXT: xsadddp f2, f2, f1 +; PWR10BE-NEXT: xxswapd vs1, vs1 +; PWR10BE-NEXT: xsadddp f1, f2, f1 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: xxswapd vs0, vs0 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a) + ret double %0 +} + +define dso_local double @v64f64_b(<64 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v64f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v19, v2 +; PWR9LE-NEXT: lxv v18, 224(r1) +; PWR9LE-NEXT: lxv v17, 240(r1) +; PWR9LE-NEXT: lxv v16, 256(r1) +; PWR9LE-NEXT: lxv v15, 272(r1) +; PWR9LE-NEXT: xsadddp f1, f1, v19 +; PWR9LE-NEXT: lxv v14, 288(r1) +; PWR9LE-NEXT: lxv v1, 304(r1) +; PWR9LE-NEXT: lxv v0, 320(r1) +; PWR9LE-NEXT: lxv vs13, 336(r1) +; PWR9LE-NEXT: lxv vs12, 352(r1) +; PWR9LE-NEXT: lxv vs11, 368(r1) +; PWR9LE-NEXT: lxv vs10, 384(r1) +; PWR9LE-NEXT: lxv vs9, 400(r1) +; PWR9LE-NEXT: lxv vs8, 416(r1) +; PWR9LE-NEXT: lxv vs7, 432(r1) +; PWR9LE-NEXT: lxv vs6, 448(r1) +; PWR9LE-NEXT: lxv vs5, 464(r1) +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v3 +; PWR9LE-NEXT: lxv vs4, 480(r1) +; PWR9LE-NEXT: lxv vs3, 496(r1) +; PWR9LE-NEXT: lxv vs2, 512(r1) +; PWR9LE-NEXT: lxv vs0, 528(r1) +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v4 +; PWR9LE-NEXT: xsadddp f1, f1, v3 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v5 +; PWR9LE-NEXT: xsadddp f1, f1, v4 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v6 +; PWR9LE-NEXT: xsadddp f1, f1, v5 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v7 +; PWR9LE-NEXT: xsadddp f1, f1, v6 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v8 +; PWR9LE-NEXT: xsadddp f1, f1, v7 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v9 +; PWR9LE-NEXT: xsadddp f1, f1, v8 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v10 +; PWR9LE-NEXT: xsadddp f1, f1, v9 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v11 +; PWR9LE-NEXT: xsadddp f1, f1, v10 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v12 +; PWR9LE-NEXT: xsadddp f1, f1, v11 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v13 +; PWR9LE-NEXT: xsadddp f1, f1, v12 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v18 +; PWR9LE-NEXT: xsadddp f1, f1, v13 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v17 +; PWR9LE-NEXT: xsadddp f1, f1, v18 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v16 +; PWR9LE-NEXT: xsadddp f1, f1, v17 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v15 +; PWR9LE-NEXT: xsadddp f1, f1, v16 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v14 +; PWR9LE-NEXT: xsadddp f1, f1, v15 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v1 +; PWR9LE-NEXT: xsadddp f1, f1, v14 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, v0 +; PWR9LE-NEXT: xsadddp f1, f1, v1 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xxswapd v2, vs13 +; PWR9LE-NEXT: xsadddp f1, f1, v0 +; PWR9LE-NEXT: xsadddp f1, f1, v2 +; PWR9LE-NEXT: xsadddp f1, f1, f13 +; PWR9LE-NEXT: xxswapd vs13, vs12 +; PWR9LE-NEXT: xsadddp f1, f1, f13 +; PWR9LE-NEXT: xsadddp f1, f1, f12 +; PWR9LE-NEXT: xxswapd vs12, vs11 +; PWR9LE-NEXT: xsadddp f1, f1, f12 +; PWR9LE-NEXT: xsadddp f1, f1, f11 +; PWR9LE-NEXT: xxswapd vs11, vs10 +; PWR9LE-NEXT: xsadddp f1, f1, f11 +; PWR9LE-NEXT: xsadddp f1, f1, f10 +; PWR9LE-NEXT: xxswapd vs10, vs9 +; PWR9LE-NEXT: xsadddp f1, f1, f10 +; PWR9LE-NEXT: xsadddp f1, f1, f9 +; PWR9LE-NEXT: xxswapd vs9, vs8 +; PWR9LE-NEXT: xsadddp f1, f1, f9 +; PWR9LE-NEXT: xsadddp f1, f1, f8 +; PWR9LE-NEXT: xxswapd vs8, vs7 +; PWR9LE-NEXT: xsadddp f1, f1, f8 +; PWR9LE-NEXT: xsadddp f1, f1, f7 +; PWR9LE-NEXT: xxswapd vs7, vs6 +; PWR9LE-NEXT: xsadddp f1, f1, f7 +; PWR9LE-NEXT: xsadddp f1, f1, f6 +; PWR9LE-NEXT: xxswapd vs6, vs5 +; PWR9LE-NEXT: xsadddp f1, f1, f6 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xxswapd vs5, vs4 +; PWR9LE-NEXT: xsadddp f1, f1, f5 +; PWR9LE-NEXT: xsadddp f1, f1, f4 +; PWR9LE-NEXT: xxswapd vs4, vs3 +; PWR9LE-NEXT: xsadddp f1, f1, f4 +; PWR9LE-NEXT: xsadddp f1, f1, f3 +; PWR9LE-NEXT: xxswapd vs3, vs2 +; PWR9LE-NEXT: xsadddp f1, f1, f3 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xxswapd vs2, vs0 +; PWR9LE-NEXT: xsadddp f1, f1, f2 +; PWR9LE-NEXT: xsadddp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v64f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v2 +; PWR9BE-NEXT: lxv v18, 240(r1) +; PWR9BE-NEXT: lxv v17, 256(r1) +; PWR9BE-NEXT: lxv v16, 272(r1) +; PWR9BE-NEXT: lxv v15, 288(r1) +; PWR9BE-NEXT: lxv v14, 304(r1) +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v3 +; PWR9BE-NEXT: lxv v1, 320(r1) +; PWR9BE-NEXT: lxv v0, 336(r1) +; PWR9BE-NEXT: lxv vs13, 352(r1) +; PWR9BE-NEXT: lxv vs12, 368(r1) +; PWR9BE-NEXT: lxv vs11, 384(r1) +; PWR9BE-NEXT: lxv vs10, 400(r1) +; PWR9BE-NEXT: lxv vs9, 416(r1) +; PWR9BE-NEXT: lxv vs8, 432(r1) +; PWR9BE-NEXT: lxv vs7, 448(r1) +; PWR9BE-NEXT: lxv vs6, 464(r1) +; PWR9BE-NEXT: lxv vs5, 480(r1) +; PWR9BE-NEXT: lxv vs4, 496(r1) +; PWR9BE-NEXT: lxv vs3, 512(r1) +; PWR9BE-NEXT: lxv vs2, 528(r1) +; PWR9BE-NEXT: lxv vs0, 544(r1) +; PWR9BE-NEXT: xsadddp f1, f1, v3 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v4 +; PWR9BE-NEXT: xsadddp f1, f1, v4 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v5 +; PWR9BE-NEXT: xsadddp f1, f1, v5 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v6 +; PWR9BE-NEXT: xsadddp f1, f1, v6 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v7 +; PWR9BE-NEXT: xsadddp f1, f1, v7 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v8 +; PWR9BE-NEXT: xsadddp f1, f1, v8 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v9 +; PWR9BE-NEXT: xsadddp f1, f1, v9 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v10 +; PWR9BE-NEXT: xsadddp f1, f1, v10 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v11 +; PWR9BE-NEXT: xsadddp f1, f1, v11 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v12 +; PWR9BE-NEXT: xsadddp f1, f1, v12 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v13 +; PWR9BE-NEXT: xsadddp f1, f1, v13 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v18 +; PWR9BE-NEXT: xsadddp f1, f1, v18 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v17 +; PWR9BE-NEXT: xsadddp f1, f1, v17 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v16 +; PWR9BE-NEXT: xsadddp f1, f1, v16 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v15 +; PWR9BE-NEXT: xsadddp f1, f1, v15 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v14 +; PWR9BE-NEXT: xsadddp f1, f1, v14 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v1 +; PWR9BE-NEXT: xsadddp f1, f1, v1 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xxswapd v2, v0 +; PWR9BE-NEXT: xsadddp f1, f1, v0 +; PWR9BE-NEXT: xsadddp f1, f1, v2 +; PWR9BE-NEXT: xsadddp f1, f1, f13 +; PWR9BE-NEXT: xxswapd vs13, vs13 +; PWR9BE-NEXT: xsadddp f1, f1, f13 +; PWR9BE-NEXT: xsadddp f1, f1, f12 +; PWR9BE-NEXT: xxswapd vs12, vs12 +; PWR9BE-NEXT: xsadddp f1, f1, f12 +; PWR9BE-NEXT: xsadddp f1, f1, f11 +; PWR9BE-NEXT: xxswapd vs11, vs11 +; PWR9BE-NEXT: xsadddp f1, f1, f11 +; PWR9BE-NEXT: xsadddp f1, f1, f10 +; PWR9BE-NEXT: xxswapd vs10, vs10 +; PWR9BE-NEXT: xsadddp f1, f1, f10 +; PWR9BE-NEXT: xsadddp f1, f1, f9 +; PWR9BE-NEXT: xxswapd vs9, vs9 +; PWR9BE-NEXT: xsadddp f1, f1, f9 +; PWR9BE-NEXT: xsadddp f1, f1, f8 +; PWR9BE-NEXT: xxswapd vs8, vs8 +; PWR9BE-NEXT: xsadddp f1, f1, f8 +; PWR9BE-NEXT: xsadddp f1, f1, f7 +; PWR9BE-NEXT: xxswapd vs7, vs7 +; PWR9BE-NEXT: xsadddp f1, f1, f7 +; PWR9BE-NEXT: xsadddp f1, f1, f6 +; PWR9BE-NEXT: xxswapd vs6, vs6 +; PWR9BE-NEXT: xsadddp f1, f1, f6 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xxswapd vs5, vs5 +; PWR9BE-NEXT: xsadddp f1, f1, f5 +; PWR9BE-NEXT: xsadddp f1, f1, f4 +; PWR9BE-NEXT: xxswapd vs4, vs4 +; PWR9BE-NEXT: xsadddp f1, f1, f4 +; PWR9BE-NEXT: xsadddp f1, f1, f3 +; PWR9BE-NEXT: xxswapd vs3, vs3 +; PWR9BE-NEXT: xsadddp f1, f1, f3 +; PWR9BE-NEXT: xsadddp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs2 +; PWR9BE-NEXT: xsadddp f1, f1, f2 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: xxswapd vs0, vs0 +; PWR9BE-NEXT: xsadddp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v64f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v19, v2 +; PWR10LE-NEXT: lxv v18, 224(r1) +; PWR10LE-NEXT: lxv v17, 240(r1) +; PWR10LE-NEXT: xsadddp f1, f1, v19 +; PWR10LE-NEXT: lxv v16, 256(r1) +; PWR10LE-NEXT: lxv v15, 272(r1) +; PWR10LE-NEXT: lxv v14, 288(r1) +; PWR10LE-NEXT: lxv v1, 304(r1) +; PWR10LE-NEXT: lxv v0, 320(r1) +; PWR10LE-NEXT: lxv vs13, 336(r1) +; PWR10LE-NEXT: lxv vs12, 352(r1) +; PWR10LE-NEXT: lxv vs11, 368(r1) +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v3 +; PWR10LE-NEXT: lxv vs10, 384(r1) +; PWR10LE-NEXT: lxv vs9, 400(r1) +; PWR10LE-NEXT: lxv vs8, 416(r1) +; PWR10LE-NEXT: lxv vs7, 432(r1) +; PWR10LE-NEXT: lxv vs6, 448(r1) +; PWR10LE-NEXT: lxv vs5, 464(r1) +; PWR10LE-NEXT: lxv vs4, 480(r1) +; PWR10LE-NEXT: lxv vs3, 496(r1) +; PWR10LE-NEXT: lxv vs2, 512(r1) +; PWR10LE-NEXT: lxv vs0, 528(r1) +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v4 +; PWR10LE-NEXT: xsadddp f1, f1, v3 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v5 +; PWR10LE-NEXT: xsadddp f1, f1, v4 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v6 +; PWR10LE-NEXT: xsadddp f1, f1, v5 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v7 +; PWR10LE-NEXT: xsadddp f1, f1, v6 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v8 +; PWR10LE-NEXT: xsadddp f1, f1, v7 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v9 +; PWR10LE-NEXT: xsadddp f1, f1, v8 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v10 +; PWR10LE-NEXT: xsadddp f1, f1, v9 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v11 +; PWR10LE-NEXT: xsadddp f1, f1, v10 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v12 +; PWR10LE-NEXT: xsadddp f1, f1, v11 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v13 +; PWR10LE-NEXT: xsadddp f1, f1, v12 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v18 +; PWR10LE-NEXT: xsadddp f1, f1, v13 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v17 +; PWR10LE-NEXT: xsadddp f1, f1, v18 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v16 +; PWR10LE-NEXT: xsadddp f1, f1, v17 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v15 +; PWR10LE-NEXT: xsadddp f1, f1, v16 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v14 +; PWR10LE-NEXT: xsadddp f1, f1, v15 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v1 +; PWR10LE-NEXT: xsadddp f1, f1, v14 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, v0 +; PWR10LE-NEXT: xsadddp f1, f1, v1 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xxswapd v2, vs13 +; PWR10LE-NEXT: xsadddp f1, f1, v0 +; PWR10LE-NEXT: xsadddp f1, f1, v2 +; PWR10LE-NEXT: xsadddp f1, f1, f13 +; PWR10LE-NEXT: xxswapd vs13, vs12 +; PWR10LE-NEXT: xsadddp f1, f1, f13 +; PWR10LE-NEXT: xsadddp f1, f1, f12 +; PWR10LE-NEXT: xxswapd vs12, vs11 +; PWR10LE-NEXT: xsadddp f1, f1, f12 +; PWR10LE-NEXT: xsadddp f1, f1, f11 +; PWR10LE-NEXT: xxswapd vs11, vs10 +; PWR10LE-NEXT: xsadddp f1, f1, f11 +; PWR10LE-NEXT: xsadddp f1, f1, f10 +; PWR10LE-NEXT: xxswapd vs10, vs9 +; PWR10LE-NEXT: xsadddp f1, f1, f10 +; PWR10LE-NEXT: xsadddp f1, f1, f9 +; PWR10LE-NEXT: xxswapd vs9, vs8 +; PWR10LE-NEXT: xsadddp f1, f1, f9 +; PWR10LE-NEXT: xsadddp f1, f1, f8 +; PWR10LE-NEXT: xxswapd vs8, vs7 +; PWR10LE-NEXT: xsadddp f1, f1, f8 +; PWR10LE-NEXT: xsadddp f1, f1, f7 +; PWR10LE-NEXT: xxswapd vs7, vs6 +; PWR10LE-NEXT: xsadddp f1, f1, f7 +; PWR10LE-NEXT: xsadddp f1, f1, f6 +; PWR10LE-NEXT: xxswapd vs6, vs5 +; PWR10LE-NEXT: xsadddp f1, f1, f6 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xxswapd vs5, vs4 +; PWR10LE-NEXT: xsadddp f1, f1, f5 +; PWR10LE-NEXT: xsadddp f1, f1, f4 +; PWR10LE-NEXT: xxswapd vs4, vs3 +; PWR10LE-NEXT: xsadddp f1, f1, f4 +; PWR10LE-NEXT: xsadddp f1, f1, f3 +; PWR10LE-NEXT: xxswapd vs3, vs2 +; PWR10LE-NEXT: xsadddp f1, f1, f3 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xxswapd vs2, vs0 +; PWR10LE-NEXT: xsadddp f1, f1, f2 +; PWR10LE-NEXT: xsadddp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v64f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v2 +; PWR10BE-NEXT: lxv v18, 240(r1) +; PWR10BE-NEXT: lxv v17, 256(r1) +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v3 +; PWR10BE-NEXT: lxv v16, 272(r1) +; PWR10BE-NEXT: lxv v15, 288(r1) +; PWR10BE-NEXT: lxv v14, 304(r1) +; PWR10BE-NEXT: lxv v1, 320(r1) +; PWR10BE-NEXT: lxv v0, 336(r1) +; PWR10BE-NEXT: lxv vs13, 352(r1) +; PWR10BE-NEXT: lxv vs12, 368(r1) +; PWR10BE-NEXT: lxv vs11, 384(r1) +; PWR10BE-NEXT: lxv vs10, 400(r1) +; PWR10BE-NEXT: lxv vs9, 416(r1) +; PWR10BE-NEXT: xsadddp f1, f1, v3 +; PWR10BE-NEXT: lxv vs8, 432(r1) +; PWR10BE-NEXT: lxv vs7, 448(r1) +; PWR10BE-NEXT: lxv vs6, 464(r1) +; PWR10BE-NEXT: lxv vs5, 480(r1) +; PWR10BE-NEXT: lxv vs4, 496(r1) +; PWR10BE-NEXT: lxv vs3, 512(r1) +; PWR10BE-NEXT: lxv vs2, 528(r1) +; PWR10BE-NEXT: lxv vs0, 544(r1) +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v4 +; PWR10BE-NEXT: xsadddp f1, f1, v4 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v5 +; PWR10BE-NEXT: xsadddp f1, f1, v5 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v6 +; PWR10BE-NEXT: xsadddp f1, f1, v6 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v7 +; PWR10BE-NEXT: xsadddp f1, f1, v7 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v8 +; PWR10BE-NEXT: xsadddp f1, f1, v8 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v9 +; PWR10BE-NEXT: xsadddp f1, f1, v9 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v10 +; PWR10BE-NEXT: xsadddp f1, f1, v10 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v11 +; PWR10BE-NEXT: xsadddp f1, f1, v11 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v12 +; PWR10BE-NEXT: xsadddp f1, f1, v12 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v13 +; PWR10BE-NEXT: xsadddp f1, f1, v13 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v18 +; PWR10BE-NEXT: xsadddp f1, f1, v18 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v17 +; PWR10BE-NEXT: xsadddp f1, f1, v17 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v16 +; PWR10BE-NEXT: xsadddp f1, f1, v16 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v15 +; PWR10BE-NEXT: xsadddp f1, f1, v15 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v14 +; PWR10BE-NEXT: xsadddp f1, f1, v14 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v1 +; PWR10BE-NEXT: xsadddp f1, f1, v1 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xxswapd v2, v0 +; PWR10BE-NEXT: xsadddp f1, f1, v0 +; PWR10BE-NEXT: xsadddp f1, f1, v2 +; PWR10BE-NEXT: xsadddp f1, f1, f13 +; PWR10BE-NEXT: xxswapd vs13, vs13 +; PWR10BE-NEXT: xsadddp f1, f1, f13 +; PWR10BE-NEXT: xsadddp f1, f1, f12 +; PWR10BE-NEXT: xxswapd vs12, vs12 +; PWR10BE-NEXT: xsadddp f1, f1, f12 +; PWR10BE-NEXT: xsadddp f1, f1, f11 +; PWR10BE-NEXT: xxswapd vs11, vs11 +; PWR10BE-NEXT: xsadddp f1, f1, f11 +; PWR10BE-NEXT: xsadddp f1, f1, f10 +; PWR10BE-NEXT: xxswapd vs10, vs10 +; PWR10BE-NEXT: xsadddp f1, f1, f10 +; PWR10BE-NEXT: xsadddp f1, f1, f9 +; PWR10BE-NEXT: xxswapd vs9, vs9 +; PWR10BE-NEXT: xsadddp f1, f1, f9 +; PWR10BE-NEXT: xsadddp f1, f1, f8 +; PWR10BE-NEXT: xxswapd vs8, vs8 +; PWR10BE-NEXT: xsadddp f1, f1, f8 +; PWR10BE-NEXT: xsadddp f1, f1, f7 +; PWR10BE-NEXT: xxswapd vs7, vs7 +; PWR10BE-NEXT: xsadddp f1, f1, f7 +; PWR10BE-NEXT: xsadddp f1, f1, f6 +; PWR10BE-NEXT: xxswapd vs6, vs6 +; PWR10BE-NEXT: xsadddp f1, f1, f6 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xxswapd vs5, vs5 +; PWR10BE-NEXT: xsadddp f1, f1, f5 +; PWR10BE-NEXT: xsadddp f1, f1, f4 +; PWR10BE-NEXT: xxswapd vs4, vs4 +; PWR10BE-NEXT: xsadddp f1, f1, f4 +; PWR10BE-NEXT: xsadddp f1, f1, f3 +; PWR10BE-NEXT: xxswapd vs3, vs3 +; PWR10BE-NEXT: xsadddp f1, f1, f3 +; PWR10BE-NEXT: xsadddp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs2 +; PWR10BE-NEXT: xsadddp f1, f1, f2 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: xxswapd vs0, vs0 +; PWR10BE-NEXT: xsadddp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fadd.v64f64(double %b, <64 x double> %a) + ret double %0 +} + +define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v64f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs0, 368(r1) +; PWR9LE-NEXT: lxv vs1, 496(r1) +; PWR9LE-NEXT: lxv vs2, 240(r1) +; PWR9LE-NEXT: lxv vs3, 304(r1) +; PWR9LE-NEXT: xvadddp vs3, v3, vs3 +; PWR9LE-NEXT: lxv vs4, 432(r1) +; PWR9LE-NEXT: lxv vs5, 400(r1) +; PWR9LE-NEXT: lxv vs6, 528(r1) +; PWR9LE-NEXT: lxv vs7, 272(r1) +; PWR9LE-NEXT: lxv vs8, 336(r1) +; PWR9LE-NEXT: lxv vs9, 464(r1) +; PWR9LE-NEXT: lxv vs10, 352(r1) +; PWR9LE-NEXT: lxv vs11, 480(r1) +; PWR9LE-NEXT: lxv vs12, 224(r1) +; PWR9LE-NEXT: lxv vs13, 288(r1) +; PWR9LE-NEXT: lxv v0, 416(r1) +; PWR9LE-NEXT: lxv v1, 384(r1) +; PWR9LE-NEXT: lxv v14, 512(r1) +; PWR9LE-NEXT: lxv v15, 256(r1) +; PWR9LE-NEXT: lxv v16, 320(r1) +; PWR9LE-NEXT: lxv v17, 448(r1) +; PWR9LE-NEXT: xvadddp v12, v12, v17 +; PWR9LE-NEXT: xvadddp v4, v4, v16 +; PWR9LE-NEXT: xvadddp v14, v15, v14 +; PWR9LE-NEXT: xvadddp v1, v8, v1 +; PWR9LE-NEXT: xvadddp v0, v10, v0 +; PWR9LE-NEXT: xvadddp vs13, v2, vs13 +; PWR9LE-NEXT: xvadddp vs11, vs12, vs11 +; PWR9LE-NEXT: xvadddp vs10, v6, vs10 +; PWR9LE-NEXT: xvadddp vs9, v13, vs9 +; PWR9LE-NEXT: xvadddp vs8, v5, vs8 +; PWR9LE-NEXT: xvadddp vs6, vs7, vs6 +; PWR9LE-NEXT: xvadddp vs5, v9, vs5 +; PWR9LE-NEXT: xvadddp vs4, v11, vs4 +; PWR9LE-NEXT: xvadddp vs1, vs2, vs1 +; PWR9LE-NEXT: xvadddp vs0, v7, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xvadddp vs1, vs3, vs4 +; PWR9LE-NEXT: xvadddp vs2, vs5, vs6 +; PWR9LE-NEXT: xvadddp vs3, vs8, vs9 +; PWR9LE-NEXT: xvadddp vs4, vs10, vs11 +; PWR9LE-NEXT: xvadddp vs5, vs13, v0 +; PWR9LE-NEXT: xvadddp vs6, v1, v14 +; PWR9LE-NEXT: xvadddp vs7, v4, v12 +; PWR9LE-NEXT: xvadddp vs6, vs7, vs6 +; PWR9LE-NEXT: xvadddp vs4, vs5, vs4 +; PWR9LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9LE-NEXT: xvadddp vs1, vs4, vs6 +; PWR9LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v64f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs0, 384(r1) +; PWR9BE-NEXT: lxv vs1, 512(r1) +; PWR9BE-NEXT: lxv vs2, 256(r1) +; PWR9BE-NEXT: lxv vs3, 320(r1) +; PWR9BE-NEXT: xvadddp vs3, v3, vs3 +; PWR9BE-NEXT: lxv vs4, 448(r1) +; PWR9BE-NEXT: lxv vs5, 416(r1) +; PWR9BE-NEXT: lxv vs6, 544(r1) +; PWR9BE-NEXT: lxv vs7, 288(r1) +; PWR9BE-NEXT: lxv vs8, 352(r1) +; PWR9BE-NEXT: lxv vs9, 480(r1) +; PWR9BE-NEXT: lxv vs10, 368(r1) +; PWR9BE-NEXT: lxv vs11, 496(r1) +; PWR9BE-NEXT: lxv vs12, 240(r1) +; PWR9BE-NEXT: lxv vs13, 304(r1) +; PWR9BE-NEXT: lxv v0, 432(r1) +; PWR9BE-NEXT: lxv v1, 400(r1) +; PWR9BE-NEXT: lxv v14, 528(r1) +; PWR9BE-NEXT: lxv v15, 272(r1) +; PWR9BE-NEXT: lxv v16, 336(r1) +; PWR9BE-NEXT: lxv v17, 464(r1) +; PWR9BE-NEXT: xvadddp v12, v12, v17 +; PWR9BE-NEXT: xvadddp v4, v4, v16 +; PWR9BE-NEXT: xvadddp v14, v15, v14 +; PWR9BE-NEXT: xvadddp v1, v8, v1 +; PWR9BE-NEXT: xvadddp v0, v10, v0 +; PWR9BE-NEXT: xvadddp vs13, v2, vs13 +; PWR9BE-NEXT: xvadddp vs11, vs12, vs11 +; PWR9BE-NEXT: xvadddp vs10, v6, vs10 +; PWR9BE-NEXT: xvadddp vs9, v13, vs9 +; PWR9BE-NEXT: xvadddp vs8, v5, vs8 +; PWR9BE-NEXT: xvadddp vs6, vs7, vs6 +; PWR9BE-NEXT: xvadddp vs5, v9, vs5 +; PWR9BE-NEXT: xvadddp vs4, v11, vs4 +; PWR9BE-NEXT: xvadddp vs1, vs2, vs1 +; PWR9BE-NEXT: xvadddp vs0, v7, vs0 +; PWR9BE-NEXT: xvadddp vs0, vs0, vs1 +; PWR9BE-NEXT: xvadddp vs1, vs3, vs4 +; PWR9BE-NEXT: xvadddp vs2, vs5, vs6 +; PWR9BE-NEXT: xvadddp vs3, vs8, vs9 +; PWR9BE-NEXT: xvadddp vs4, vs10, vs11 +; PWR9BE-NEXT: xvadddp vs5, vs13, v0 +; PWR9BE-NEXT: xvadddp vs6, v1, v14 +; PWR9BE-NEXT: xvadddp vs7, v4, v12 +; PWR9BE-NEXT: xvadddp vs6, vs7, vs6 +; PWR9BE-NEXT: xvadddp vs4, vs5, vs4 +; PWR9BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR9BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR9BE-NEXT: xvadddp vs1, vs4, vs6 +; PWR9BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v64f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs0, 368(r1) +; PWR10LE-NEXT: lxv vs1, 496(r1) +; PWR10LE-NEXT: xvadddp vs0, v7, vs0 +; PWR10LE-NEXT: lxv vs2, 240(r1) +; PWR10LE-NEXT: lxv vs3, 304(r1) +; PWR10LE-NEXT: lxv vs4, 432(r1) +; PWR10LE-NEXT: lxv vs5, 400(r1) +; PWR10LE-NEXT: lxv vs6, 528(r1) +; PWR10LE-NEXT: lxv vs7, 272(r1) +; PWR10LE-NEXT: lxv vs8, 336(r1) +; PWR10LE-NEXT: lxv vs9, 464(r1) +; PWR10LE-NEXT: lxv vs10, 352(r1) +; PWR10LE-NEXT: lxv vs11, 480(r1) +; PWR10LE-NEXT: lxv vs12, 224(r1) +; PWR10LE-NEXT: lxv vs13, 288(r1) +; PWR10LE-NEXT: xvadddp vs13, v2, vs13 +; PWR10LE-NEXT: xvadddp vs11, vs12, vs11 +; PWR10LE-NEXT: xvadddp vs10, v6, vs10 +; PWR10LE-NEXT: xvadddp vs9, v13, vs9 +; PWR10LE-NEXT: xvadddp vs8, v5, vs8 +; PWR10LE-NEXT: xvadddp vs6, vs7, vs6 +; PWR10LE-NEXT: xvadddp vs5, v9, vs5 +; PWR10LE-NEXT: xvadddp vs4, v11, vs4 +; PWR10LE-NEXT: xvadddp vs3, v3, vs3 +; PWR10LE-NEXT: xvadddp vs1, vs2, vs1 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: lxv v0, 416(r1) +; PWR10LE-NEXT: lxv v1, 384(r1) +; PWR10LE-NEXT: lxv v14, 512(r1) +; PWR10LE-NEXT: lxv v15, 256(r1) +; PWR10LE-NEXT: lxv v16, 320(r1) +; PWR10LE-NEXT: lxv v17, 448(r1) +; PWR10LE-NEXT: xvadddp v12, v12, v17 +; PWR10LE-NEXT: xvadddp v4, v4, v16 +; PWR10LE-NEXT: xvadddp v14, v15, v14 +; PWR10LE-NEXT: xvadddp v1, v8, v1 +; PWR10LE-NEXT: xvadddp v0, v10, v0 +; PWR10LE-NEXT: xvadddp vs1, vs3, vs4 +; PWR10LE-NEXT: xvadddp vs2, vs5, vs6 +; PWR10LE-NEXT: xvadddp vs3, vs8, vs9 +; PWR10LE-NEXT: xvadddp vs4, vs10, vs11 +; PWR10LE-NEXT: xvadddp vs5, vs13, v0 +; PWR10LE-NEXT: xvadddp vs6, v1, v14 +; PWR10LE-NEXT: xvadddp vs7, v4, v12 +; PWR10LE-NEXT: xvadddp vs6, vs7, vs6 +; PWR10LE-NEXT: xvadddp vs4, vs5, vs4 +; PWR10LE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10LE-NEXT: xvadddp vs1, vs4, vs6 +; PWR10LE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v64f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs0, 384(r1) +; PWR10BE-NEXT: lxv vs1, 512(r1) +; PWR10BE-NEXT: xvadddp vs0, v7, vs0 +; PWR10BE-NEXT: lxv vs2, 256(r1) +; PWR10BE-NEXT: lxv vs3, 320(r1) +; PWR10BE-NEXT: lxv vs4, 448(r1) +; PWR10BE-NEXT: lxv vs5, 416(r1) +; PWR10BE-NEXT: lxv vs6, 544(r1) +; PWR10BE-NEXT: lxv vs7, 288(r1) +; PWR10BE-NEXT: lxv vs8, 352(r1) +; PWR10BE-NEXT: lxv vs9, 480(r1) +; PWR10BE-NEXT: lxv vs10, 368(r1) +; PWR10BE-NEXT: lxv vs11, 496(r1) +; PWR10BE-NEXT: lxv vs12, 240(r1) +; PWR10BE-NEXT: lxv vs13, 304(r1) +; PWR10BE-NEXT: xvadddp vs13, v2, vs13 +; PWR10BE-NEXT: xvadddp vs11, vs12, vs11 +; PWR10BE-NEXT: xvadddp vs10, v6, vs10 +; PWR10BE-NEXT: xvadddp vs9, v13, vs9 +; PWR10BE-NEXT: xvadddp vs8, v5, vs8 +; PWR10BE-NEXT: xvadddp vs6, vs7, vs6 +; PWR10BE-NEXT: xvadddp vs5, v9, vs5 +; PWR10BE-NEXT: xvadddp vs4, v11, vs4 +; PWR10BE-NEXT: xvadddp vs3, v3, vs3 +; PWR10BE-NEXT: xvadddp vs1, vs2, vs1 +; PWR10BE-NEXT: xvadddp vs0, vs0, vs1 +; PWR10BE-NEXT: lxv v0, 432(r1) +; PWR10BE-NEXT: lxv v1, 400(r1) +; PWR10BE-NEXT: lxv v14, 528(r1) +; PWR10BE-NEXT: lxv v15, 272(r1) +; PWR10BE-NEXT: lxv v16, 336(r1) +; PWR10BE-NEXT: lxv v17, 464(r1) +; PWR10BE-NEXT: xvadddp v12, v12, v17 +; PWR10BE-NEXT: xvadddp v4, v4, v16 +; PWR10BE-NEXT: xvadddp v14, v15, v14 +; PWR10BE-NEXT: xvadddp v1, v8, v1 +; PWR10BE-NEXT: xvadddp v0, v10, v0 +; PWR10BE-NEXT: xvadddp vs1, vs3, vs4 +; PWR10BE-NEXT: xvadddp vs2, vs5, vs6 +; PWR10BE-NEXT: xvadddp vs3, vs8, vs9 +; PWR10BE-NEXT: xvadddp vs4, vs10, vs11 +; PWR10BE-NEXT: xvadddp vs5, vs13, v0 +; PWR10BE-NEXT: xvadddp vs6, v1, v14 +; PWR10BE-NEXT: xvadddp vs7, v4, v12 +; PWR10BE-NEXT: xvadddp vs6, vs7, vs6 +; PWR10BE-NEXT: xvadddp vs4, vs5, vs4 +; PWR10BE-NEXT: xvadddp vs2, vs3, vs2 +; PWR10BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10BE-NEXT: xvadddp vs0, vs0, vs2 +; PWR10BE-NEXT: xvadddp vs1, vs4, vs6 +; PWR10BE-NEXT: xvadddp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvadddp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a) + ret double %0 +} + +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) #0 +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) #0 +declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>) #0 +declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>) #0 +declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>) #0 +declare double @llvm.vector.reduce.fadd.v64f64(double, <64 x double>) #0 + +;; +;; Vectors of ppc_fp128 +;; +define dso_local ppc_fp128 @v2ppcf128(<2 x ppc_fp128> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2ppcf128: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -32(r1) +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: addi r1, r1, 32 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2ppcf128: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -112(r1) +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: addi r1, r1, 112 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2ppcf128: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -32(r1) +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: addi r1, r1, 32 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2ppcf128: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -112(r1) +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: addi r1, r1, 112 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <2 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +define dso_local ppc_fp128 @v2ppcf128_b(<2 x ppc_fp128> %a, ppc_fp128 %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2ppcf128_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -48(r1) +; PWR9LE-NEXT: fmr f31, f4 +; PWR9LE-NEXT: fmr f30, f3 +; PWR9LE-NEXT: fmr f4, f2 +; PWR9LE-NEXT: fmr f3, f1 +; PWR9LE-NEXT: fmr f1, f5 +; PWR9LE-NEXT: fmr f2, f6 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f30 +; PWR9LE-NEXT: fmr f4, f31 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: addi r1, r1, 48 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2ppcf128_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -128(r1) +; PWR9BE-NEXT: stfd f30, 112(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f31, 120(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f31, f4 +; PWR9BE-NEXT: fmr f30, f3 +; PWR9BE-NEXT: fmr f4, f2 +; PWR9BE-NEXT: fmr f3, f1 +; PWR9BE-NEXT: fmr f1, f5 +; PWR9BE-NEXT: fmr f2, f6 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f30 +; PWR9BE-NEXT: fmr f4, f31 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: lfd f31, 120(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f30, 112(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: addi r1, r1, 128 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2ppcf128_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -48(r1) +; PWR10LE-NEXT: fmr f31, f4 +; PWR10LE-NEXT: fmr f30, f3 +; PWR10LE-NEXT: fmr f4, f2 +; PWR10LE-NEXT: fmr f3, f1 +; PWR10LE-NEXT: fmr f1, f5 +; PWR10LE-NEXT: fmr f2, f6 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f30 +; PWR10LE-NEXT: fmr f4, f31 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: addi r1, r1, 48 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2ppcf128_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -128(r1) +; PWR10BE-NEXT: stfd f30, 112(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f31, 120(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f31, f4 +; PWR10BE-NEXT: fmr f30, f3 +; PWR10BE-NEXT: fmr f4, f2 +; PWR10BE-NEXT: fmr f3, f1 +; PWR10BE-NEXT: fmr f1, f5 +; PWR10BE-NEXT: fmr f2, f6 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f30 +; PWR10BE-NEXT: fmr f4, f31 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: lfd f31, 120(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f30, 112(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: addi r1, r1, 128 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 %b, <2 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2ppcf128_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -64(r1) +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: stfd f2, 40(r1) +; PWR9LE-NEXT: stfd f1, 32(r1) +; PWR9LE-NEXT: lxv vs1, 32(r1) +; PWR9LE-NEXT: xxswapd vs2, vs1 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR9LE-NEXT: addi r1, r1, 64 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2ppcf128_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -144(r1) +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: stfd f2, 120(r1) +; PWR9BE-NEXT: stfd f1, 112(r1) +; PWR9BE-NEXT: lxv vs1, 112(r1) +; PWR9BE-NEXT: xxswapd vs2, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR9BE-NEXT: addi r1, r1, 144 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2ppcf128_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -64(r1) +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: stfd f2, 40(r1) +; PWR10LE-NEXT: stfd f1, 32(r1) +; PWR10LE-NEXT: lxv vs1, 32(r1) +; PWR10LE-NEXT: xxswapd vs2, vs1 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR10LE-NEXT: addi r1, r1, 64 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2ppcf128_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -144(r1) +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: stfd f2, 120(r1) +; PWR10BE-NEXT: stfd f1, 112(r1) +; PWR10BE-NEXT: lxv vs1, 112(r1) +; PWR10BE-NEXT: xxswapd vs2, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR10BE-NEXT: addi r1, r1, 144 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <2 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +define dso_local ppc_fp128 @v4ppcf128(<4 x ppc_fp128> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4ppcf128: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -64(r1) +; PWR9LE-NEXT: fmr f31, f8 +; PWR9LE-NEXT: fmr f30, f7 +; PWR9LE-NEXT: fmr f29, f6 +; PWR9LE-NEXT: fmr f28, f5 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f28 +; PWR9LE-NEXT: fmr f4, f29 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f30 +; PWR9LE-NEXT: fmr f4, f31 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: addi r1, r1, 64 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4ppcf128: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -144(r1) +; PWR9BE-NEXT: stfd f28, 112(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f29, 120(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f30, 128(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f31, f8 +; PWR9BE-NEXT: fmr f30, f7 +; PWR9BE-NEXT: fmr f29, f6 +; PWR9BE-NEXT: fmr f28, f5 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f28 +; PWR9BE-NEXT: fmr f4, f29 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f30 +; PWR9BE-NEXT: fmr f4, f31 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: lfd f31, 136(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f30, 128(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f29, 120(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f28, 112(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: addi r1, r1, 144 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4ppcf128: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -64(r1) +; PWR10LE-NEXT: fmr f31, f8 +; PWR10LE-NEXT: fmr f30, f7 +; PWR10LE-NEXT: fmr f29, f6 +; PWR10LE-NEXT: fmr f28, f5 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f28 +; PWR10LE-NEXT: fmr f4, f29 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f30 +; PWR10LE-NEXT: fmr f4, f31 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: addi r1, r1, 64 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4ppcf128: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -144(r1) +; PWR10BE-NEXT: stfd f28, 112(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f29, 120(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f29, f6 +; PWR10BE-NEXT: fmr f28, f5 +; PWR10BE-NEXT: stfd f30, 128(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f31, f8 +; PWR10BE-NEXT: fmr f30, f7 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f28 +; PWR10BE-NEXT: fmr f4, f29 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f30 +; PWR10BE-NEXT: fmr f4, f31 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: lfd f31, 136(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f30, 128(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f29, 120(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f28, 112(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: addi r1, r1, 144 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <4 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +define dso_local ppc_fp128 @v4ppcf128_b(<4 x ppc_fp128> %a, ppc_fp128 %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4ppcf128_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -80(r1) +; PWR9LE-NEXT: fmr f27, f4 +; PWR9LE-NEXT: fmr f26, f3 +; PWR9LE-NEXT: fmr f4, f2 +; PWR9LE-NEXT: fmr f3, f1 +; PWR9LE-NEXT: fmr f1, f9 +; PWR9LE-NEXT: fmr f2, f10 +; PWR9LE-NEXT: fmr f31, f8 +; PWR9LE-NEXT: fmr f30, f7 +; PWR9LE-NEXT: fmr f29, f6 +; PWR9LE-NEXT: fmr f28, f5 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f26 +; PWR9LE-NEXT: fmr f4, f27 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f28 +; PWR9LE-NEXT: fmr f4, f29 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f30 +; PWR9LE-NEXT: fmr f4, f31 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: addi r1, r1, 80 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4ppcf128_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -160(r1) +; PWR9BE-NEXT: stfd f26, 112(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f27, 120(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f27, f4 +; PWR9BE-NEXT: fmr f26, f3 +; PWR9BE-NEXT: fmr f4, f2 +; PWR9BE-NEXT: fmr f3, f1 +; PWR9BE-NEXT: fmr f1, f9 +; PWR9BE-NEXT: fmr f2, f10 +; PWR9BE-NEXT: stfd f28, 128(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f29, 136(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f30, 144(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f31, 152(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f31, f8 +; PWR9BE-NEXT: fmr f30, f7 +; PWR9BE-NEXT: fmr f29, f6 +; PWR9BE-NEXT: fmr f28, f5 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f26 +; PWR9BE-NEXT: fmr f4, f27 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f28 +; PWR9BE-NEXT: fmr f4, f29 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f30 +; PWR9BE-NEXT: fmr f4, f31 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: lfd f31, 152(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f30, 144(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f29, 136(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f28, 128(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f27, 120(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f26, 112(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: addi r1, r1, 160 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4ppcf128_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -80(r1) +; PWR10LE-NEXT: fmr f27, f4 +; PWR10LE-NEXT: fmr f26, f3 +; PWR10LE-NEXT: fmr f4, f2 +; PWR10LE-NEXT: fmr f3, f1 +; PWR10LE-NEXT: fmr f1, f9 +; PWR10LE-NEXT: fmr f2, f10 +; PWR10LE-NEXT: fmr f31, f8 +; PWR10LE-NEXT: fmr f30, f7 +; PWR10LE-NEXT: fmr f29, f6 +; PWR10LE-NEXT: fmr f28, f5 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f26 +; PWR10LE-NEXT: fmr f4, f27 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f28 +; PWR10LE-NEXT: fmr f4, f29 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f30 +; PWR10LE-NEXT: fmr f4, f31 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: addi r1, r1, 80 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4ppcf128_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -160(r1) +; PWR10BE-NEXT: stfd f26, 112(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f27, 120(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f27, f4 +; PWR10BE-NEXT: fmr f26, f3 +; PWR10BE-NEXT: fmr f4, f2 +; PWR10BE-NEXT: fmr f3, f1 +; PWR10BE-NEXT: fmr f1, f9 +; PWR10BE-NEXT: stfd f28, 128(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f29, 136(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f2, f10 +; PWR10BE-NEXT: fmr f29, f6 +; PWR10BE-NEXT: fmr f28, f5 +; PWR10BE-NEXT: stfd f30, 144(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f31, 152(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f31, f8 +; PWR10BE-NEXT: fmr f30, f7 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f26 +; PWR10BE-NEXT: fmr f4, f27 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f28 +; PWR10BE-NEXT: fmr f4, f29 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f30 +; PWR10BE-NEXT: fmr f4, f31 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: lfd f31, 152(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f30, 144(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f29, 136(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f28, 128(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f27, 120(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f26, 112(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: addi r1, r1, 160 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 %b, <4 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4ppcf128_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: mflr r0 +; PWR9LE-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR9LE-NEXT: std r0, 16(r1) +; PWR9LE-NEXT: stdu r1, -96(r1) +; PWR9LE-NEXT: fmr f29, f4 +; PWR9LE-NEXT: fmr f28, f3 +; PWR9LE-NEXT: fmr f3, f5 +; PWR9LE-NEXT: fmr f4, f6 +; PWR9LE-NEXT: fmr f31, f8 +; PWR9LE-NEXT: fmr f30, f7 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f27, f1 +; PWR9LE-NEXT: fmr f26, f2 +; PWR9LE-NEXT: fmr f1, f28 +; PWR9LE-NEXT: fmr f2, f29 +; PWR9LE-NEXT: fmr f3, f30 +; PWR9LE-NEXT: fmr f4, f31 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: fmr f3, f1 +; PWR9LE-NEXT: fmr f4, f2 +; PWR9LE-NEXT: fmr f1, f27 +; PWR9LE-NEXT: fmr f2, f26 +; PWR9LE-NEXT: bl __gcc_qadd +; PWR9LE-NEXT: nop +; PWR9LE-NEXT: stfd f2, 40(r1) +; PWR9LE-NEXT: stfd f1, 32(r1) +; PWR9LE-NEXT: lxv vs1, 32(r1) +; PWR9LE-NEXT: xxswapd vs2, vs1 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR9LE-NEXT: addi r1, r1, 96 +; PWR9LE-NEXT: ld r0, 16(r1) +; PWR9LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: mtlr r0 +; PWR9LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4ppcf128_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: mflr r0 +; PWR9BE-NEXT: std r0, 16(r1) +; PWR9BE-NEXT: stdu r1, -176(r1) +; PWR9BE-NEXT: stfd f28, 144(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f29, 152(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f29, f4 +; PWR9BE-NEXT: fmr f28, f3 +; PWR9BE-NEXT: fmr f3, f5 +; PWR9BE-NEXT: fmr f4, f6 +; PWR9BE-NEXT: stfd f26, 128(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f27, 136(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f30, 160(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: stfd f31, 168(r1) # 8-byte Folded Spill +; PWR9BE-NEXT: fmr f31, f8 +; PWR9BE-NEXT: fmr f30, f7 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f27, f1 +; PWR9BE-NEXT: fmr f26, f2 +; PWR9BE-NEXT: fmr f1, f28 +; PWR9BE-NEXT: fmr f2, f29 +; PWR9BE-NEXT: fmr f3, f30 +; PWR9BE-NEXT: fmr f4, f31 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: fmr f3, f1 +; PWR9BE-NEXT: fmr f4, f2 +; PWR9BE-NEXT: fmr f1, f27 +; PWR9BE-NEXT: fmr f2, f26 +; PWR9BE-NEXT: bl __gcc_qadd +; PWR9BE-NEXT: nop +; PWR9BE-NEXT: stfd f2, 120(r1) +; PWR9BE-NEXT: stfd f1, 112(r1) +; PWR9BE-NEXT: lxv vs1, 112(r1) +; PWR9BE-NEXT: lfd f31, 168(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f30, 160(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: xxswapd vs2, vs1 +; PWR9BE-NEXT: lfd f29, 152(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f28, 144(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f27, 136(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: lfd f26, 128(r1) # 8-byte Folded Reload +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR9BE-NEXT: addi r1, r1, 176 +; PWR9BE-NEXT: ld r0, 16(r1) +; PWR9BE-NEXT: mtlr r0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4ppcf128_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: mflr r0 +; PWR10LE-NEXT: stfd f26, -48(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f27, -40(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f28, -32(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f29, -24(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f30, -16(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: stfd f31, -8(r1) # 8-byte Folded Spill +; PWR10LE-NEXT: std r0, 16(r1) +; PWR10LE-NEXT: stdu r1, -96(r1) +; PWR10LE-NEXT: fmr f29, f4 +; PWR10LE-NEXT: fmr f28, f3 +; PWR10LE-NEXT: fmr f3, f5 +; PWR10LE-NEXT: fmr f4, f6 +; PWR10LE-NEXT: fmr f31, f8 +; PWR10LE-NEXT: fmr f30, f7 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f27, f1 +; PWR10LE-NEXT: fmr f26, f2 +; PWR10LE-NEXT: fmr f1, f28 +; PWR10LE-NEXT: fmr f2, f29 +; PWR10LE-NEXT: fmr f3, f30 +; PWR10LE-NEXT: fmr f4, f31 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: fmr f3, f1 +; PWR10LE-NEXT: fmr f4, f2 +; PWR10LE-NEXT: fmr f1, f27 +; PWR10LE-NEXT: fmr f2, f26 +; PWR10LE-NEXT: bl __gcc_qadd@notoc +; PWR10LE-NEXT: stfd f2, 40(r1) +; PWR10LE-NEXT: stfd f1, 32(r1) +; PWR10LE-NEXT: lxv vs1, 32(r1) +; PWR10LE-NEXT: xxswapd vs2, vs1 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR10LE-NEXT: addi r1, r1, 96 +; PWR10LE-NEXT: ld r0, 16(r1) +; PWR10LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: mtlr r0 +; PWR10LE-NEXT: lfd f30, -16(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f29, -24(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f28, -32(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f27, -40(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: lfd f26, -48(r1) # 8-byte Folded Reload +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4ppcf128_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: mflr r0 +; PWR10BE-NEXT: std r0, 16(r1) +; PWR10BE-NEXT: stdu r1, -176(r1) +; PWR10BE-NEXT: stfd f28, 144(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f29, 152(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f29, f4 +; PWR10BE-NEXT: fmr f28, f3 +; PWR10BE-NEXT: fmr f3, f5 +; PWR10BE-NEXT: fmr f4, f6 +; PWR10BE-NEXT: stfd f26, 128(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f27, 136(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f30, 160(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: stfd f31, 168(r1) # 8-byte Folded Spill +; PWR10BE-NEXT: fmr f31, f8 +; PWR10BE-NEXT: fmr f30, f7 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f27, f1 +; PWR10BE-NEXT: fmr f26, f2 +; PWR10BE-NEXT: fmr f1, f28 +; PWR10BE-NEXT: fmr f2, f29 +; PWR10BE-NEXT: fmr f3, f30 +; PWR10BE-NEXT: fmr f4, f31 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: fmr f3, f1 +; PWR10BE-NEXT: fmr f4, f2 +; PWR10BE-NEXT: fmr f1, f27 +; PWR10BE-NEXT: fmr f2, f26 +; PWR10BE-NEXT: bl __gcc_qadd +; PWR10BE-NEXT: nop +; PWR10BE-NEXT: stfd f2, 120(r1) +; PWR10BE-NEXT: stfd f1, 112(r1) +; PWR10BE-NEXT: lfd f31, 168(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f30, 160(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f29, 152(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f28, 144(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f27, 136(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lfd f26, 128(r1) # 8-byte Folded Reload +; PWR10BE-NEXT: lxv vs1, 112(r1) +; PWR10BE-NEXT: xxswapd vs2, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2 +; PWR10BE-NEXT: addi r1, r1, 176 +; PWR10BE-NEXT: ld r0, 16(r1) +; PWR10BE-NEXT: mtlr r0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128 0xM80000000000000000000000000000000, <4 x ppc_fp128> %a) + ret ppc_fp128 %0 +} + +declare ppc_fp128 @llvm.vector.reduce.fadd.v2ppcf128(ppc_fp128, <2 x ppc_fp128>) #0 +declare ppc_fp128 @llvm.vector.reduce.fadd.v4ppcf128(ppc_fp128, <4 x ppc_fp128>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll new file mode 100644 index 000000000000..b1f72f694aea --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll @@ -0,0 +1,1169 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of f32 +;; +define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmaxdp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmaxdp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: xvmaxsp vs0, v2, vs0 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: xvmaxsp vs0, v2, vs0 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: xvmaxsp vs0, v2, vs0 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: xvmaxsp vs0, v2, vs0 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) + ret float %0 +} + +define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs2, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs3, v2 +; PWR9LE-NEXT: xscvspdpn f0, v2 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f3, vs3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmaxdp f2, f2, f3 +; PWR9LE-NEXT: xsmaxdp f1, f2, f1 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs2, v2, v2, 1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f3, v2 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmaxdp f2, f3, f2 +; PWR9BE-NEXT: xsmaxdp f1, f2, f1 +; PWR9BE-NEXT: xsmaxdp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs2, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs3, v2 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f0, v2 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xscvspdpn f3, vs3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmaxdp f2, f2, f3 +; PWR10LE-NEXT: xsmaxdp f1, f2, f1 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs2, v2, v2, 1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f3, v2 +; PWR10BE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmaxdp f2, f3, f2 +; PWR10BE-NEXT: xsmaxdp f1, f2, f1 +; PWR10BE-NEXT: xsmaxdp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) + ret float %0 +} + +define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a) + ret float %0 +} + +define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v3, v5 +; PWR9LE-NEXT: xvmaxsp vs1, v2, v4 +; PWR9LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v3, v5 +; PWR9BE-NEXT: xvmaxsp vs1, v2, v4 +; PWR9BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v3, v5 +; PWR10LE-NEXT: xvmaxsp vs1, v2, v4 +; PWR10LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v3, v5 +; PWR10BE-NEXT: xvmaxsp vs1, v2, v4 +; PWR10BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v3, v5 +; PWR9LE-NEXT: xvmaxsp vs1, v2, v4 +; PWR9LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v3, v5 +; PWR9BE-NEXT: xvmaxsp vs1, v2, v4 +; PWR9BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v3, v5 +; PWR10LE-NEXT: xvmaxsp vs1, v2, v4 +; PWR10LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v3, v5 +; PWR10BE-NEXT: xvmaxsp vs1, v2, v4 +; PWR10BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmax.v16f32(<16 x float> %a) + ret float %0 +} + +define dso_local float @v32f32(<32 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v5, v9 +; PWR9LE-NEXT: xvmaxsp vs1, v3, v7 +; PWR9LE-NEXT: xvmaxsp vs2, v2, v6 +; PWR9LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmaxsp vs1, v4, v8 +; PWR9LE-NEXT: xvmaxsp vs1, vs2, vs1 +; PWR9LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f2 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v5, v9 +; PWR9BE-NEXT: xvmaxsp vs1, v3, v7 +; PWR9BE-NEXT: xvmaxsp vs2, v2, v6 +; PWR9BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmaxsp vs1, v4, v8 +; PWR9BE-NEXT: xvmaxsp vs1, vs2, vs1 +; PWR9BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f1, f2 +; PWR9BE-NEXT: xsmaxdp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v5, v9 +; PWR10LE-NEXT: xvmaxsp vs1, v3, v7 +; PWR10LE-NEXT: xvmaxsp vs2, v2, v6 +; PWR10LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmaxsp vs1, v4, v8 +; PWR10LE-NEXT: xvmaxsp vs1, vs2, vs1 +; PWR10LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f2 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v5, v9 +; PWR10BE-NEXT: xvmaxsp vs1, v3, v7 +; PWR10BE-NEXT: xvmaxsp vs2, v2, v6 +; PWR10BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmaxsp vs1, v4, v8 +; PWR10BE-NEXT: xvmaxsp vs1, vs2, vs1 +; PWR10BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f1, f2 +; PWR10BE-NEXT: xsmaxdp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %a) + ret float %0 +} + +define dso_local float @v32f32_fast(<32 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxsp vs0, v4, v8 +; PWR9LE-NEXT: xvmaxsp vs1, v2, v6 +; PWR9LE-NEXT: xvmaxsp vs2, v5, v9 +; PWR9LE-NEXT: xvmaxsp vs3, v3, v7 +; PWR9LE-NEXT: xvmaxsp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxsp vs0, v4, v8 +; PWR9BE-NEXT: xvmaxsp vs1, v2, v6 +; PWR9BE-NEXT: xvmaxsp vs2, v5, v9 +; PWR9BE-NEXT: xvmaxsp vs3, v3, v7 +; PWR9BE-NEXT: xvmaxsp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxsp vs0, v4, v8 +; PWR10LE-NEXT: xvmaxsp vs1, v2, v6 +; PWR10LE-NEXT: xvmaxsp vs2, v5, v9 +; PWR10LE-NEXT: xvmaxsp vs3, v3, v7 +; PWR10LE-NEXT: xvmaxsp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxsp vs0, v4, v8 +; PWR10BE-NEXT: xvmaxsp vs1, v2, v6 +; PWR10BE-NEXT: xvmaxsp vs2, v5, v9 +; PWR10BE-NEXT: xvmaxsp vs3, v3, v7 +; PWR10BE-NEXT: xvmaxsp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmaxsp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmaxsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmax.v32f32(<32 x float> %a) + ret float %0 +} + +declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) #0 +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) #0 +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) #0 +declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>) #0 +declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>) #0 + +;; +;; Vectors of f64 +;; +define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmaxdp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xsmaxdp f1, v2, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmaxdp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xsmaxdp f1, v2, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xvmaxdp vs0, v2, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xvmaxdp vs1, v2, vs0 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xvmaxdp vs0, v2, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xvmaxdp vs1, v2, vs0 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) + ret double %0 +} + +define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a) + ret double %0 +} + +define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v3, v5 +; PWR9LE-NEXT: xvmaxdp vs1, v2, v4 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v3, v5 +; PWR9BE-NEXT: xvmaxdp vs1, v2, v4 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v3, v5 +; PWR10LE-NEXT: xvmaxdp vs1, v2, v4 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v3, v5 +; PWR10BE-NEXT: xvmaxdp vs1, v2, v4 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v3, v5 +; PWR9LE-NEXT: xvmaxdp vs1, v2, v4 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v3, v5 +; PWR9BE-NEXT: xvmaxdp vs1, v2, v4 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v3, v5 +; PWR10LE-NEXT: xvmaxdp vs1, v2, v4 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v3, v5 +; PWR10BE-NEXT: xvmaxdp vs1, v2, v4 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a) + ret double %0 +} + +define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v5, v9 +; PWR9LE-NEXT: xvmaxdp vs1, v3, v7 +; PWR9LE-NEXT: xvmaxdp vs2, v2, v6 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs1, v4, v8 +; PWR9LE-NEXT: xvmaxdp vs1, vs2, vs1 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v5, v9 +; PWR9BE-NEXT: xvmaxdp vs1, v3, v7 +; PWR9BE-NEXT: xvmaxdp vs2, v2, v6 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, v4, v8 +; PWR9BE-NEXT: xvmaxdp vs1, vs2, vs1 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v5, v9 +; PWR10LE-NEXT: xvmaxdp vs1, v3, v7 +; PWR10LE-NEXT: xvmaxdp vs2, v2, v6 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs1, v4, v8 +; PWR10LE-NEXT: xvmaxdp vs1, vs2, vs1 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v5, v9 +; PWR10BE-NEXT: xvmaxdp vs1, v3, v7 +; PWR10BE-NEXT: xvmaxdp vs2, v2, v6 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, v4, v8 +; PWR10BE-NEXT: xvmaxdp vs1, vs2, vs1 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmaxdp vs0, v4, v8 +; PWR9LE-NEXT: xvmaxdp vs1, v2, v6 +; PWR9LE-NEXT: xvmaxdp vs2, v5, v9 +; PWR9LE-NEXT: xvmaxdp vs3, v3, v7 +; PWR9LE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmaxdp vs0, v4, v8 +; PWR9BE-NEXT: xvmaxdp vs1, v2, v6 +; PWR9BE-NEXT: xvmaxdp vs2, v5, v9 +; PWR9BE-NEXT: xvmaxdp vs3, v3, v7 +; PWR9BE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmaxdp vs0, v4, v8 +; PWR10LE-NEXT: xvmaxdp vs1, v2, v6 +; PWR10LE-NEXT: xvmaxdp vs2, v5, v9 +; PWR10LE-NEXT: xvmaxdp vs3, v3, v7 +; PWR10LE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmaxdp vs0, v4, v8 +; PWR10BE-NEXT: xvmaxdp vs1, v2, v6 +; PWR10BE-NEXT: xvmaxdp vs2, v5, v9 +; PWR10BE-NEXT: xvmaxdp vs3, v3, v7 +; PWR10BE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a) + ret double %0 +} + +define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs3, 272(r1) +; PWR9LE-NEXT: lxv vs2, 240(r1) +; PWR9LE-NEXT: xvmaxdp vs4, v5, v13 +; PWR9LE-NEXT: lxv vs1, 256(r1) +; PWR9LE-NEXT: lxv vs0, 224(r1) +; PWR9LE-NEXT: xvmaxdp vs3, v9, vs3 +; PWR9LE-NEXT: xvmaxdp vs2, v7, vs2 +; PWR9LE-NEXT: xvmaxdp vs1, v8, vs1 +; PWR9LE-NEXT: xvmaxdp vs0, v6, vs0 +; PWR9LE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR9LE-NEXT: xvmaxdp vs4, v3, v11 +; PWR9LE-NEXT: xvmaxdp vs2, vs4, vs2 +; PWR9LE-NEXT: xvmaxdp vs2, vs2, vs3 +; PWR9LE-NEXT: xvmaxdp vs3, v4, v12 +; PWR9LE-NEXT: xvmaxdp vs1, vs3, vs1 +; PWR9LE-NEXT: xvmaxdp vs3, v2, v10 +; PWR9LE-NEXT: xvmaxdp vs0, vs3, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmaxdp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs3, 288(r1) +; PWR9BE-NEXT: lxv vs2, 256(r1) +; PWR9BE-NEXT: xvmaxdp vs4, v5, v13 +; PWR9BE-NEXT: lxv vs1, 272(r1) +; PWR9BE-NEXT: lxv vs0, 240(r1) +; PWR9BE-NEXT: xvmaxdp vs3, v9, vs3 +; PWR9BE-NEXT: xvmaxdp vs2, v7, vs2 +; PWR9BE-NEXT: xvmaxdp vs1, v8, vs1 +; PWR9BE-NEXT: xvmaxdp vs0, v6, vs0 +; PWR9BE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR9BE-NEXT: xvmaxdp vs4, v3, v11 +; PWR9BE-NEXT: xvmaxdp vs2, vs4, vs2 +; PWR9BE-NEXT: xvmaxdp vs2, vs2, vs3 +; PWR9BE-NEXT: xvmaxdp vs3, v4, v12 +; PWR9BE-NEXT: xvmaxdp vs1, vs3, vs1 +; PWR9BE-NEXT: xvmaxdp vs3, v2, v10 +; PWR9BE-NEXT: xvmaxdp vs0, vs3, vs0 +; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmaxdp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs3, 272(r1) +; PWR10LE-NEXT: lxv vs2, 240(r1) +; PWR10LE-NEXT: xvmaxdp vs4, v5, v13 +; PWR10LE-NEXT: xvmaxdp vs3, v9, vs3 +; PWR10LE-NEXT: lxv vs1, 256(r1) +; PWR10LE-NEXT: xvmaxdp vs2, v7, vs2 +; PWR10LE-NEXT: lxv vs0, 224(r1) +; PWR10LE-NEXT: xvmaxdp vs1, v8, vs1 +; PWR10LE-NEXT: xvmaxdp vs0, v6, vs0 +; PWR10LE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR10LE-NEXT: xvmaxdp vs4, v3, v11 +; PWR10LE-NEXT: xvmaxdp vs2, vs4, vs2 +; PWR10LE-NEXT: xvmaxdp vs2, vs2, vs3 +; PWR10LE-NEXT: xvmaxdp vs3, v4, v12 +; PWR10LE-NEXT: xvmaxdp vs1, vs3, vs1 +; PWR10LE-NEXT: xvmaxdp vs3, v2, v10 +; PWR10LE-NEXT: xvmaxdp vs0, vs3, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmaxdp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs3, 288(r1) +; PWR10BE-NEXT: lxv vs2, 256(r1) +; PWR10BE-NEXT: xvmaxdp vs4, v5, v13 +; PWR10BE-NEXT: xvmaxdp vs3, v9, vs3 +; PWR10BE-NEXT: lxv vs1, 272(r1) +; PWR10BE-NEXT: xvmaxdp vs2, v7, vs2 +; PWR10BE-NEXT: lxv vs0, 240(r1) +; PWR10BE-NEXT: xvmaxdp vs1, v8, vs1 +; PWR10BE-NEXT: xvmaxdp vs0, v6, vs0 +; PWR10BE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR10BE-NEXT: xvmaxdp vs4, v3, v11 +; PWR10BE-NEXT: xvmaxdp vs2, vs4, vs2 +; PWR10BE-NEXT: xvmaxdp vs2, vs2, vs3 +; PWR10BE-NEXT: xvmaxdp vs3, v4, v12 +; PWR10BE-NEXT: xvmaxdp vs1, vs3, vs1 +; PWR10BE-NEXT: xvmaxdp vs3, v2, v10 +; PWR10BE-NEXT: xvmaxdp vs0, vs3, vs0 +; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmaxdp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a) + ret double %0 +} + +define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs0, 256(r1) +; PWR9LE-NEXT: lxv vs1, 224(r1) +; PWR9LE-NEXT: lxv vs2, 272(r1) +; PWR9LE-NEXT: lxv vs3, 240(r1) +; PWR9LE-NEXT: xvmaxdp vs4, v3, v11 +; PWR9LE-NEXT: xvmaxdp vs5, v5, v13 +; PWR9LE-NEXT: xvmaxdp vs6, v2, v10 +; PWR9LE-NEXT: xvmaxdp vs7, v4, v12 +; PWR9LE-NEXT: xvmaxdp vs3, v7, vs3 +; PWR9LE-NEXT: xvmaxdp vs2, v9, vs2 +; PWR9LE-NEXT: xvmaxdp vs1, v6, vs1 +; PWR9LE-NEXT: xvmaxdp vs0, v8, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs7, vs0 +; PWR9LE-NEXT: xvmaxdp vs1, vs6, vs1 +; PWR9LE-NEXT: xvmaxdp vs2, vs5, vs2 +; PWR9LE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR9LE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs0, 272(r1) +; PWR9BE-NEXT: lxv vs1, 240(r1) +; PWR9BE-NEXT: lxv vs2, 288(r1) +; PWR9BE-NEXT: lxv vs3, 256(r1) +; PWR9BE-NEXT: xvmaxdp vs4, v3, v11 +; PWR9BE-NEXT: xvmaxdp vs5, v5, v13 +; PWR9BE-NEXT: xvmaxdp vs6, v2, v10 +; PWR9BE-NEXT: xvmaxdp vs7, v4, v12 +; PWR9BE-NEXT: xvmaxdp vs3, v7, vs3 +; PWR9BE-NEXT: xvmaxdp vs2, v9, vs2 +; PWR9BE-NEXT: xvmaxdp vs1, v6, vs1 +; PWR9BE-NEXT: xvmaxdp vs0, v8, vs0 +; PWR9BE-NEXT: xvmaxdp vs0, vs7, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, vs6, vs1 +; PWR9BE-NEXT: xvmaxdp vs2, vs5, vs2 +; PWR9BE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR9BE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs0, 256(r1) +; PWR10LE-NEXT: lxv vs1, 224(r1) +; PWR10LE-NEXT: xvmaxdp vs4, v3, v11 +; PWR10LE-NEXT: xvmaxdp vs5, v5, v13 +; PWR10LE-NEXT: xvmaxdp vs6, v2, v10 +; PWR10LE-NEXT: xvmaxdp vs7, v4, v12 +; PWR10LE-NEXT: xvmaxdp vs1, v6, vs1 +; PWR10LE-NEXT: lxv vs2, 272(r1) +; PWR10LE-NEXT: lxv vs3, 240(r1) +; PWR10LE-NEXT: xvmaxdp vs3, v7, vs3 +; PWR10LE-NEXT: xvmaxdp vs2, v9, vs2 +; PWR10LE-NEXT: xvmaxdp vs0, v8, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs7, vs0 +; PWR10LE-NEXT: xvmaxdp vs1, vs6, vs1 +; PWR10LE-NEXT: xvmaxdp vs2, vs5, vs2 +; PWR10LE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR10LE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs0, 272(r1) +; PWR10BE-NEXT: lxv vs1, 240(r1) +; PWR10BE-NEXT: xvmaxdp vs4, v3, v11 +; PWR10BE-NEXT: xvmaxdp vs5, v5, v13 +; PWR10BE-NEXT: xvmaxdp vs6, v2, v10 +; PWR10BE-NEXT: xvmaxdp vs7, v4, v12 +; PWR10BE-NEXT: xvmaxdp vs1, v6, vs1 +; PWR10BE-NEXT: lxv vs2, 288(r1) +; PWR10BE-NEXT: lxv vs3, 256(r1) +; PWR10BE-NEXT: xvmaxdp vs3, v7, vs3 +; PWR10BE-NEXT: xvmaxdp vs2, v9, vs2 +; PWR10BE-NEXT: xvmaxdp vs0, v8, vs0 +; PWR10BE-NEXT: xvmaxdp vs0, vs7, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, vs6, vs1 +; PWR10BE-NEXT: xvmaxdp vs2, vs5, vs2 +; PWR10BE-NEXT: xvmaxdp vs3, vs4, vs3 +; PWR10BE-NEXT: xvmaxdp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a) + ret double %0 +} + +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) #0 +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) #0 +declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>) #0 +declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>) #0 +declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll new file mode 100644 index 000000000000..e806a702cd62 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll @@ -0,0 +1,1169 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of f32 +;; +define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmindp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmindp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: xvminsp vs0, v2, vs0 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: xvminsp vs0, v2, vs0 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: xvminsp vs0, v2, vs0 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: xvminsp vs0, v2, vs0 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) + ret float %0 +} + +define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs2, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs3, v2 +; PWR9LE-NEXT: xscvspdpn f0, v2 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f3, vs3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmindp f2, f2, f3 +; PWR9LE-NEXT: xsmindp f1, f2, f1 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs2, v2, v2, 1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f3, v2 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmindp f2, f3, f2 +; PWR9BE-NEXT: xsmindp f1, f2, f1 +; PWR9BE-NEXT: xsmindp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs2, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs3, v2 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f0, v2 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xscvspdpn f3, vs3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmindp f2, f2, f3 +; PWR10LE-NEXT: xsmindp f1, f2, f1 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs2, v2, v2, 1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f3, v2 +; PWR10BE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmindp f2, f3, f2 +; PWR10BE-NEXT: xsmindp f1, f2, f1 +; PWR10BE-NEXT: xsmindp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xvminsp vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xvminsp vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xvminsp vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xvminsp vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) + ret float %0 +} + +define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f1, f2 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v2, v3 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xsmindp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f1, f2 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v2, v3 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xsmindp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvminsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvminsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvminsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvminsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a) + ret float %0 +} + +define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v3, v5 +; PWR9LE-NEXT: xvminsp vs1, v2, v4 +; PWR9LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f1, f2 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v3, v5 +; PWR9BE-NEXT: xvminsp vs1, v2, v4 +; PWR9BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xsmindp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v3, v5 +; PWR10LE-NEXT: xvminsp vs1, v2, v4 +; PWR10LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f1, f2 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v3, v5 +; PWR10BE-NEXT: xvminsp vs1, v2, v4 +; PWR10BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xsmindp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v3, v5 +; PWR9LE-NEXT: xvminsp vs1, v2, v4 +; PWR9LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvminsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v3, v5 +; PWR9BE-NEXT: xvminsp vs1, v2, v4 +; PWR9BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvminsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v3, v5 +; PWR10LE-NEXT: xvminsp vs1, v2, v4 +; PWR10LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvminsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v3, v5 +; PWR10BE-NEXT: xvminsp vs1, v2, v4 +; PWR10BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvminsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmin.v16f32(<16 x float> %a) + ret float %0 +} + +define dso_local float @v32f32(<32 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v5, v9 +; PWR9LE-NEXT: xvminsp vs1, v3, v7 +; PWR9LE-NEXT: xvminsp vs2, v2, v6 +; PWR9LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9LE-NEXT: xvminsp vs1, v4, v8 +; PWR9LE-NEXT: xvminsp vs1, vs2, vs1 +; PWR9LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f2, f1 +; PWR9LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f2, vs2 +; PWR9LE-NEXT: xsmindp f1, f1, f2 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v5, v9 +; PWR9BE-NEXT: xvminsp vs1, v3, v7 +; PWR9BE-NEXT: xvminsp vs2, v2, v6 +; PWR9BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9BE-NEXT: xvminsp vs1, v4, v8 +; PWR9BE-NEXT: xvminsp vs1, vs2, vs1 +; PWR9BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xxswapd vs2, vs0 +; PWR9BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9BE-NEXT: xscvspdpn f2, vs2 +; PWR9BE-NEXT: xscvspdpn f0, vs0 +; PWR9BE-NEXT: xsmindp f1, f1, f2 +; PWR9BE-NEXT: xsmindp f1, f1, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v5, v9 +; PWR10LE-NEXT: xvminsp vs1, v3, v7 +; PWR10LE-NEXT: xvminsp vs2, v2, v6 +; PWR10LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10LE-NEXT: xvminsp vs1, v4, v8 +; PWR10LE-NEXT: xvminsp vs1, vs2, vs1 +; PWR10LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f2, f1 +; PWR10LE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f2, vs2 +; PWR10LE-NEXT: xsmindp f1, f1, f2 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v5, v9 +; PWR10BE-NEXT: xvminsp vs1, v3, v7 +; PWR10BE-NEXT: xvminsp vs2, v2, v6 +; PWR10BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10BE-NEXT: xvminsp vs1, v4, v8 +; PWR10BE-NEXT: xvminsp vs1, vs2, vs1 +; PWR10BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxsldwi vs2, vs0, vs0, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xxswapd vs2, vs0 +; PWR10BE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10BE-NEXT: xscvspdpn f2, vs2 +; PWR10BE-NEXT: xscvspdpn f0, vs0 +; PWR10BE-NEXT: xsmindp f1, f1, f2 +; PWR10BE-NEXT: xsmindp f1, f1, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %a) + ret float %0 +} + +define dso_local float @v32f32_fast(<32 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvminsp vs0, v4, v8 +; PWR9LE-NEXT: xvminsp vs1, v2, v6 +; PWR9LE-NEXT: xvminsp vs2, v5, v9 +; PWR9LE-NEXT: xvminsp vs3, v3, v7 +; PWR9LE-NEXT: xvminsp vs2, vs3, vs2 +; PWR9LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9LE-NEXT: xvminsp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvminsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvminsp vs0, v4, v8 +; PWR9BE-NEXT: xvminsp vs1, v2, v6 +; PWR9BE-NEXT: xvminsp vs2, v5, v9 +; PWR9BE-NEXT: xvminsp vs3, v3, v7 +; PWR9BE-NEXT: xvminsp vs2, vs3, vs2 +; PWR9BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR9BE-NEXT: xvminsp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvminsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvminsp vs0, v4, v8 +; PWR10LE-NEXT: xvminsp vs1, v2, v6 +; PWR10LE-NEXT: xvminsp vs2, v5, v9 +; PWR10LE-NEXT: xvminsp vs3, v3, v7 +; PWR10LE-NEXT: xvminsp vs2, vs3, vs2 +; PWR10LE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10LE-NEXT: xvminsp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvminsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvminsp vs0, v4, v8 +; PWR10BE-NEXT: xvminsp vs1, v2, v6 +; PWR10BE-NEXT: xvminsp vs2, v5, v9 +; PWR10BE-NEXT: xvminsp vs3, v3, v7 +; PWR10BE-NEXT: xvminsp vs2, vs3, vs2 +; PWR10BE-NEXT: xvminsp vs0, vs1, vs0 +; PWR10BE-NEXT: xvminsp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvminsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvminsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmin.v32f32(<32 x float> %a) + ret float %0 +} + +declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) #0 +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) #0 +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) #0 +declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>) #0 +declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>) #0 + +;; +;; Vectors of f64 +;; +define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmindp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xsmindp f1, v2, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmindp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xsmindp f1, v2, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xvmindp vs0, v2, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xvmindp vs1, v2, vs0 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xvmindp vs0, v2, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xvmindp vs1, v2, vs0 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) + ret double %0 +} + +define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmindp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmindp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a) + ret double %0 +} + +define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v3, v5 +; PWR9LE-NEXT: xvmindp vs1, v2, v4 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v3, v5 +; PWR9BE-NEXT: xvmindp vs1, v2, v4 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmindp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v3, v5 +; PWR10LE-NEXT: xvmindp vs1, v2, v4 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v3, v5 +; PWR10BE-NEXT: xvmindp vs1, v2, v4 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmindp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v3, v5 +; PWR9LE-NEXT: xvmindp vs1, v2, v4 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v3, v5 +; PWR9BE-NEXT: xvmindp vs1, v2, v4 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v3, v5 +; PWR10LE-NEXT: xvmindp vs1, v2, v4 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v3, v5 +; PWR10BE-NEXT: xvmindp vs1, v2, v4 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a) + ret double %0 +} + +define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v5, v9 +; PWR9LE-NEXT: xvmindp vs1, v3, v7 +; PWR9LE-NEXT: xvmindp vs2, v2, v6 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmindp vs1, v4, v8 +; PWR9LE-NEXT: xvmindp vs1, vs2, vs1 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v5, v9 +; PWR9BE-NEXT: xvmindp vs1, v3, v7 +; PWR9BE-NEXT: xvmindp vs2, v2, v6 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmindp vs1, v4, v8 +; PWR9BE-NEXT: xvmindp vs1, vs2, vs1 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmindp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v5, v9 +; PWR10LE-NEXT: xvmindp vs1, v3, v7 +; PWR10LE-NEXT: xvmindp vs2, v2, v6 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmindp vs1, v4, v8 +; PWR10LE-NEXT: xvmindp vs1, vs2, vs1 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v5, v9 +; PWR10BE-NEXT: xvmindp vs1, v3, v7 +; PWR10BE-NEXT: xvmindp vs2, v2, v6 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmindp vs1, v4, v8 +; PWR10BE-NEXT: xvmindp vs1, vs2, vs1 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmindp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmindp vs0, v4, v8 +; PWR9LE-NEXT: xvmindp vs1, v2, v6 +; PWR9LE-NEXT: xvmindp vs2, v5, v9 +; PWR9LE-NEXT: xvmindp vs3, v3, v7 +; PWR9LE-NEXT: xvmindp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmindp vs0, v4, v8 +; PWR9BE-NEXT: xvmindp vs1, v2, v6 +; PWR9BE-NEXT: xvmindp vs2, v5, v9 +; PWR9BE-NEXT: xvmindp vs3, v3, v7 +; PWR9BE-NEXT: xvmindp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmindp vs0, v4, v8 +; PWR10LE-NEXT: xvmindp vs1, v2, v6 +; PWR10LE-NEXT: xvmindp vs2, v5, v9 +; PWR10LE-NEXT: xvmindp vs3, v3, v7 +; PWR10LE-NEXT: xvmindp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmindp vs0, v4, v8 +; PWR10BE-NEXT: xvmindp vs1, v2, v6 +; PWR10BE-NEXT: xvmindp vs2, v5, v9 +; PWR10BE-NEXT: xvmindp vs3, v3, v7 +; PWR10BE-NEXT: xvmindp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a) + ret double %0 +} + +define dso_local double @v32f64(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs3, 272(r1) +; PWR9LE-NEXT: lxv vs2, 240(r1) +; PWR9LE-NEXT: xvmindp vs4, v5, v13 +; PWR9LE-NEXT: lxv vs1, 256(r1) +; PWR9LE-NEXT: lxv vs0, 224(r1) +; PWR9LE-NEXT: xvmindp vs3, v9, vs3 +; PWR9LE-NEXT: xvmindp vs2, v7, vs2 +; PWR9LE-NEXT: xvmindp vs1, v8, vs1 +; PWR9LE-NEXT: xvmindp vs0, v6, vs0 +; PWR9LE-NEXT: xvmindp vs3, vs4, vs3 +; PWR9LE-NEXT: xvmindp vs4, v3, v11 +; PWR9LE-NEXT: xvmindp vs2, vs4, vs2 +; PWR9LE-NEXT: xvmindp vs2, vs2, vs3 +; PWR9LE-NEXT: xvmindp vs3, v4, v12 +; PWR9LE-NEXT: xvmindp vs1, vs3, vs1 +; PWR9LE-NEXT: xvmindp vs3, v2, v10 +; PWR9LE-NEXT: xvmindp vs0, vs3, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xsmindp f1, f1, f0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs3, 288(r1) +; PWR9BE-NEXT: lxv vs2, 256(r1) +; PWR9BE-NEXT: xvmindp vs4, v5, v13 +; PWR9BE-NEXT: lxv vs1, 272(r1) +; PWR9BE-NEXT: lxv vs0, 240(r1) +; PWR9BE-NEXT: xvmindp vs3, v9, vs3 +; PWR9BE-NEXT: xvmindp vs2, v7, vs2 +; PWR9BE-NEXT: xvmindp vs1, v8, vs1 +; PWR9BE-NEXT: xvmindp vs0, v6, vs0 +; PWR9BE-NEXT: xvmindp vs3, vs4, vs3 +; PWR9BE-NEXT: xvmindp vs4, v3, v11 +; PWR9BE-NEXT: xvmindp vs2, vs4, vs2 +; PWR9BE-NEXT: xvmindp vs2, vs2, vs3 +; PWR9BE-NEXT: xvmindp vs3, v4, v12 +; PWR9BE-NEXT: xvmindp vs1, vs3, vs1 +; PWR9BE-NEXT: xvmindp vs3, v2, v10 +; PWR9BE-NEXT: xvmindp vs0, vs3, vs0 +; PWR9BE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xsmindp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs3, 272(r1) +; PWR10LE-NEXT: lxv vs2, 240(r1) +; PWR10LE-NEXT: xvmindp vs4, v5, v13 +; PWR10LE-NEXT: xvmindp vs3, v9, vs3 +; PWR10LE-NEXT: lxv vs1, 256(r1) +; PWR10LE-NEXT: xvmindp vs2, v7, vs2 +; PWR10LE-NEXT: lxv vs0, 224(r1) +; PWR10LE-NEXT: xvmindp vs1, v8, vs1 +; PWR10LE-NEXT: xvmindp vs0, v6, vs0 +; PWR10LE-NEXT: xvmindp vs3, vs4, vs3 +; PWR10LE-NEXT: xvmindp vs4, v3, v11 +; PWR10LE-NEXT: xvmindp vs2, vs4, vs2 +; PWR10LE-NEXT: xvmindp vs2, vs2, vs3 +; PWR10LE-NEXT: xvmindp vs3, v4, v12 +; PWR10LE-NEXT: xvmindp vs1, vs3, vs1 +; PWR10LE-NEXT: xvmindp vs3, v2, v10 +; PWR10LE-NEXT: xvmindp vs0, vs3, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xsmindp f1, f1, f0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs3, 288(r1) +; PWR10BE-NEXT: lxv vs2, 256(r1) +; PWR10BE-NEXT: xvmindp vs4, v5, v13 +; PWR10BE-NEXT: xvmindp vs3, v9, vs3 +; PWR10BE-NEXT: lxv vs1, 272(r1) +; PWR10BE-NEXT: xvmindp vs2, v7, vs2 +; PWR10BE-NEXT: lxv vs0, 240(r1) +; PWR10BE-NEXT: xvmindp vs1, v8, vs1 +; PWR10BE-NEXT: xvmindp vs0, v6, vs0 +; PWR10BE-NEXT: xvmindp vs3, vs4, vs3 +; PWR10BE-NEXT: xvmindp vs4, v3, v11 +; PWR10BE-NEXT: xvmindp vs2, vs4, vs2 +; PWR10BE-NEXT: xvmindp vs2, vs2, vs3 +; PWR10BE-NEXT: xvmindp vs3, v4, v12 +; PWR10BE-NEXT: xvmindp vs1, vs3, vs1 +; PWR10BE-NEXT: xvmindp vs3, v2, v10 +; PWR10BE-NEXT: xvmindp vs0, vs3, vs0 +; PWR10BE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xsmindp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a) + ret double %0 +} + +define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v32f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: lxv vs0, 256(r1) +; PWR9LE-NEXT: lxv vs1, 224(r1) +; PWR9LE-NEXT: lxv vs2, 272(r1) +; PWR9LE-NEXT: lxv vs3, 240(r1) +; PWR9LE-NEXT: xvmindp vs4, v3, v11 +; PWR9LE-NEXT: xvmindp vs5, v5, v13 +; PWR9LE-NEXT: xvmindp vs6, v2, v10 +; PWR9LE-NEXT: xvmindp vs7, v4, v12 +; PWR9LE-NEXT: xvmindp vs3, v7, vs3 +; PWR9LE-NEXT: xvmindp vs2, v9, vs2 +; PWR9LE-NEXT: xvmindp vs1, v6, vs1 +; PWR9LE-NEXT: xvmindp vs0, v8, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs7, vs0 +; PWR9LE-NEXT: xvmindp vs1, vs6, vs1 +; PWR9LE-NEXT: xvmindp vs2, vs5, vs2 +; PWR9LE-NEXT: xvmindp vs3, vs4, vs3 +; PWR9LE-NEXT: xvmindp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v32f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: lxv vs0, 272(r1) +; PWR9BE-NEXT: lxv vs1, 240(r1) +; PWR9BE-NEXT: lxv vs2, 288(r1) +; PWR9BE-NEXT: lxv vs3, 256(r1) +; PWR9BE-NEXT: xvmindp vs4, v3, v11 +; PWR9BE-NEXT: xvmindp vs5, v5, v13 +; PWR9BE-NEXT: xvmindp vs6, v2, v10 +; PWR9BE-NEXT: xvmindp vs7, v4, v12 +; PWR9BE-NEXT: xvmindp vs3, v7, vs3 +; PWR9BE-NEXT: xvmindp vs2, v9, vs2 +; PWR9BE-NEXT: xvmindp vs1, v6, vs1 +; PWR9BE-NEXT: xvmindp vs0, v8, vs0 +; PWR9BE-NEXT: xvmindp vs0, vs7, vs0 +; PWR9BE-NEXT: xvmindp vs1, vs6, vs1 +; PWR9BE-NEXT: xvmindp vs2, vs5, vs2 +; PWR9BE-NEXT: xvmindp vs3, vs4, vs3 +; PWR9BE-NEXT: xvmindp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v32f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: lxv vs0, 256(r1) +; PWR10LE-NEXT: lxv vs1, 224(r1) +; PWR10LE-NEXT: xvmindp vs4, v3, v11 +; PWR10LE-NEXT: xvmindp vs5, v5, v13 +; PWR10LE-NEXT: xvmindp vs6, v2, v10 +; PWR10LE-NEXT: xvmindp vs7, v4, v12 +; PWR10LE-NEXT: xvmindp vs1, v6, vs1 +; PWR10LE-NEXT: lxv vs2, 272(r1) +; PWR10LE-NEXT: lxv vs3, 240(r1) +; PWR10LE-NEXT: xvmindp vs3, v7, vs3 +; PWR10LE-NEXT: xvmindp vs2, v9, vs2 +; PWR10LE-NEXT: xvmindp vs0, v8, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs7, vs0 +; PWR10LE-NEXT: xvmindp vs1, vs6, vs1 +; PWR10LE-NEXT: xvmindp vs2, vs5, vs2 +; PWR10LE-NEXT: xvmindp vs3, vs4, vs3 +; PWR10LE-NEXT: xvmindp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmindp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v32f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: lxv vs0, 272(r1) +; PWR10BE-NEXT: lxv vs1, 240(r1) +; PWR10BE-NEXT: xvmindp vs4, v3, v11 +; PWR10BE-NEXT: xvmindp vs5, v5, v13 +; PWR10BE-NEXT: xvmindp vs6, v2, v10 +; PWR10BE-NEXT: xvmindp vs7, v4, v12 +; PWR10BE-NEXT: xvmindp vs1, v6, vs1 +; PWR10BE-NEXT: lxv vs2, 288(r1) +; PWR10BE-NEXT: lxv vs3, 256(r1) +; PWR10BE-NEXT: xvmindp vs3, v7, vs3 +; PWR10BE-NEXT: xvmindp vs2, v9, vs2 +; PWR10BE-NEXT: xvmindp vs0, v8, vs0 +; PWR10BE-NEXT: xvmindp vs0, vs7, vs0 +; PWR10BE-NEXT: xvmindp vs1, vs6, vs1 +; PWR10BE-NEXT: xvmindp vs2, vs5, vs2 +; PWR10BE-NEXT: xvmindp vs3, vs4, vs3 +; PWR10BE-NEXT: xvmindp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmindp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmindp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmindp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a) + ret double %0 +} + +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) #0 +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) #0 +declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>) #0 +declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>) #0 +declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll new file mode 100644 index 000000000000..e123f5c2056d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll @@ -0,0 +1,1717 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64le < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mattr=-paired-vector-memops -mtriple=powerpc64 < %s | \ +; RUN: FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of f32 +;; +define dso_local float @v2f32(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v2f32(float 1.000000e+00, <2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_b(<2 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsmulsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsmulsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsmulsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsmulsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v2f32(float %b, <2 x float> %a) + ret float %0 +} + +define dso_local float @v2f32_fast(<2 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: xvmulsp vs0, v2, vs0 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: xvmulsp vs0, v2, vs0 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: xvmulsp vs0, v2, vs0 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: xvmulsp vs0, v2, vs0 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmul.v2f32(float 1.000000e+00, <2 x float> %a) + ret float %0 +} + +define dso_local float @v4f32(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_b(<4 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsmulsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsmulsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsmulsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsmulsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v4f32(float %b, <4 x float> %a) + ret float %0 +} + +define dso_local float @v4f32_fast(<4 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xvmulsp vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xvmulsp vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xvmulsp vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xvmulsp vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %a) + ret float %0 +} + +define dso_local float @v8f32(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_b(<8 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsmulsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsmulsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsmulsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsmulsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v8f32(float %b, <8 x float> %a) + ret float %0 +} + +define dso_local float @v8f32_fast(<8 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmulsp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvmulsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmulsp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvmulsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmulsp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvmulsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmulsp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvmulsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> %a) + ret float %0 +} + +define dso_local float @v16f32(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v4 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v5 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v4 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v5 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v4 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v5 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v4 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v5 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_b(<16 x float> %a, float %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR9LE-NEXT: xscvspdpn f0, vs0 +; PWR9LE-NEXT: xsmulsp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v2 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v2 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v3 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v4 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9LE-NEXT: xscvspdpn f1, vs1 +; PWR9LE-NEXT: xsmulsp f0, f0, f1 +; PWR9LE-NEXT: xscvspdpn f1, v5 +; PWR9LE-NEXT: xsmulsp f1, f0, f1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xscvspdpn f0, v2 +; PWR9BE-NEXT: xsmulsp f0, f1, f0 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v3 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v4 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xscvspdpn f1, v5 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f0, f0, f1 +; PWR9BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR9BE-NEXT: xscvspdpn f1, vs1 +; PWR9BE-NEXT: xsmulsp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxsldwi vs0, v2, v2, 3 +; PWR10LE-NEXT: xscvspdpn f0, vs0 +; PWR10LE-NEXT: xsmulsp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v2 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v2 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v3 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v4 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10LE-NEXT: xscvspdpn f1, vs1 +; PWR10LE-NEXT: xsmulsp f0, f0, f1 +; PWR10LE-NEXT: xscvspdpn f1, v5 +; PWR10LE-NEXT: xsmulsp f1, f0, f1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xscvspdpn f0, v2 +; PWR10BE-NEXT: xsmulsp f0, f1, f0 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v2, v2, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v3 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v3, v3, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v4 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v4, v4, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xscvspdpn f1, v5 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 1 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f0, f0, f1 +; PWR10BE-NEXT: xxsldwi vs1, v5, v5, 3 +; PWR10BE-NEXT: xscvspdpn f1, vs1 +; PWR10BE-NEXT: xsmulsp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call float @llvm.vector.reduce.fmul.v16f32(float %b, <16 x float> %a) + ret float %0 +} + +define dso_local float @v16f32_fast(<16 x float> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f32_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmulsp vs0, v3, v5 +; PWR9LE-NEXT: xvmulsp vs1, v2, v4 +; PWR9LE-NEXT: xvmulsp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xvmulsp vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR9LE-NEXT: xscvspdpn f1, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f32_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmulsp vs0, v3, v5 +; PWR9BE-NEXT: xvmulsp vs1, v2, v4 +; PWR9BE-NEXT: xvmulsp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xvmulsp vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR9BE-NEXT: xscvspdpn f1, vs0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f32_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmulsp vs0, v3, v5 +; PWR10LE-NEXT: xvmulsp vs1, v2, v4 +; PWR10LE-NEXT: xvmulsp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xvmulsp vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10LE-NEXT: xxsldwi vs0, vs0, vs0, 3 +; PWR10LE-NEXT: xscvspdpn f1, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f32_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmulsp vs0, v3, v5 +; PWR10BE-NEXT: xvmulsp vs1, v2, v4 +; PWR10BE-NEXT: xvmulsp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xvmulsp vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xvmulsp vs0, vs0, vs1 +; PWR10BE-NEXT: xscvspdpn f1, vs0 +; PWR10BE-NEXT: blr +entry: + %0 = call fast float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %a) + ret float %0 +} + +declare float @llvm.vector.reduce.fmul.v2f32(float, <2 x float>) #0 +declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) #0 +declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) #0 +declare float @llvm.vector.reduce.fmul.v16f32(float, <16 x float>) #0 + +;; +;; Vectors of f64 +;; +define dso_local double @v2f64(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmuldp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xsmuldp f1, v2, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmuldp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xsmuldp f1, v2, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_b(<2 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmuldp f0, f1, f0 +; PWR9LE-NEXT: xsmuldp f1, f0, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsmuldp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmuldp f0, f1, f0 +; PWR10LE-NEXT: xsmuldp f1, f0, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsmuldp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v2f64(double %b, <2 x double> %a) + ret double %0 +} + +define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xvmuldp vs0, v2, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xvmuldp vs1, v2, vs0 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xvmuldp vs0, v2, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xvmuldp vs1, v2, vs0 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a) + ret double %0 +} + +define dso_local double @v4f64(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, v2, f0 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, v2, f0 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_b(<4 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmuldp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v3 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsmuldp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmuldp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v3 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsmuldp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v4f64(double %b, <4 x double> %a) + ret double %0 +} + +define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmuldp vs0, v2, v3 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmuldp vs0, v2, v3 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmuldp vs0, v2, v3 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmuldp vs0, v2, v3 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a) + ret double %0 +} + +define dso_local double @v8f64(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v5 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, v2, f0 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, v5 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v5 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, v2, f0 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, v5 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_b(<8 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmuldp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v5 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsmuldp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, v5 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmuldp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v5 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsmuldp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, v5 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v8f64(double %b, <8 x double> %a) + ret double %0 +} + +define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmuldp vs0, v3, v5 +; PWR9LE-NEXT: xvmuldp vs1, v2, v4 +; PWR9LE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmuldp vs0, v3, v5 +; PWR9BE-NEXT: xvmuldp vs1, v2, v4 +; PWR9BE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmuldp vs0, v3, v5 +; PWR10LE-NEXT: xvmuldp vs1, v2, v4 +; PWR10LE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmuldp vs0, v3, v5 +; PWR10BE-NEXT: xvmuldp vs1, v2, v4 +; PWR10BE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a) + ret double %0 +} + +define dso_local double @v16f64(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v6 +; PWR9LE-NEXT: xsmuldp f0, f0, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v7 +; PWR9LE-NEXT: xsmuldp f0, f0, v6 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v8 +; PWR9LE-NEXT: xsmuldp f0, f0, v7 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v9 +; PWR9LE-NEXT: xsmuldp f0, f0, v8 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v9 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd vs0, v2 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, v2, f0 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v6 +; PWR9BE-NEXT: xsmuldp f0, f0, v6 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v7 +; PWR9BE-NEXT: xsmuldp f0, f0, v7 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v8 +; PWR9BE-NEXT: xsmuldp f0, f0, v8 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v9 +; PWR9BE-NEXT: xsmuldp f0, f0, v9 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v6 +; PWR10LE-NEXT: xsmuldp f0, f0, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v7 +; PWR10LE-NEXT: xsmuldp f0, f0, v6 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v8 +; PWR10LE-NEXT: xsmuldp f0, f0, v7 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v9 +; PWR10LE-NEXT: xsmuldp f0, f0, v8 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v9 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd vs0, v2 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, v2, f0 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v6 +; PWR10BE-NEXT: xsmuldp f0, f0, v6 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v7 +; PWR10BE-NEXT: xsmuldp f0, f0, v7 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v8 +; PWR10BE-NEXT: xsmuldp f0, f0, v8 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v9 +; PWR10BE-NEXT: xsmuldp f0, f0, v9 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_b(<16 x double> %a, double %b) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_b: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd vs0, v2 +; PWR9LE-NEXT: xsmuldp f0, f1, f0 +; PWR9LE-NEXT: xxswapd vs1, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, v2 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, v3 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, v4 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v6 +; PWR9LE-NEXT: xsmuldp f0, f0, v5 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v7 +; PWR9LE-NEXT: xsmuldp f0, f0, v6 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v8 +; PWR9LE-NEXT: xsmuldp f0, f0, v7 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xxswapd vs1, v9 +; PWR9LE-NEXT: xsmuldp f0, f0, v8 +; PWR9LE-NEXT: xsmuldp f0, f0, f1 +; PWR9LE-NEXT: xsmuldp f1, f0, v9 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_b: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xsmuldp f0, f1, v2 +; PWR9BE-NEXT: xxswapd vs1, v2 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, v3 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, v4 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, v5 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v6 +; PWR9BE-NEXT: xsmuldp f0, f0, v6 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v7 +; PWR9BE-NEXT: xsmuldp f0, f0, v7 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v8 +; PWR9BE-NEXT: xsmuldp f0, f0, v8 +; PWR9BE-NEXT: xsmuldp f0, f0, f1 +; PWR9BE-NEXT: xxswapd vs1, v9 +; PWR9BE-NEXT: xsmuldp f0, f0, v9 +; PWR9BE-NEXT: xsmuldp f1, f0, f1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_b: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd vs0, v2 +; PWR10LE-NEXT: xsmuldp f0, f1, f0 +; PWR10LE-NEXT: xxswapd vs1, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, v2 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, v3 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, v4 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v6 +; PWR10LE-NEXT: xsmuldp f0, f0, v5 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v7 +; PWR10LE-NEXT: xsmuldp f0, f0, v6 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v8 +; PWR10LE-NEXT: xsmuldp f0, f0, v7 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xxswapd vs1, v9 +; PWR10LE-NEXT: xsmuldp f0, f0, v8 +; PWR10LE-NEXT: xsmuldp f0, f0, f1 +; PWR10LE-NEXT: xsmuldp f1, f0, v9 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_b: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xsmuldp f0, f1, v2 +; PWR10BE-NEXT: xxswapd vs1, v2 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, v3 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, v4 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, v5 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v6 +; PWR10BE-NEXT: xsmuldp f0, f0, v6 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v7 +; PWR10BE-NEXT: xsmuldp f0, f0, v7 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v8 +; PWR10BE-NEXT: xsmuldp f0, f0, v8 +; PWR10BE-NEXT: xsmuldp f0, f0, f1 +; PWR10BE-NEXT: xxswapd vs1, v9 +; PWR10BE-NEXT: xsmuldp f0, f0, v9 +; PWR10BE-NEXT: xsmuldp f1, f0, f1 +; PWR10BE-NEXT: blr +entry: + %0 = call double @llvm.vector.reduce.fmul.v16f64(double %b, <16 x double> %a) + ret double %0 +} + +define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16f64_fast: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xvmuldp vs0, v4, v8 +; PWR9LE-NEXT: xvmuldp vs1, v2, v6 +; PWR9LE-NEXT: xvmuldp vs2, v5, v9 +; PWR9LE-NEXT: xvmuldp vs3, v3, v7 +; PWR9LE-NEXT: xvmuldp vs2, vs3, vs2 +; PWR9LE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR9LE-NEXT: xvmuldp vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR9LE-NEXT: xxswapd vs1, vs0 +; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16f64_fast: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xvmuldp vs0, v4, v8 +; PWR9BE-NEXT: xvmuldp vs1, v2, v6 +; PWR9BE-NEXT: xvmuldp vs2, v5, v9 +; PWR9BE-NEXT: xvmuldp vs3, v3, v7 +; PWR9BE-NEXT: xvmuldp vs2, vs3, vs2 +; PWR9BE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR9BE-NEXT: xvmuldp vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd vs1, vs0 +; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16f64_fast: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xvmuldp vs0, v4, v8 +; PWR10LE-NEXT: xvmuldp vs1, v2, v6 +; PWR10LE-NEXT: xvmuldp vs2, v5, v9 +; PWR10LE-NEXT: xvmuldp vs3, v3, v7 +; PWR10LE-NEXT: xvmuldp vs2, vs3, vs2 +; PWR10LE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR10LE-NEXT: xvmuldp vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1 +; PWR10LE-NEXT: xxswapd vs1, vs0 +; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16f64_fast: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xvmuldp vs0, v4, v8 +; PWR10BE-NEXT: xvmuldp vs1, v2, v6 +; PWR10BE-NEXT: xvmuldp vs2, v5, v9 +; PWR10BE-NEXT: xvmuldp vs3, v3, v7 +; PWR10BE-NEXT: xvmuldp vs2, vs3, vs2 +; PWR10BE-NEXT: xvmuldp vs0, vs1, vs0 +; PWR10BE-NEXT: xvmuldp vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd vs1, vs0 +; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1 +; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PWR10BE-NEXT: blr +entry: + %0 = call fast double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a) + ret double %0 +} + +declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) #0 +declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) #0 +declare double @llvm.vector.reduce.fmul.v8f64(double, <8 x double>) #0 +declare double @llvm.vector.reduce.fmul.v16f64(double, <16 x double>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll new file mode 100644 index 000000000000..fc51f6aadee1 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-mul.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmuluwm v3, v3, v5 +; PWR9LE-NEXT: vmuluwm v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmuluwm v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmuluwm v3, v3, v5 +; PWR9BE-NEXT: vmuluwm v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmuluwm v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmuluwm v3, v3, v5 +; PWR10LE-NEXT: vmuluwm v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmuluwm v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmuluwm v3, v3, v5 +; PWR10BE-NEXT: vmuluwm v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmuluwm v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll new file mode 100644 index 000000000000..eb884d6f12d8 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-or.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlor v2, v2, vs0 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlor v2, v2, vs0 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlor v2, v2, vs0 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlor v2, v2, vs0 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlor vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlor vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlor vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlor vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlor vs0, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlor vs0, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlor vs0, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlor vs0, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlor vs0, v3, v5 +; PWR9LE-NEXT: xxlor vs1, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlor vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlor vs0, v3, v5 +; PWR9BE-NEXT: xxlor vs1, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlor vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlor vs0, v3, v5 +; PWR10LE-NEXT: xxlor vs1, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlor vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlor vs0, v3, v5 +; PWR10BE-NEXT: xxlor vs1, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlor vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xxlor vs0, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xxlor vs0, v2, v3 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xxlor vs0, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xxlor vs0, v2, v3 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlor vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlor vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlor vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlor vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlor vs0, v3, v5 +; PWR9LE-NEXT: xxlor vs1, v2, v4 +; PWR9LE-NEXT: xxlor vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlor vs0, v3, v5 +; PWR9BE-NEXT: xxlor vs1, v2, v4 +; PWR9BE-NEXT: xxlor vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlor vs0, v3, v5 +; PWR10LE-NEXT: xxlor vs1, v2, v4 +; PWR10LE-NEXT: xxlor vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlor vs0, v3, v5 +; PWR10BE-NEXT: xxlor vs1, v2, v4 +; PWR10BE-NEXT: xxlor vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlor vs0, v4, v8 +; PWR9LE-NEXT: xxlor vs1, v2, v6 +; PWR9LE-NEXT: xxlor vs2, v5, v9 +; PWR9LE-NEXT: xxlor vs3, v3, v7 +; PWR9LE-NEXT: xxlor vs2, vs3, vs2 +; PWR9LE-NEXT: xxlor vs0, vs1, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlor vs0, v4, v8 +; PWR9BE-NEXT: xxlor vs1, v2, v6 +; PWR9BE-NEXT: xxlor vs2, v5, v9 +; PWR9BE-NEXT: xxlor vs3, v3, v7 +; PWR9BE-NEXT: xxlor vs2, vs3, vs2 +; PWR9BE-NEXT: xxlor vs0, vs1, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlor vs0, v4, v8 +; PWR10LE-NEXT: xxlor vs1, v2, v6 +; PWR10LE-NEXT: xxlor vs2, v5, v9 +; PWR10LE-NEXT: xxlor vs3, v3, v7 +; PWR10LE-NEXT: xxlor vs2, vs3, vs2 +; PWR10LE-NEXT: xxlor vs0, vs1, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlor vs0, v4, v8 +; PWR10BE-NEXT: xxlor vs1, v2, v6 +; PWR10BE-NEXT: xxlor vs2, v5, v9 +; PWR10BE-NEXT: xxlor vs3, v3, v7 +; PWR10BE-NEXT: xxlor vs2, vs3, vs2 +; PWR10BE-NEXT: xxlor vs0, vs1, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll new file mode 100644 index 000000000000..2f01e00373f2 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-smax.ll @@ -0,0 +1,796 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i8 +;; +define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) + ret i8 %0 +} + +declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) #0 +declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) #0 +declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) #0 +declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) #0 + +;; +;; Vectors of type i16 +;; +define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a) + ret i16 %0 +} + +declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) #0 +declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) #0 +declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) #0 +declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) #0 + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsw v3, v3, v5 +; PWR9LE-NEXT: vmaxsw v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsw v3, v3, v5 +; PWR9BE-NEXT: vmaxsw v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsw v3, v3, v5 +; PWR10LE-NEXT: vmaxsw v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsw v3, v3, v5 +; PWR10BE-NEXT: vmaxsw v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsd v2, v2, v4 +; PWR9LE-NEXT: vmaxsd v3, v3, v5 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsd v2, v2, v4 +; PWR9BE-NEXT: vmaxsd v3, v3, v5 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsd v2, v2, v4 +; PWR10LE-NEXT: vmaxsd v3, v3, v5 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsd v2, v2, v4 +; PWR10BE-NEXT: vmaxsd v3, v3, v5 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxsd v3, v3, v7 +; PWR9LE-NEXT: vmaxsd v5, v5, v9 +; PWR9LE-NEXT: vmaxsd v2, v2, v6 +; PWR9LE-NEXT: vmaxsd v4, v4, v8 +; PWR9LE-NEXT: vmaxsd v2, v2, v4 +; PWR9LE-NEXT: vmaxsd v3, v3, v5 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxsd v3, v3, v7 +; PWR9BE-NEXT: vmaxsd v5, v5, v9 +; PWR9BE-NEXT: vmaxsd v2, v2, v6 +; PWR9BE-NEXT: vmaxsd v4, v4, v8 +; PWR9BE-NEXT: vmaxsd v2, v2, v4 +; PWR9BE-NEXT: vmaxsd v3, v3, v5 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxsd v3, v3, v7 +; PWR10LE-NEXT: vmaxsd v5, v5, v9 +; PWR10LE-NEXT: vmaxsd v2, v2, v6 +; PWR10LE-NEXT: vmaxsd v4, v4, v8 +; PWR10LE-NEXT: vmaxsd v2, v2, v4 +; PWR10LE-NEXT: vmaxsd v3, v3, v5 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxsd v3, v3, v7 +; PWR10BE-NEXT: vmaxsd v5, v5, v9 +; PWR10BE-NEXT: vmaxsd v2, v2, v6 +; PWR10BE-NEXT: vmaxsd v4, v4, v8 +; PWR10BE-NEXT: vmaxsd v2, v2, v4 +; PWR10BE-NEXT: vmaxsd v3, v3, v5 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll new file mode 100644 index 000000000000..9483a9918c5f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-smin.ll @@ -0,0 +1,796 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i8 +;; +define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminsb v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminsb v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminsb v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminsb v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) + ret i8 %0 +} + +declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) #0 +declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) #0 +declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) #0 +declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) #0 + +;; +;; Vectors of type i16 +;; +define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminsh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminsh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminsh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminsh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a) + ret i16 %0 +} + +declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) #0 +declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) #0 +declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) #0 +declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) #0 + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsw v3, v3, v5 +; PWR9LE-NEXT: vminsw v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminsw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsw v3, v3, v5 +; PWR9BE-NEXT: vminsw v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminsw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsw v3, v3, v5 +; PWR10LE-NEXT: vminsw v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminsw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsw v3, v3, v5 +; PWR10BE-NEXT: vminsw v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminsw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsd v2, v2, v4 +; PWR9LE-NEXT: vminsd v3, v3, v5 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsd v2, v2, v4 +; PWR9BE-NEXT: vminsd v3, v3, v5 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsd v2, v2, v4 +; PWR10LE-NEXT: vminsd v3, v3, v5 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsd v2, v2, v4 +; PWR10BE-NEXT: vminsd v3, v3, v5 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminsd v3, v3, v7 +; PWR9LE-NEXT: vminsd v5, v5, v9 +; PWR9LE-NEXT: vminsd v2, v2, v6 +; PWR9LE-NEXT: vminsd v4, v4, v8 +; PWR9LE-NEXT: vminsd v2, v2, v4 +; PWR9LE-NEXT: vminsd v3, v3, v5 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminsd v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminsd v3, v3, v7 +; PWR9BE-NEXT: vminsd v5, v5, v9 +; PWR9BE-NEXT: vminsd v2, v2, v6 +; PWR9BE-NEXT: vminsd v4, v4, v8 +; PWR9BE-NEXT: vminsd v2, v2, v4 +; PWR9BE-NEXT: vminsd v3, v3, v5 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminsd v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminsd v3, v3, v7 +; PWR10LE-NEXT: vminsd v5, v5, v9 +; PWR10LE-NEXT: vminsd v2, v2, v6 +; PWR10LE-NEXT: vminsd v4, v4, v8 +; PWR10LE-NEXT: vminsd v2, v2, v4 +; PWR10LE-NEXT: vminsd v3, v3, v5 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminsd v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminsd v3, v3, v7 +; PWR10BE-NEXT: vminsd v5, v5, v9 +; PWR10BE-NEXT: vminsd v2, v2, v6 +; PWR10BE-NEXT: vminsd v4, v4, v8 +; PWR10BE-NEXT: vminsd v2, v2, v4 +; PWR10BE-NEXT: vminsd v3, v3, v5 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminsd v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll new file mode 100644 index 000000000000..e9fdca0ef2a0 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-umax.ll @@ -0,0 +1,796 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i8 +;; +define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vmaxub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vmaxub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vmaxub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vmaxub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) + ret i8 %0 +} + +declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) #0 +declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) #0 +declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) #0 +declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) #0 + +;; +;; Vectors of type i16 +;; +define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vmaxuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vmaxuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vmaxuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vmaxuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a) + ret i16 %0 +} + +declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) #0 +declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) #0 +declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) #0 +declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) #0 + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxuw v3, v3, v5 +; PWR9LE-NEXT: vmaxuw v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vmaxuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxuw v3, v3, v5 +; PWR9BE-NEXT: vmaxuw v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vmaxuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxuw v3, v3, v5 +; PWR10LE-NEXT: vmaxuw v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vmaxuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxuw v3, v3, v5 +; PWR10BE-NEXT: vmaxuw v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vmaxuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxud v2, v2, v4 +; PWR9LE-NEXT: vmaxud v3, v3, v5 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxud v2, v2, v4 +; PWR9BE-NEXT: vmaxud v3, v3, v5 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxud v2, v2, v4 +; PWR10LE-NEXT: vmaxud v3, v3, v5 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxud v2, v2, v4 +; PWR10BE-NEXT: vmaxud v3, v3, v5 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vmaxud v3, v3, v7 +; PWR9LE-NEXT: vmaxud v5, v5, v9 +; PWR9LE-NEXT: vmaxud v2, v2, v6 +; PWR9LE-NEXT: vmaxud v4, v4, v8 +; PWR9LE-NEXT: vmaxud v2, v2, v4 +; PWR9LE-NEXT: vmaxud v3, v3, v5 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vmaxud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vmaxud v3, v3, v7 +; PWR9BE-NEXT: vmaxud v5, v5, v9 +; PWR9BE-NEXT: vmaxud v2, v2, v6 +; PWR9BE-NEXT: vmaxud v4, v4, v8 +; PWR9BE-NEXT: vmaxud v2, v2, v4 +; PWR9BE-NEXT: vmaxud v3, v3, v5 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vmaxud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vmaxud v3, v3, v7 +; PWR10LE-NEXT: vmaxud v5, v5, v9 +; PWR10LE-NEXT: vmaxud v2, v2, v6 +; PWR10LE-NEXT: vmaxud v4, v4, v8 +; PWR10LE-NEXT: vmaxud v2, v2, v4 +; PWR10LE-NEXT: vmaxud v3, v3, v5 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vmaxud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vmaxud v3, v3, v7 +; PWR10BE-NEXT: vmaxud v5, v5, v9 +; PWR10BE-NEXT: vmaxud v2, v2, v6 +; PWR10BE-NEXT: vmaxud v4, v4, v8 +; PWR10BE-NEXT: vmaxud v2, v2, v4 +; PWR10BE-NEXT: vmaxud v3, v3, v5 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vmaxud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll new file mode 100644 index 000000000000..dae6b56c885b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-umin.ll @@ -0,0 +1,796 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i8 +;; +define dso_local i8 @v2i8(<2 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v4i8(<4 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v8i8(<8 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) + ret i8 %0 +} + +define dso_local i8 @v16i8(<16 x i8> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i8: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vspltb v3, v2, 14 +; PWR9LE-NEXT: vminub v2, v2, v3 +; PWR9LE-NEXT: vextubrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i8: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vspltb v3, v2, 1 +; PWR9BE-NEXT: vminub v2, v2, v3 +; PWR9BE-NEXT: vextublx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i8: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vspltb v3, v2, 14 +; PWR10LE-NEXT: vminub v2, v2, v3 +; PWR10LE-NEXT: vextubrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i8: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vspltb v3, v2, 1 +; PWR10BE-NEXT: vminub v2, v2, v3 +; PWR10BE-NEXT: vextublx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) + ret i8 %0 +} + +declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) #0 +declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) #0 +declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>) #0 +declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) #0 + +;; +;; Vectors of type i16 +;; +define dso_local i16 @v2i16(<2 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v4i16(<4 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v8i16(<8 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) + ret i16 %0 +} + +define dso_local i16 @v16i16(<16 x i16> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i16: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vsplth v3, v2, 6 +; PWR9LE-NEXT: vminuh v2, v2, v3 +; PWR9LE-NEXT: vextuhrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i16: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vsplth v3, v2, 1 +; PWR9BE-NEXT: vminuh v2, v2, v3 +; PWR9BE-NEXT: vextuhlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i16: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vsplth v3, v2, 6 +; PWR10LE-NEXT: vminuh v2, v2, v3 +; PWR10LE-NEXT: vextuhrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i16: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vsplth v3, v2, 1 +; PWR10BE-NEXT: vminuh v2, v2, v3 +; PWR10BE-NEXT: vextuhlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %a) + ret i16 %0 +} + +declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) #0 +declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) #0 +declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) #0 +declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) #0 + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminuw v3, v3, v5 +; PWR9LE-NEXT: vminuw v2, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: xxspltw v3, v2, 2 +; PWR9LE-NEXT: vminuw v2, v2, v3 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminuw v3, v3, v5 +; PWR9BE-NEXT: vminuw v2, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: xxspltw v3, v2, 1 +; PWR9BE-NEXT: vminuw v2, v2, v3 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminuw v3, v3, v5 +; PWR10LE-NEXT: vminuw v2, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: xxspltw v3, v2, 2 +; PWR10LE-NEXT: vminuw v2, v2, v3 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminuw v3, v3, v5 +; PWR10BE-NEXT: vminuw v2, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: xxspltw v3, v2, 1 +; PWR10BE-NEXT: vminuw v2, v2, v3 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminud v2, v2, v4 +; PWR9LE-NEXT: vminud v3, v3, v5 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminud v2, v2, v4 +; PWR9BE-NEXT: vminud v3, v3, v5 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminud v2, v2, v4 +; PWR10LE-NEXT: vminud v3, v3, v5 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminud v2, v2, v4 +; PWR10BE-NEXT: vminud v3, v3, v5 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: vminud v3, v3, v7 +; PWR9LE-NEXT: vminud v5, v5, v9 +; PWR9LE-NEXT: vminud v2, v2, v6 +; PWR9LE-NEXT: vminud v4, v4, v8 +; PWR9LE-NEXT: vminud v2, v2, v4 +; PWR9LE-NEXT: vminud v3, v3, v5 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: vminud v2, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: vminud v3, v3, v7 +; PWR9BE-NEXT: vminud v5, v5, v9 +; PWR9BE-NEXT: vminud v2, v2, v6 +; PWR9BE-NEXT: vminud v4, v4, v8 +; PWR9BE-NEXT: vminud v2, v2, v4 +; PWR9BE-NEXT: vminud v3, v3, v5 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: vminud v2, v2, v3 +; PWR9BE-NEXT: mfvsrd r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: vminud v3, v3, v7 +; PWR10LE-NEXT: vminud v5, v5, v9 +; PWR10LE-NEXT: vminud v2, v2, v6 +; PWR10LE-NEXT: vminud v4, v4, v8 +; PWR10LE-NEXT: vminud v2, v2, v4 +; PWR10LE-NEXT: vminud v3, v3, v5 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: vminud v2, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: vminud v3, v3, v7 +; PWR10BE-NEXT: vminud v5, v5, v9 +; PWR10BE-NEXT: vminud v2, v2, v6 +; PWR10BE-NEXT: vminud v4, v4, v8 +; PWR10BE-NEXT: vminud v2, v2, v4 +; PWR10BE-NEXT: vminud v3, v3, v5 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: vminud v2, v2, v3 +; PWR10BE-NEXT: mfvsrd r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll new file mode 100644 index 000000000000..552ca9a69724 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-xor.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR9LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr9 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR9BE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64le < %s | FileCheck %s --check-prefix=PWR10LE +; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mcpu=pwr10 -mtriple=powerpc64 < %s | FileCheck %s --check-prefix=PWR10BE + +;; +;; Vectors of type i32 +;; +define dso_local i32 @v2i32(<2 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltw vs0, v2, 2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlxor v2, v2, vs0 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltw vs0, v2, 1 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlxor v2, v2, vs0 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltw vs0, v2, 2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlxor v2, v2, vs0 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltw vs0, v2, 1 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlxor v2, v2, vs0 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v4i32(<4 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlxor vs0, v2, v3 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlxor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlxor vs0, v2, v3 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlxor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlxor vs0, v2, v3 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlxor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlxor vs0, v2, v3 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlxor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v8i32(<8 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor vs0, v2, v3 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlxor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor vs0, v2, v3 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlxor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor vs0, v2, v3 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlxor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor vs0, v2, v3 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlxor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a) + ret i32 %0 +} + +define dso_local i32 @v16i32(<16 x i32> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i32: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor vs0, v3, v5 +; PWR9LE-NEXT: xxlxor vs1, v2, v4 +; PWR9LE-NEXT: li r3, 0 +; PWR9LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, v2 +; PWR9LE-NEXT: xxspltw vs1, vs0, 2 +; PWR9LE-NEXT: xxlxor v2, vs0, vs1 +; PWR9LE-NEXT: vextuwrx r3, r3, v2 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i32: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor vs0, v3, v5 +; PWR9BE-NEXT: xxlxor vs1, v2, v4 +; PWR9BE-NEXT: li r3, 0 +; PWR9BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, v2 +; PWR9BE-NEXT: xxspltw vs1, vs0, 1 +; PWR9BE-NEXT: xxlxor v2, vs0, vs1 +; PWR9BE-NEXT: vextuwlx r3, r3, v2 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i32: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor vs0, v3, v5 +; PWR10LE-NEXT: xxlxor vs1, v2, v4 +; PWR10LE-NEXT: li r3, 0 +; PWR10LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, v2 +; PWR10LE-NEXT: xxspltw vs1, vs0, 2 +; PWR10LE-NEXT: xxlxor v2, vs0, vs1 +; PWR10LE-NEXT: vextuwrx r3, r3, v2 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i32: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor vs0, v3, v5 +; PWR10BE-NEXT: xxlxor vs1, v2, v4 +; PWR10BE-NEXT: li r3, 0 +; PWR10BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, v2 +; PWR10BE-NEXT: xxspltw vs1, vs0, 1 +; PWR10BE-NEXT: xxlxor v2, vs0, vs1 +; PWR10BE-NEXT: vextuwlx r3, r3, v2 +; PWR10BE-NEXT: blr +entry: + %0 = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %a) + ret i32 %0 +} + +declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) #0 +declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) #0 +declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) #0 +declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) #0 + +;; +;; Vectors of type i64 +;; +define dso_local i64 @v2i64(<2 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v2i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxswapd v3, v2 +; PWR9LE-NEXT: xxlxor vs0, v2, v3 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v2i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxswapd v3, v2 +; PWR9BE-NEXT: xxlxor vs0, v2, v3 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v2i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxswapd v3, v2 +; PWR10LE-NEXT: xxlxor vs0, v2, v3 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v2i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxswapd v3, v2 +; PWR10BE-NEXT: xxlxor vs0, v2, v3 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v4i64(<4 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v4i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor vs0, v2, v3 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v4i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor vs0, v2, v3 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v4i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor vs0, v2, v3 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v4i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor vs0, v2, v3 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v8i64(<8 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v8i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor vs0, v3, v5 +; PWR9LE-NEXT: xxlxor vs1, v2, v4 +; PWR9LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v8i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor vs0, v3, v5 +; PWR9BE-NEXT: xxlxor vs1, v2, v4 +; PWR9BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v8i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor vs0, v3, v5 +; PWR10LE-NEXT: xxlxor vs1, v2, v4 +; PWR10LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v8i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor vs0, v3, v5 +; PWR10BE-NEXT: xxlxor vs1, v2, v4 +; PWR10BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %a) + ret i64 %0 +} + +define dso_local i64 @v16i64(<16 x i64> %a) local_unnamed_addr #0 { +; PWR9LE-LABEL: v16i64: +; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxlxor vs0, v4, v8 +; PWR9LE-NEXT: xxlxor vs1, v2, v6 +; PWR9LE-NEXT: xxlxor vs2, v5, v9 +; PWR9LE-NEXT: xxlxor vs3, v3, v7 +; PWR9LE-NEXT: xxlxor vs2, vs3, vs2 +; PWR9LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, vs2 +; PWR9LE-NEXT: xxswapd v2, vs0 +; PWR9LE-NEXT: xxlxor vs0, vs0, v2 +; PWR9LE-NEXT: mfvsrld r3, vs0 +; PWR9LE-NEXT: blr +; +; PWR9BE-LABEL: v16i64: +; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxlxor vs0, v4, v8 +; PWR9BE-NEXT: xxlxor vs1, v2, v6 +; PWR9BE-NEXT: xxlxor vs2, v5, v9 +; PWR9BE-NEXT: xxlxor vs3, v3, v7 +; PWR9BE-NEXT: xxlxor vs2, vs3, vs2 +; PWR9BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, vs2 +; PWR9BE-NEXT: xxswapd v2, vs0 +; PWR9BE-NEXT: xxlxor vs0, vs0, v2 +; PWR9BE-NEXT: mffprd r3, f0 +; PWR9BE-NEXT: blr +; +; PWR10LE-LABEL: v16i64: +; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxlxor vs0, v4, v8 +; PWR10LE-NEXT: xxlxor vs1, v2, v6 +; PWR10LE-NEXT: xxlxor vs2, v5, v9 +; PWR10LE-NEXT: xxlxor vs3, v3, v7 +; PWR10LE-NEXT: xxlxor vs2, vs3, vs2 +; PWR10LE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, vs2 +; PWR10LE-NEXT: xxswapd v2, vs0 +; PWR10LE-NEXT: xxlxor vs0, vs0, v2 +; PWR10LE-NEXT: mfvsrld r3, vs0 +; PWR10LE-NEXT: blr +; +; PWR10BE-LABEL: v16i64: +; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxlxor vs0, v4, v8 +; PWR10BE-NEXT: xxlxor vs1, v2, v6 +; PWR10BE-NEXT: xxlxor vs2, v5, v9 +; PWR10BE-NEXT: xxlxor vs3, v3, v7 +; PWR10BE-NEXT: xxlxor vs2, vs3, vs2 +; PWR10BE-NEXT: xxlxor vs0, vs1, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, vs2 +; PWR10BE-NEXT: xxswapd v2, vs0 +; PWR10BE-NEXT: xxlxor vs0, vs0, v2 +; PWR10BE-NEXT: mffprd r3, f0 +; PWR10BE-NEXT: blr +entry: + %0 = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %a) + ret i64 %0 +} + +declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) #0 +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) #0 +declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) #0 +declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) #0 + + +attributes #0 = { nounwind } |