summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Devereau <matthew.devereau@arm.com>2022-09-08 11:22:11 +0000
committerTobias Hieta <tobias@hieta.se>2022-10-17 09:41:25 +0200
commitd35bc70e82511b38c55872ab33d3a950d2c8bbc4 (patch)
treebf9d2e62b8ae0365def758bd9ff0f32e284b45a2
parentf3c5289e78462fb96015f79c954d95a0d527ba55 (diff)
downloadllvm-d35bc70e82511b38c55872ab33d3a950d2c8bbc4.tar.gz
[AArch64][SVE] Fix AArch64_SVE_VectorCall calling convention
This fixes the case where callees with SVE arguments outside of the z0-z7 range were incorrectly deduced as SVE calling convention functions
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp67
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll297
3 files changed, 330 insertions, 36 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c28216048d7c..06e21f90ebf1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5805,7 +5805,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
- SmallVector<SDValue, 16> ArgValues;
+
unsigned ExtraArgLocs = 0;
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
@@ -6157,17 +6157,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
- CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
- // Assign locations to each value returned by this call.
- SmallVector<CCValAssign, 16> RVLocs;
DenseMap<unsigned, SDValue> CopiedRegs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC);
-
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
@@ -6508,17 +6501,39 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
GuardWithBTI = FuncInfo->branchTargetEnforcement();
}
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ if (IsVarArg) {
+ unsigned NumArgs = Outs.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
+ report_fatal_error("Passing SVE types to variadic functions is "
+ "currently not supported");
+ }
+ }
+
+ analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
+
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ RetCCInfo.AnalyzeCallResult(Ins, RetCC);
+
// Check callee args/returns for SVE registers and set calling convention
// accordingly.
if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
- bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
- return Out.VT.isScalableVector();
- });
- bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
- return In.VT.isScalableVector();
- });
-
- if (CalleeInSVE || CalleeOutSVE)
+ auto HasSVERegLoc = [](CCValAssign &Loc) {
+ if (!Loc.isRegLoc())
+ return false;
+ return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
+ AArch64::PPRRegClass.contains(Loc.getLocReg());
+ };
+ if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
CallConv = CallingConv::AArch64_SVE_VectorCall;
}
@@ -6540,22 +6555,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
- // Analyze operands of the call, assigning locations to each operand.
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
-
- if (IsVarArg) {
- unsigned NumArgs = Outs.size();
-
- for (unsigned i = 0; i != NumArgs; ++i) {
- if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
- report_fatal_error("Passing SVE types to variadic functions is "
- "currently not supported");
- }
- }
-
- analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
-
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -6961,7 +6960,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1ba2e2f315ec..ff3bfe897869 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -894,7 +894,7 @@ private:
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SmallVectorImpl<CCValAssign> &RVLocs,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 1a159558ba5d..8648a6863c52 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -334,7 +334,46 @@ entry:
ret void
}
-; Use AAPCS, no SVE register in z0-7 used (floats occupy z0-z7)
+; Use AAVPCS, SVE register used in return
+
+define <vscale x 4 x float> @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, float * %ptr) nounwind {
+; CHECK-LABEL: aavpcs5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7]
+; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6]
+; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5]
+; CHECK-NEXT: ld1w { z5.s }, p0/z, [x4]
+; CHECK-NEXT: ld1w { z6.s }, p0/z, [x3]
+; CHECK-NEXT: ld1w { z7.s }, p0/z, [x2]
+; CHECK-NEXT: ld1w { z24.s }, p0/z, [x1]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ldr x8, [sp, #16]
+; CHECK-NEXT: st1w { z0.s }, p0, [x8]
+; CHECK-NEXT: st1w { z24.s }, p0, [x8]
+; CHECK-NEXT: st1w { z7.s }, p0, [x8]
+; CHECK-NEXT: st1w { z6.s }, p0, [x8]
+; CHECK-NEXT: st1w { z5.s }, p0, [x8]
+; CHECK-NEXT: st1w { z4.s }, p0, [x8]
+; CHECK-NEXT: st1w { z3.s }, p0, [x8]
+; CHECK-NEXT: st1w { z2.s }, p0, [x8]
+; CHECK-NEXT: st1w { z1.s }, p0, [x8]
+; CHECK-NEXT: ret
+entry:
+ %ptr1.bc = bitcast float * %ptr to <vscale x 4 x float> *
+ store volatile <vscale x 4 x float> %s8, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s9, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s10, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s11, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s12, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s13, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s14, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s15, <vscale x 4 x float>* %ptr1.bc
+ store volatile <vscale x 4 x float> %s16, <vscale x 4 x float>* %ptr1.bc
+ ret <vscale x 4 x float> %s8
+}
define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, float * %ptr) nounwind {
; CHECK-LABEL: aapcs1:
@@ -375,6 +414,262 @@ entry:
ret void
}
+declare void @non_sve_callee_high_range(float %f0, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, <vscale x 4 x float> %v0, <vscale x 4 x float> %v1)
+
+define void @non_sve_caller_non_sve_callee_high_range() {
+; CHECK-LABEL: non_sve_caller_non_sve_callee_high_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fmov s2, #2.00000000
+; CHECK-NEXT: fmov s3, #3.00000000
+; CHECK-NEXT: fmov s4, #4.00000000
+; CHECK-NEXT: fmov s5, #5.00000000
+; CHECK-NEXT: fmov s6, #6.00000000
+; CHECK-NEXT: fmov s7, #7.00000000
+; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: addvl x0, sp, #1
+; CHECK-NEXT: bl non_sve_callee_high_range
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+ ret void
+}
+
+define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, <vscale x 4 x float> %v0, <vscale x 4 x float> %v1) {
+; CHECK-LABEL: non_sve_caller_high_range_non_sve_callee_high_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1]
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fmov s2, #2.00000000
+; CHECK-NEXT: fmov s3, #3.00000000
+; CHECK-NEXT: fmov s4, #4.00000000
+; CHECK-NEXT: fmov s5, #5.00000000
+; CHECK-NEXT: fmov s6, #6.00000000
+; CHECK-NEXT: fmov s7, #7.00000000
+; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: addvl x0, sp, #1
+; CHECK-NEXT: st1w { z17.s }, p0, [sp]
+; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: bl non_sve_callee_high_range
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, <vscale x 4 x float> %v0, <vscale x 4 x float> %v1)
+ ret void
+}
+
+define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1) {
+; CHECK-LABEL: sve_caller_non_sve_callee_high_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 168 * VG
+; CHECK-NEXT: mov z25.d, z0.d
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fmov s2, #2.00000000
+; CHECK-NEXT: fmov s3, #3.00000000
+; CHECK-NEXT: fmov s4, #4.00000000
+; CHECK-NEXT: fmov s5, #5.00000000
+; CHECK-NEXT: fmov s6, #6.00000000
+; CHECK-NEXT: fmov s7, #7.00000000
+; CHECK-NEXT: addvl x0, sp, #2
+; CHECK-NEXT: addvl x1, sp, #1
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z24.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z25.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: bl non_sve_callee_high_range
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, <vscale x 4 x float> %v0, <vscale x 4 x float> %v1)
+ ret <vscale x 4 x float> %v0
+}
+
+define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range() {
+; CHECK-LABEL: sve_ret_caller_non_sve_callee_high_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fmov s2, #2.00000000
+; CHECK-NEXT: fmov s3, #3.00000000
+; CHECK-NEXT: fmov s4, #4.00000000
+; CHECK-NEXT: fmov s5, #5.00000000
+; CHECK-NEXT: fmov s6, #6.00000000
+; CHECK-NEXT: fmov s7, #7.00000000
+; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: addvl x0, sp, #1
+; CHECK-NEXT: bl non_sve_callee_high_range
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> undef
+}
+
declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)