From 0117428c3ea163f42f4549ff4e7f255f29f6b4d4 Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Thu, 9 Sep 2021 09:33:55 +0200 Subject: Support more arguments to tail calls by passing them through the domain state (#10595) In 2004, commit af9b98fcb, the calling conventions for the i386 port of ocamlopt were changed: the first 6 integer arguments go into registers, like before, but the next 16 arguments go into a global array `caml_extra_params`, instead of being passed on stack like before. The reason for this hack is that passing arguments in global memory does not preclude tail call optimization, unlike passing arguments on stack. Parameters passed via `caml_extra_params` are immediately copied on stack or in registers on function entry, before another function call, a GC, or a context switch can take place, so everything is safe in OCaml, and in Multicore OCaml as long as there is only one execution domain. This hack was justified by the paucity of registers provided by the i386 architecture. It was believed that other architectures provide enough registers for parameter passing that most if not all reasonable tail calls can be accommodated. Now it's 2021 and users want tail calls with more arguments than available registers on all the architectures we support. So, biting the bullet and swallowing some pride, this commit extends the 2004 i386 hack to all the architectures supported by OCaml. Once the registers available for passing function arguments are exhausted, the next 64 arguments are passed in a memory area that is part of the domain state. This argument passing is compatible with tail calls, so we get guaranteed tail calls up to 70 arguments (in the worst case). The domain state is used instead of a global array so that (1) this is compatible with Multicore OCaml and concurrent execution of multiple domains, and (2) we benefit from efficient addressing from the domain state register. For i386, we don't have a domain state register, and Multicore OCaml will support only one domain on this architecture, so we keep using a global `caml_extra_params` array; only, its size was increased to support 64 arguments. The tests for tail calls were extended to - Test tail calls to other functions, not just to self - Test up to 32 arguments. --- Changes | 5 +++ asmcomp/amd64/emit.mlp | 18 +++++--- asmcomp/amd64/proc.ml | 39 ++++++++++------- asmcomp/arm/emit.mlp | 7 +++- asmcomp/arm/proc.ml | 36 ++++++++++------ asmcomp/arm64/emit.mlp | 7 +++- asmcomp/arm64/proc.ml | 35 ++++++++++------ asmcomp/i386/emit.mlp | 23 +++++++---- asmcomp/i386/proc.ml | 30 +++++++------- asmcomp/power/emit.mlp | 11 +++-- asmcomp/power/proc.ml | 33 +++++++++------ asmcomp/printmach.ml | 2 + asmcomp/reg.ml | 1 + asmcomp/reg.mli | 23 +++++++++++ asmcomp/riscv/emit.mlp | 69 ++++++++++++++++--------------- asmcomp/riscv/proc.ml | 32 ++++++++------ asmcomp/s390x/emit.mlp | 9 +++- asmcomp/s390x/proc.ml | 25 +++++++---- runtime/caml/domain_state.h | 14 +++++-- runtime/caml/domain_state.tbl | 3 ++ runtime/i386.S | 4 +- runtime/i386nt.asm | 2 +- testsuite/tests/basic/tailcalls.ml | 49 ++++++++++++++++++++++ testsuite/tests/basic/tailcalls.reference | 4 ++ 24 files changed, 333 insertions(+), 148 deletions(-) diff --git a/Changes b/Changes index cd63e6daa2..95ec3fee5e 100644 --- a/Changes +++ b/Changes @@ -27,6 +27,11 @@ Working version and long register allocation times. (Xavier Leroy, report by Edwin Török, review by Nicolás Ojeda Bär) +- #10595: Tail calls with up to 64 arguments are guaranteed to be compiled + as tail calls. To this end, memory locations in the domain state + are used for passing arguments that do not fit in registers. + (Xavier Leroy, review by Vincent Laviron) + ### Standard library: * #7812, #10475: `Filename.chop_suffix name suff` now checks that `suff` diff --git a/asmcomp/amd64/emit.mlp b/asmcomp/amd64/emit.mlp index 16ff7f6161..eb38efbbe2 100644 --- a/asmcomp/amd64/emit.mlp +++ b/asmcomp/amd64/emit.mlp @@ -81,12 +81,13 @@ let frame_size env = (* includes return address *) let slot_offset env loc cl = match loc with - | Incoming n -> (frame_size env) + n + | Incoming n -> frame_size env + n | Local n -> if cl = 0 then env.stack_offset + n * 8 else env.stack_offset + (env.f.fun_num_stack_slots.(0) + n) * 8 | Outgoing n -> n + | Domainstate _ -> assert false (* not a stack slot *) (* Symbols *) @@ -171,14 +172,18 @@ let emit_Llabel env fallthrough lbl = (* Output a pseudo-register *) +let x86_data_type_for_stack_slot = function + | Float -> REAL8 + | _ -> QWORD + let reg env = function | { loc = Reg.Reg r } -> register_name r - | { loc = Stack s; typ = Float } as r -> - let ofs = slot_offset env s (register_class r) in - mem64 REAL8 ofs RSP - | { loc = Stack s } as r -> + | { loc = Stack(Domainstate n); typ = ty } -> + let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in + mem64 (x86_data_type_for_stack_slot ty) ofs R14 + | { loc = Stack s; typ = ty } as r -> let ofs = slot_offset env s (register_class r) in - mem64 QWORD ofs RSP + mem64 (x86_data_type_for_stack_slot ty) ofs RSP | { loc = Unknown } -> assert false @@ -188,6 +193,7 @@ let reg64 = function let arg env i n = reg env i.arg.(n) let res env i n = reg env i.res.(n) + (* Output a reference to the lower 8, 16 or 32 bits of a register *) let reg_low_8_name = Array.map (fun r -> Reg8L r) int_reg_name diff --git a/asmcomp/amd64/proc.ml b/asmcomp/amd64/proc.ml index ed17640736..b7047a1ead 100644 --- a/asmcomp/amd64/proc.ml +++ b/asmcomp/amd64/proc.ml @@ -76,8 +76,6 @@ let win64 = Arch.win64 stub saves them into the GC regs block). *) -let max_arguments_for_tailcalls = 10 - let int_reg_name = match Config.ccomp_type with | "msvc" -> @@ -157,12 +155,15 @@ let word_addressed = false (* Calling conventions *) -let calling_conventions first_int last_int first_float last_float make_stack +let size_domainstate_args = 64 * size_int + +let calling_conventions first_int last_int first_float last_float + make_stack first_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref 0 in + let ofs = ref first_stack in for i = 0 to Array.length arg - 1 do match arg.(i) with | Val | Int | Addr as ty -> @@ -183,21 +184,29 @@ let calling_conventions first_int last_int first_float last_float make_stack ofs := !ofs + size_float end done; - (loc, Misc.align !ofs 16) (* keep stack 16-aligned *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 16) (* keep stack 16-aligned *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" let loc_arguments arg = - calling_conventions 0 9 100 109 outgoing arg + calling_conventions 0 9 100 109 outgoing (- size_domainstate_args) arg let loc_parameters arg = let (loc, _ofs) = - calling_conventions 0 9 100 109 incoming arg - in - loc + calling_conventions 0 9 100 109 incoming (- size_domainstate_args) arg + in loc let loc_results res = - let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported res in loc + let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported 0 res + in loc + +let max_arguments_for_tailcalls = 10 (* in regs *) + 64 (* in domain state *) (* C calling conventions under Unix: first integer args in rdi, rsi, rdx, rcx, r8, r9 @@ -213,10 +222,10 @@ let loc_results res = Return value in rax or xmm0. *) let loc_external_results res = - let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported res in loc + let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported 0 res in loc let unix_loc_external_arguments arg = - calling_conventions 2 7 100 107 outgoing arg + calling_conventions 2 7 100 107 outgoing 0 arg let win64_int_external_arguments = [| 5 (*rcx*); 4 (*rdx*); 6 (*r8*); 7 (*r9*) |] diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp index 350bcb7c3c..29d9c38ed6 100644 --- a/asmcomp/arm/emit.mlp +++ b/asmcomp/arm/emit.mlp @@ -74,13 +74,18 @@ let slot_offset env loc cl = | Outgoing n -> assert (n >= 0); n + | Domainstate _ -> assert false (* not a stack slot *) (* Output a stack reference *) let emit_stack env r = match r.loc with + | Stack (Domainstate n) -> + let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in + `[domain_state_ptr, #{emit_int ofs}]` | Stack s -> - let ofs = slot_offset env s (register_class r) in `[sp, #{emit_int ofs}]` + let ofs = slot_offset env s (register_class r) in + `[sp, #{emit_int ofs}]` | _ -> fatal_error "Emit_arm.emit_stack" (* Output an addressing mode *) diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml index 11313fce7f..e7e34f3618 100644 --- a/asmcomp/arm/proc.ml +++ b/asmcomp/arm/proc.ml @@ -109,6 +109,8 @@ let stack_slot slot ty = (* Calling conventions *) +let size_domainstate_args = 64 * size_int + let loc_int last_int make_stack int ofs = if !int <= last_int then begin let l = phys_reg !int in @@ -149,12 +151,12 @@ let loc_int_pair last_int make_stack int ofs = [| stack_lower; stack_upper |] end -let calling_conventions first_int last_int first_float last_float make_stack - arg = +let calling_conventions first_int last_int first_float last_float + make_stack first_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref 0 in + let ofs = ref first_stack in for i = 0 to Array.length arg - 1 do match arg.(i) with | Val | Int | Addr -> @@ -162,28 +164,36 @@ let calling_conventions first_int last_int first_float last_float make_stack | Float -> loc.(i) <- loc_float last_float make_stack float ofs done; - (loc, Misc.align !ofs 8) (* keep stack 8-aligned *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 8) (* keep stack 8-aligned *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" (* OCaml calling convention: first integer args in r0...r7 first float args in d0...d15 (EABI+VFP) - remaining args on stack. + remaining args in domain state area, then on stack. Return values in r0...r7 or d0...d15. *) -let max_arguments_for_tailcalls = 8 +let max_arguments_for_tailcalls = 8 (* in regs *) + 64 (* in domain state *) let loc_arguments arg = - calling_conventions 0 7 100 115 outgoing arg + calling_conventions 0 7 100 115 outgoing (- size_domainstate_args) arg let loc_parameters arg = - let (loc, _) = calling_conventions 0 7 100 115 incoming arg in loc + let (loc, _) = + calling_conventions 0 7 100 115 incoming (- size_domainstate_args) arg + in loc let loc_results res = - let (loc, _) = calling_conventions 0 7 100 115 not_supported res in loc + let (loc, _) = calling_conventions 0 7 100 115 not_supported 0 res in loc (* C calling convention: first integer args in r0...r3 @@ -218,7 +228,7 @@ let loc_external_arguments ty_args = external_calling_conventions 0 3 100 107 outgoing ty_args let loc_external_results res = - let (loc, _) = calling_conventions 0 1 100 100 not_supported res + let (loc, _) = calling_conventions 0 1 100 100 not_supported 0 res in loc let loc_exn_bucket = phys_reg 0 diff --git a/asmcomp/arm64/emit.mlp b/asmcomp/arm64/emit.mlp index 9f0395e263..d3012f3b7f 100644 --- a/asmcomp/arm64/emit.mlp +++ b/asmcomp/arm64/emit.mlp @@ -104,13 +104,18 @@ let slot_offset env loc cl = | Outgoing n -> assert (n >= 0); n + | Domainstate _ -> assert false (* not a stack slot *) (* Output a stack reference *) let emit_stack env r = match r.loc with + | Stack (Domainstate n) -> + let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in + `[{emit_reg reg_domain_state_ptr}, #{emit_int ofs}]` | Stack s -> - let ofs = slot_offset env s (register_class r) in `[sp, #{emit_int ofs}]` + let ofs = slot_offset env s (register_class r) in + `[sp, #{emit_int ofs}]` | _ -> fatal_error "Emit.emit_stack" (* Output an addressing mode *) diff --git a/asmcomp/arm64/proc.ml b/asmcomp/arm64/proc.ml index 7a6f10a69d..4a921875e3 100644 --- a/asmcomp/arm64/proc.ml +++ b/asmcomp/arm64/proc.ml @@ -107,6 +107,8 @@ let stack_slot slot ty = (* Calling conventions *) +let size_domainstate_args = 64 * size_int + let loc_int last_int make_stack int ofs = if !int <= last_int then begin let l = phys_reg !int in @@ -138,11 +140,11 @@ let loc_int32 last_int make_stack int ofs = end let calling_conventions - first_int last_int first_float last_float make_stack arg = + first_int last_int first_float last_float make_stack first_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref 0 in + let ofs = ref first_stack in for i = 0 to Array.length arg - 1 do match arg.(i) with | Val | Int | Addr -> @@ -150,31 +152,40 @@ let calling_conventions | Float -> loc.(i) <- loc_float last_float make_stack float ofs done; - (loc, Misc.align !ofs 16) (* keep stack 16-aligned *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 16) (* keep stack 16-aligned *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" (* OCaml calling convention: first integer args in r0...r15 first float args in d0...d15 - remaining args on stack. + remaining args in domain state area, then on stack. Return values in r0...r15 or d0...d15. *) -let max_arguments_for_tailcalls = 16 +let max_arguments_for_tailcalls = 16 (* in regs *) + 64 (* in domain state *) + let last_int_register = if macosx then 7 else 15 let loc_arguments arg = - calling_conventions 0 last_int_register 100 115 outgoing arg + calling_conventions 0 last_int_register 100 115 + outgoing (- size_domainstate_args) arg let loc_parameters arg = let (loc, _) = - calling_conventions 0 last_int_register 100 115 incoming arg + calling_conventions 0 last_int_register 100 115 + incoming (- size_domainstate_args) arg in loc let loc_results res = let (loc, _) = - calling_conventions 0 last_int_register 100 115 not_supported res + calling_conventions 0 last_int_register 100 115 not_supported 0 res in loc @@ -208,7 +219,7 @@ let loc_external_arguments ty_args = external_calling_conventions 0 7 100 107 outgoing ty_args let loc_external_results res = - let (loc, _) = calling_conventions 0 1 100 100 not_supported res in loc + let (loc, _) = calling_conventions 0 1 100 100 not_supported 0 res in loc let loc_exn_bucket = phys_reg 0 diff --git a/asmcomp/i386/emit.mlp b/asmcomp/i386/emit.mlp index eb12cc96e5..b76af3687d 100644 --- a/asmcomp/i386/emit.mlp +++ b/asmcomp/i386/emit.mlp @@ -69,6 +69,7 @@ let slot_offset env loc cl = | Outgoing n -> assert (n >= 0); n + | Domainstate _ -> assert false (* not a stack slot *) (* Record symbols used and defined - at the end generate extern for those used but not defined *) @@ -138,16 +139,24 @@ let domain_field f r = let load_domain_state r = I.mov (sym32 "Caml_state") r +let x86_data_type_for_stack_slot = function + | Float -> REAL8 + | _ -> DWORD + +(* The Domainstate locations are mapped to a global array "caml_extra_params" + defined in runtime/i386*. We cannot access the domain state here + because in the i386 port there is no register that always point to the + domain state. A global array works because i386 does not + support multiple domains. *) + let reg env = function | { loc = Reg r } -> register_name r - | { loc = Stack(Incoming n | Outgoing n) } when n < 0 -> - sym32 "caml_extra_params" ~ofs:(n + 64) - | { loc = Stack s; typ = Float } as r -> - let ofs = slot_offset env s (register_class r) in - mem32 REAL8 ofs RSP - | { loc = Stack s } as r -> + | { loc = Stack(Domainstate n); typ = ty } -> + mem_sym (x86_data_type_for_stack_slot ty) + (emit_symbol "caml_extra_params") ~ofs:n + | { loc = Stack s; typ = ty } as r -> let ofs = slot_offset env s (register_class r) in - mem32 DWORD ofs RSP + mem32 (x86_data_type_for_stack_slot ty) ofs RSP | { loc = Unknown } -> fatal_error "Emit_i386.reg" diff --git a/asmcomp/i386/proc.ml b/asmcomp/i386/proc.ml index 53799397c4..fed3e678f9 100644 --- a/asmcomp/i386/proc.ml +++ b/asmcomp/i386/proc.ml @@ -101,23 +101,14 @@ let word_addressed = false (* Calling conventions *) -(* To supplement the processor's meagre supply of registers, we also - use some global memory locations to pass arguments beyond the 6th. - These globals are denoted by Incoming and Outgoing stack locations - with negative offsets, starting at -64. - Unlike arguments passed on stack, arguments passed in globals - do not prevent tail-call elimination. The caller stores arguments - in these globals immediately before the call, and the first thing the - callee does is copy them to registers or stack locations. - Neither GC nor thread context switches can occur between these two - times. *) +let size_domainstate_args = 64 * size_int let calling_conventions first_int last_int first_float last_float make_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref (-64) in + let ofs = ref (- size_domainstate_args) in for i = 0 to Array.length arg - 1 do match arg.(i) with Val | Int | Addr as ty -> @@ -139,19 +130,26 @@ let calling_conventions first_int last_int first_float last_float make_stack done; (loc, Misc.align (max 0 !ofs) stack_alignment) -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" -(* Six arguments in integer registers plus eight in global memory. *) -let max_arguments_for_tailcalls = 14 - let loc_arguments arg = calling_conventions 0 5 100 99 outgoing arg let loc_parameters arg = let (loc, _ofs) = calling_conventions 0 5 100 99 incoming arg in loc let loc_results res = let (loc, _ofs) = calling_conventions 0 5 100 100 not_supported res in loc + +let max_arguments_for_tailcalls = + 6 (* in registers *) + 64 (* in domain state *) + let loc_external_arguments _arg = fatal_error "Proc.loc_external_arguments" let loc_external_results res = diff --git a/asmcomp/power/emit.mlp b/asmcomp/power/emit.mlp index 4ce02425d5..28d9023be1 100644 --- a/asmcomp/power/emit.mlp +++ b/asmcomp/power/emit.mlp @@ -49,15 +49,16 @@ let frame_size env = let slot_offset env loc cls = match loc with - Local n -> + | Local n -> reserved_stack_space + env.stack_offset + (if cls = 0 then env.f.fun_num_stack_slots.(1) * size_float + n * size_int else n * size_float) | Incoming n -> (* Callee's [reserved_stack_space] is included in [frame_size]. To access incoming arguments, add caller's [reserverd_stack_space]. *) - frame_size env + reserved_stack_space + n + frame_size env + reserved_stack_space + n | Outgoing n -> reserved_stack_space + n + | Domainstate _ -> assert false (* not a stack slot *) let retaddr_offset env = match abi with @@ -133,8 +134,12 @@ let emit_reg r = let emit_stack env r = match r.loc with + | Stack (Domainstate n) -> + let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in + `{emit_int ofs}(30)` | Stack s -> - let ofs = slot_offset env s (register_class r) in `{emit_int ofs}(1)` + let ofs = slot_offset env s (register_class r) in + `{emit_int ofs}(1)` | _ -> Misc.fatal_error "Emit.emit_stack" (* Output the name of a symbol plus an optional offset *) diff --git a/asmcomp/power/proc.ml b/asmcomp/power/proc.ml index 2563d841b5..1a37578111 100644 --- a/asmcomp/power/proc.ml +++ b/asmcomp/power/proc.ml @@ -92,6 +92,8 @@ let stack_slot slot ty = (* Calling conventions *) +let size_domainstate_args = 64 * size_int + let loc_int last_int make_stack reg_use_stack int ofs = if !int <= last_int then begin let l = phys_reg !int in @@ -136,12 +138,12 @@ let loc_int_pair last_int make_stack int ofs = [| stack_lower; stack_upper |] end -let calling_conventions first_int last_int first_float last_float make_stack - arg = +let calling_conventions first_int last_int first_float last_float + make_stack first_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref 0 in + let ofs = ref first_stack in for i = 0 to Array.length arg - 1 do match arg.(i) with | Val | Int | Addr -> @@ -149,23 +151,30 @@ let calling_conventions first_int last_int first_float last_float make_stack | Float -> loc.(i) <- loc_float last_float make_stack false int float ofs done; - (loc, Misc.align !ofs 16) (* keep stack 16-aligned *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 16) (* keep stack 16-aligned *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" -let max_arguments_for_tailcalls = 16 +let max_arguments_for_tailcalls = 16 (* in regs *) + 64 (* in domain state *) let loc_arguments arg = - calling_conventions 0 15 100 112 outgoing arg + calling_conventions 0 15 100 112 outgoing (- size_domainstate_args) arg let loc_parameters arg = - let (loc, _ofs) = calling_conventions 0 15 100 112 incoming arg + let (loc, _ofs) = + calling_conventions 0 15 100 112 incoming (- size_domainstate_args) arg in loc let loc_results res = - let (loc, _ofs) = calling_conventions 0 15 100 112 not_supported res + let (loc, _ofs) = calling_conventions 0 15 100 112 not_supported 0 res in loc (* C calling conventions for ELF32: @@ -244,7 +253,7 @@ let loc_external_arguments ty_args = (* Results are in GPR 3 and FPR 1 *) let loc_external_results res = - let (loc, _ofs) = calling_conventions 0 1 100 100 not_supported res + let (loc, _ofs) = calling_conventions 0 1 100 100 not_supported 0 res in loc (* Exceptions are in GPR 3 *) diff --git a/asmcomp/printmach.ml b/asmcomp/printmach.ml index a0afc1a48e..656f951185 100644 --- a/asmcomp/printmach.ml +++ b/asmcomp/printmach.ml @@ -38,6 +38,8 @@ let reg ppf r = fprintf ppf "[si%i]" s | Stack(Outgoing s) -> fprintf ppf "[so%i]" s + | Stack(Domainstate s) -> + fprintf ppf "[ds%i]" s end let regs ppf v = diff --git a/asmcomp/reg.ml b/asmcomp/reg.ml index 2311a529bb..29dcbf725d 100644 --- a/asmcomp/reg.ml +++ b/asmcomp/reg.ml @@ -56,6 +56,7 @@ and stack_location = Local of int | Incoming of int | Outgoing of int + | Domainstate of int type reg = t diff --git a/asmcomp/reg.mli b/asmcomp/reg.mli index ad462c20a2..38983279ea 100644 --- a/asmcomp/reg.mli +++ b/asmcomp/reg.mli @@ -42,6 +42,29 @@ and stack_location = Local of int | Incoming of int | Outgoing of int + | Domainstate of int + +(* The [stack_location] describes the location of pseudo-registers + that reside in memory. + - [Local] is a local variable or spilled register residing in the stack frame + of the current function + - [Incoming] is a function parameter that was passed on the stack. + This is the callee's view: the location is just above the callee's + stack frame, in the caller's stack frame. + - [Outgoing] is a function call argument that is passed on the stack. + This is the caller's view: the location is at the bottom of the + caller's stack frame. + - [Domainstate] is a function call argument that is passed not on stack + but in the [extra_params] section of the domain state + (see file [../runtime/caml/domain_state.*]). Unlike arguments passed + on stack, arguments passed via the domain state are compatible with + tail calls. However, domain state locations are shared between + all functions that run in a given domain, hence they are not preserved + by function calls or thread context switches. The caller stores + arguments in the domain state immediately before the call, and the + first thing the callee does is copy them to registers or [Local] + stack locations. Neither GC nor thread context switches can occur + between these two times. *) val dummy: t val create: Cmm.machtype_component -> t diff --git a/asmcomp/riscv/emit.mlp b/asmcomp/riscv/emit.mlp index a2b011f355..474a3c6857 100644 --- a/asmcomp/riscv/emit.mlp +++ b/asmcomp/riscv/emit.mlp @@ -39,12 +39,17 @@ let frame_size env = let slot_offset env loc cls = match loc with | Local n -> - if cls = 0 - then env.stack_offset + env.f.fun_num_stack_slots.(1) * size_float - + n * size_int - else env.stack_offset + n * size_float - | Incoming n -> frame_size env + n - | Outgoing n -> n + ("sp", + if cls = 0 + then env.stack_offset + env.f.fun_num_stack_slots.(1) * size_float + + n * size_int + else env.stack_offset + n * size_float) + | Incoming n -> + ("sp", frame_size env + n) + | Outgoing n -> + ("sp", n) + | Domainstate n -> + ("s11", n + Domainstate.(idx_of_field Domain_extra_params) * 8) (* Output a symbol *) @@ -108,38 +113,32 @@ let adjust_stack_offset env delta = env.stack_offset <- env.stack_offset + delta; cfi_adjust_cfa_offset delta -let emit_mem_op op src ofs = +let emit_mem_op ?(base = "sp") op src ofs = if is_immediate ofs then - ` {emit_string op} {emit_string src}, {emit_int ofs}(sp)\n` + ` {emit_string op} {emit_string src}, {emit_int ofs}({emit_string base})\n` else begin ` li {emit_reg reg_tmp}, {emit_int ofs}\n`; - ` add {emit_reg reg_tmp}, sp, {emit_reg reg_tmp}\n`; + ` add {emit_reg reg_tmp}, {emit_string base}, {emit_reg reg_tmp}\n`; ` {emit_string op} {emit_string src}, 0({emit_reg reg_tmp})\n` end -let emit_store src ofs = - emit_mem_op "sd" src ofs - -let emit_load dst ofs = - emit_mem_op "ld" dst ofs - let reload_ra n = - emit_load "ra" (n - size_addr) + emit_mem_op "ld" "ra" (n - size_addr) let store_ra n = - emit_store "ra" (n - size_addr) + emit_mem_op "sd" "ra" (n - size_addr) -let emit_store src ofs = - emit_store (reg_name src) ofs +let emit_store ?base src ofs = + emit_mem_op ?base "sd" (reg_name src) ofs -let emit_load dst ofs = - emit_load (reg_name dst) ofs +let emit_load ?base dst ofs = + emit_mem_op ?base "ld" (reg_name dst) ofs -let emit_float_load dst ofs = - emit_mem_op "fld" (reg_name dst) ofs +let emit_float_load ?base dst ofs = + emit_mem_op ?base "fld" (reg_name dst) ofs -let emit_float_store src ofs = - emit_mem_op "fsd" (reg_name src) ofs +let emit_float_store ?base src ofs = + emit_mem_op ?base "fsd" (reg_name src) ofs (* Record live pointers at call points *) @@ -151,7 +150,9 @@ let record_frame_label env live dbg = {typ = Val; loc = Reg r} -> live_offset := (r lsl 1) + 1 :: !live_offset | {typ = Val; loc = Stack s} as reg -> - live_offset := slot_offset env s (register_class reg) :: !live_offset + let (base, ofs) = slot_offset env s (register_class reg) in + assert (base = "sp"); + live_offset := ofs :: !live_offset | {typ = Addr} as r -> Misc.fatal_error ("bad GC root " ^ Reg.name r) | _ -> () @@ -259,17 +260,17 @@ let emit_instr env i = | {loc = Reg _; typ = Float}, {loc = Reg _; typ = (Val | Int | Addr)} -> ` fmv.x.d {emit_reg dst}, {emit_reg src}\n` | {loc = Reg _; typ = (Val | Int | Addr)}, {loc = Stack s} -> - let ofs = slot_offset env s (register_class dst) in - emit_store src ofs + let (base, ofs) = slot_offset env s (register_class dst) in + emit_store ~base src ofs | {loc = Reg _; typ = Float}, {loc = Stack s} -> - let ofs = slot_offset env s (register_class dst) in - emit_float_store src ofs + let (base, ofs) = slot_offset env s (register_class dst) in + emit_float_store ~base src ofs | {loc = Stack s; typ = (Val | Int | Addr)}, {loc = Reg _} -> - let ofs = slot_offset env s (register_class src) in - emit_load dst ofs + let (base, ofs) = slot_offset env s (register_class src) in + emit_load ~base dst ofs | {loc = Stack s; typ = Float}, {loc = Reg _} -> - let ofs = slot_offset env s (register_class src) in - emit_float_load dst ofs + let (base, ofs) = slot_offset env s (register_class src) in + emit_float_load ~base dst ofs | {loc = Stack _}, {loc = Stack _} | {loc = Unknown}, _ | _, {loc = Unknown} -> Misc.fatal_error "Emit: Imove" diff --git a/asmcomp/riscv/proc.ml b/asmcomp/riscv/proc.ml index 0b37de4c9e..1fdad2ae63 100644 --- a/asmcomp/riscv/proc.ml +++ b/asmcomp/riscv/proc.ml @@ -122,12 +122,14 @@ let stack_slot slot ty = (* Calling conventions *) +let size_domainstate_args = 64 * size_int + let calling_conventions - first_int last_int first_float last_float make_stack arg = + first_int last_int first_float last_float make_stack first_stack arg = let loc = Array.make (Array.length arg) Reg.dummy in let int = ref first_int in let float = ref first_float in - let ofs = ref 0 in + let ofs = ref first_stack in for i = 0 to Array.length arg - 1 do match arg.(i) with | Val | Int | Addr as ty -> @@ -147,32 +149,38 @@ let calling_conventions ofs := !ofs + size_float end done; - (loc, Misc.align !ofs 16) (* Keep stack 16-aligned. *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 16) (* Keep stack 16-aligned. *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ = fatal_error "Proc.loc_results: cannot call" -let max_arguments_for_tailcalls = 16 +let max_arguments_for_tailcalls = 16 (* in regs *) + 64 (* in domain state *) (* OCaml calling convention: first integer args in a0 .. a7, s2 .. s9 first float args in fa0 .. fa7, fs2 .. fs9 - remaining args on stack. + remaining args in domain state area, then on stack. Return values in a0 .. a7, s2 .. s9 or fa0 .. fa7, fs2 .. fs9. *) let loc_arguments arg = - calling_conventions 0 15 110 125 outgoing arg + calling_conventions 0 15 110 125 outgoing (- size_domainstate_args) arg let loc_parameters arg = let (loc, _ofs) = - calling_conventions 0 15 110 125 incoming arg + calling_conventions 0 15 110 125 incoming (- size_domainstate_args) arg in loc let loc_results res = let (loc, _ofs) = - calling_conventions 0 15 110 125 not_supported res + calling_conventions 0 15 110 125 not_supported 0 res in loc @@ -219,7 +227,7 @@ let loc_external_arguments ty_args = external_calling_conventions 0 7 110 117 outgoing arg let loc_external_results res = - let (loc, _ofs) = calling_conventions 0 1 110 111 not_supported res + let (loc, _ofs) = calling_conventions 0 1 110 111 not_supported 0 res in loc (* Exceptions are in a0 *) diff --git a/asmcomp/s390x/emit.mlp b/asmcomp/s390x/emit.mlp index 156084ea0c..5b2e5931dc 100644 --- a/asmcomp/s390x/emit.mlp +++ b/asmcomp/s390x/emit.mlp @@ -43,6 +43,7 @@ let slot_offset env loc cls = else env.stack_offset + n * size_float | Incoming n -> frame_size env + n | Outgoing n -> n + | Domainstate _ -> assert false (* not a stack slot *) (* Output a symbol *) @@ -93,8 +94,12 @@ let reg_r7 = check_phys_reg 5 "%r7" let emit_stack env r = match r.loc with - Stack s -> - let ofs = slot_offset env s (register_class r) in `{emit_int ofs}(%r15)` + | Stack (Domainstate n) -> + let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in + `{emit_int ofs}(%r10)` + | Stack s -> + let ofs = slot_offset env s (register_class r) in + `{emit_int ofs}(%r15)` | _ -> fatal_error "Emit.emit_stack" diff --git a/asmcomp/s390x/proc.ml b/asmcomp/s390x/proc.ml index 3e15cf97c1..c9400e7c2d 100644 --- a/asmcomp/s390x/proc.ml +++ b/asmcomp/s390x/proc.ml @@ -96,6 +96,8 @@ let stack_slot slot ty = (* Calling conventions *) +let size_domainstate_args = 64 * size_int + let calling_conventions first_int last_int first_float last_float make_stack stack_ofs arg = let loc = Array.make (Array.length arg) Reg.dummy in @@ -121,19 +123,26 @@ let calling_conventions ofs := !ofs + size_float end done; - (loc, Misc.align !ofs 16) - (* Keep stack 16-aligned. *) - -let incoming ofs = Incoming ofs -let outgoing ofs = Outgoing ofs + (loc, Misc.align (max 0 !ofs) 16) (* Keep stack 16-aligned. *) + +let incoming ofs = + if ofs >= 0 + then Incoming ofs + else Domainstate (ofs + size_domainstate_args) +let outgoing ofs = + if ofs >= 0 + then Outgoing ofs + else Domainstate (ofs + size_domainstate_args) let not_supported _ofs = fatal_error "Proc.loc_results: cannot call" -let max_arguments_for_tailcalls = 8 +let max_arguments_for_tailcalls = 8 (* in regs *) + 64 (* in domain state *) let loc_arguments arg = - calling_conventions 0 7 100 103 outgoing 0 arg + calling_conventions 0 7 100 103 outgoing (- size_domainstate_args) arg let loc_parameters arg = - let (loc, _ofs) = calling_conventions 0 7 100 103 incoming 0 arg in loc + let (loc, _ofs) = + calling_conventions 0 7 100 103 incoming (- size_domainstate_args) arg + in loc let loc_results res = let (loc, _ofs) = calling_conventions 0 7 100 103 not_supported 0 res in loc diff --git a/runtime/caml/domain_state.h b/runtime/caml/domain_state.h index ee4613d60b..84e677d041 100644 --- a/runtime/caml/domain_state.h +++ b/runtime/caml/domain_state.h @@ -22,6 +22,9 @@ #include "misc.h" #include "mlvalues.h" +#define NUM_EXTRA_PARAMS 64 +typedef value extra_params_area[NUM_EXTRA_PARAMS]; + /* This structure sits in the TLS area and is also accessed efficiently * via native code, which is why the indices are important */ @@ -33,7 +36,6 @@ typedef struct { #endif #include "domain_state.tbl" #undef DOMAIN_STATE - CAMLalign(8) char end_of_domain_state; } caml_domain_state; enum { @@ -43,11 +45,17 @@ enum { #undef DOMAIN_STATE }; +#ifdef CAML_NAME_SPACE +#define LAST_DOMAIN_STATE_MEMBER extra_params +#else +#define LAST_DOMAIN_STATE_MEMBER _extra_params +#endif + /* Check that the structure was laid out without padding, since the runtime assumes this in computing offsets */ CAML_STATIC_ASSERT( - offsetof(caml_domain_state, end_of_domain_state) == - Domain_state_num_fields * 8); + offsetof(caml_domain_state, LAST_DOMAIN_STATE_MEMBER) == + (Domain_state_num_fields - 1) * 8); CAMLextern caml_domain_state* Caml_state; #ifdef CAML_NAME_SPACE diff --git a/runtime/caml/domain_state.tbl b/runtime/caml/domain_state.tbl index 7a349ef8df..4429f24b64 100644 --- a/runtime/caml/domain_state.tbl +++ b/runtime/caml/domain_state.tbl @@ -89,3 +89,6 @@ DOMAIN_STATE(FILE*, eventlog_out) DOMAIN_STATE(void*, checking_pointer_pc) /* See major_gc.c */ #endif + +DOMAIN_STATE(extra_params_area, extra_params) +/* This member must occur last, because it is an array, not a scalar */ diff --git a/runtime/i386.S b/runtime/i386.S index e1cc5778aa..4d273c9f04 100644 --- a/runtime/i386.S +++ b/runtime/i386.S @@ -437,9 +437,9 @@ G(caml_system__frametable): .globl G(caml_extra_params) G(caml_extra_params): #ifndef SYS_solaris - .space 64 + .space 256 #else - .zero 64 + .zero 256 #endif #if defined(SYS_linux_elf) diff --git a/runtime/i386nt.asm b/runtime/i386nt.asm index 52cd2109da..178a266356 100644 --- a/runtime/i386nt.asm +++ b/runtime/i386nt.asm @@ -310,6 +310,6 @@ _caml_system__frametable LABEL DWORD PUBLIC _caml_extra_params _caml_extra_params LABEL DWORD - BYTE 64 DUP (?) + BYTE 256 DUP (?) END diff --git a/testsuite/tests/basic/tailcalls.ml b/testsuite/tests/basic/tailcalls.ml index 32ac474494..2842022b4a 100644 --- a/testsuite/tests/basic/tailcalls.ml +++ b/testsuite/tests/basic/tailcalls.ml @@ -16,12 +16,52 @@ let rec tailcall16 a b c d e f g h i j k l m n o p = else tailcall16 (a-1) (b+1) (c+2) (d+3) (e+4) (f+5) (g+6) (h+7) (i+8) (j+9) (k+10) (l+11) (m+12) (n+13) (o+14) (p+15) +let rec tailcall32 a b c d e f g h i j k l m n o p + q r s t u v w x y z aa bb cc dd ee ff = + if a < 0 + then b + else tailcall32 (a-1) (b+1) (c+2) (d+3) (e+4) (f+5) (g+6) (h+7) + (i+8) (j+9) (k+10) (l+11) (m+12) (n+13) (o+14) (p+15) + (q+16) (r+17) (s+18) (t+19) (u+20) (v+21) (w+22) (x+23) + (y+24) (z+25) (aa+26) (bb+27) (cc+28) (dd+29) (ee+30) (ff+31) + let indtailcall8 fn a b c d e f g h = fn a b c d e f g h let indtailcall16 fn a b c d e f g h i j k l m n o p = fn a b c d e f g h i j k l m n o p +let rec muttailcall8 a b c d e f g h = + if a < 0 + then b + else auxtailcall8 (a-1) (b+1) (c+2) (d+3) (e+4) (f+5) (g+6) (h+7) + +and auxtailcall8 a b c d e f g h = + muttailcall8 a b c d e f g h + +let rec muttailcall16 a b c d e f g h i j k l m n o p = + if a < 0 + then b + else auxtailcall16 (a-1) (b+1) (c+2) (d+3) (e+4) (f+5) (g+6) (h+7) + (i+8) (j+9) (k+10) (l+11) (m+12) (n+13) (o+14) (p+15) + +and auxtailcall16 a b c d e f g h i j k l m n o p = + muttailcall16 a b c d e f g h i j k l m n o p + +let rec muttailcall32 a b c d e f g h i j k l m n o p + q r s t u v w x y z aa bb cc dd ee ff = + if a < 0 + then b + else auxtailcall32 (a-1) (b+1) (c+2) (d+3) (e+4) (f+5) (g+6) (h+7) + (i+8) (j+9) (k+10) (l+11) (m+12) (n+13) (o+14) (p+15) + (q+16) (r+17) (s+18) (t+19) (u+20) (v+21) (w+22) (x+23) + (y+24) (z+25) (aa+26) (bb+27) (cc+28) (dd+29) (ee+30) (ff+31) + +and auxtailcall32 a b c d e f g h i j k l m n o p + q r s t u v w x y z aa bb cc dd ee ff = + muttailcall32 a b c d e f g h i j k l m n o p + q r s t u v w x y z aa bb cc dd ee ff + (* regression test for PR#6441: *) let rec tailcall16_value_closures a b c d e f g h i j k l m n o p = if a < 0 @@ -36,8 +76,17 @@ let _ = print_int (tailcall8 10000000 0 0 0 0 0 0 0); print_newline(); print_int (tailcall16 10000000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); print_newline(); + print_int (tailcall32 10000000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); + print_newline(); print_int (indtailcall8 tailcall8 10 0 0 0 0 0 0 0); print_newline(); print_int (indtailcall16 tailcall16 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); print_newline(); print_int (tailcall16_value_closures 10000000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); + print_newline(); + print_int (muttailcall8 10000000 0 0 0 0 0 0 0); print_newline(); + print_int (muttailcall16 10000000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); + print_newline(); + print_int (muttailcall32 10000000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0); print_newline() diff --git a/testsuite/tests/basic/tailcalls.reference b/testsuite/tests/basic/tailcalls.reference index c7117bc954..d9ffdea10c 100644 --- a/testsuite/tests/basic/tailcalls.reference +++ b/testsuite/tests/basic/tailcalls.reference @@ -1,6 +1,10 @@ 10000001 10000001 10000001 +10000001 11 11 10000001 +10000001 +10000001 +10000001 -- cgit v1.2.1