(***********************************************************************) (* *) (* Objective Caml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2000 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the Q Public License version 1.0. *) (* *) (***********************************************************************) (* $Id$ *) (* Emission of IA64 assembly code *) open Location open Printf open Misc open Cmm open Arch open Proc open Reg open Mach open Linearize open Emitaux (************** Part 1: assembly-level scheduler *******************) (* Representation of resources accessed or produced by instructions *) type resource = string (* A resource is either: - a register name - "stkN" for a stack location - "heap" for the Caml heap - "chkN" for the result of a checkbound instruction *) let is_memory_resource rsrc = String.length rsrc >= 4 && begin match String.sub rsrc 0 3 with "stk" -> true | "hea" -> true | "chk" -> true | _ -> false end let is_mutable_resource rsrc = rsrc <> "r0" && rsrc <> "p0" (* Description of instructions *) type instruction_kind = KA (* A type instruction (int or mem unit) *) | KB (* B type instruction (branch unit) *) | KI (* I type instruction (int unit *) | KF (* F type instruction (FP unit) *) | KM (* M type instruction (mem unit) *) | KB_exc (* B type instruction, exceptional condition, can be moved around *) type instruction_format = F_i (* op imm *) | F_i_pred (* (pred) op imm *) | F_ir_rr (* op p1,p2 = imm, r *) | F_ir_r (* op r = imm, r *) | F_ir_r_pred (* (pred) op r = imm, r *) | F_ld (* op r = [r] *) | F_ld_post (* op r = [r], imm *) | F_r (* op r *) | F_i_r (* op r = imm *) | F_i_r_pred (* (pred) op r = imm *) | F_ri_rr (* op p1,p2 = imm, r *) | F_ri_r (* op r = imm, r *) | F_r_r (* op r = r *) | F_r_r_pred (* (pred) op r = r *) | F_rr_rr (* op p1,p2 = r1, r2 *) | F_r_rir (* op r = r1, imm, r2 *) | F_rr_r (* op r = r1, r2 *) | F_rr_r_pred (* (pred) op r = r1, r2 *) | F_rri_r (* op r = r1, r2, imm *) | F_rrr_r (* op r = r1, r2, r3 *) | F_rrr_r_pred (* (pred) op r = r1, r2, r3 *) | F_st (* op [r] = r *) | F_st_post (* op [r] = r, imm *) type instruction_descr = { opcode: string; (* actual opcode *) latency: int; (* latency in cycles *) kind: instruction_kind; (* kind of instruction *) format: instruction_format } (* how to generate asm for it *) let instruction_table = create_hashtable 73 [ "add", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r}; "add1", {opcode = "add"; latency = 1; kind = KA; format = F_rri_r}; "addcond", {opcode = "add"; latency = 1; kind = KA; format = F_rr_r_pred}; "addi", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r}; "addicond", {opcode = "add"; latency = 1; kind = KA; format = F_ir_r_pred}; "and", {opcode = "and"; latency = 1; kind = KA; format = F_rr_r}; "andi", {opcode = "and"; latency = 1; kind = KA; format = F_ir_r}; "br", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_i}; "brret", {opcode = "br.ret.sptk"; latency = 1; kind = KB; format = F_r}; "brcall", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_i_r}; "brcallcond", {opcode = "br.call.spnt.many"; latency = 1; kind = KB; format = F_i_r_pred}; "brcallcondexc", {opcode = "br.call.spnt.many"; latency = 1; kind = KB_exc; format = F_i_r_pred}; "brcallind", {opcode = "br.call.sptk.many"; latency = 1; kind = KB; format = F_r_r}; "brcond", {opcode = "br.dpnt.many"; latency = 1; kind = KB; format = F_i_pred}; "brind", {opcode = "br.sptk.many"; latency = 1; kind = KB; format = F_r}; "cmp.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_rr_rr}; "cmp.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_rr_rr}; "cmpi.eq", {opcode = "cmp.eq"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.ge", {opcode = "cmp.ge"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.geu", {opcode = "cmp.geu"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.gt", {opcode = "cmp.gt"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.gtu", {opcode = "cmp.gtu"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.le", {opcode = "cmp.le"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.leu", {opcode = "cmp.leu"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.lt", {opcode = "cmp.lt"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.ltu", {opcode = "cmp.ltu"; latency = 0; kind = KA; format = F_ir_rr}; "cmpi.ne", {opcode = "cmp.ne"; latency = 0; kind = KA; format = F_ir_rr}; "cmpp.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_rr_rr}; "cmpp.ne.and", {opcode = "cmp.ne.and"; latency = 1; kind = KA; format = F_rr_rr}; "cmppi.eq", {opcode = "cmp.eq"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.ge", {opcode = "cmp.ge"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.geu", {opcode = "cmp.geu"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.gt", {opcode = "cmp.gt"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.gtu", {opcode = "cmp.gtu"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.le", {opcode = "cmp.le"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.leu", {opcode = "cmp.leu"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.lt", {opcode = "cmp.lt"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.ltu", {opcode = "cmp.ltu"; latency = 1; kind = KA; format = F_ir_rr}; "cmppi.ne", {opcode = "cmp.ne"; latency = 1; kind = KA; format = F_ir_rr}; "extr.u", {opcode = "extr.u"; latency = 1; kind = KI; format = F_ri_r}; "fabs", {opcode = "fabs"; latency = 1; kind = KF; format = F_r_r}; "fadd.d", {opcode = "fadd.d"; latency = 5; kind = KF; format = F_rr_r}; "fcmp.eq", {opcode = "fcmp.eq"; latency = 1; kind = KF; format = F_rr_rr}; "fcmp.ge", {opcode = "fcmp.ge"; latency = 1; kind = KF; format = F_rr_rr}; "fcmp.gt", {opcode = "fcmp.gt"; latency = 1; kind = KF; format = F_rr_rr}; "fcmp.le", {opcode = "fcmp.le"; latency = 1; kind = KF; format = F_rr_rr}; "fcmp.lt", {opcode = "fcmp.lt"; latency = 1; kind = KF; format = F_rr_rr}; "fcmp.neq", {opcode = "fcmp.neq"; latency = 1; kind = KF; format = F_rr_rr}; "fcvt.fx.trunc", {opcode = "fcvt.fx.trunc"; latency = 7; kind = KF; format = F_r_r}; "fcvt.xf", {opcode = "fcvt.xf"; latency = 5; kind = KF; format = F_r_r}; "fma.d", {opcode = "fma.d"; latency = 5; kind = KF; format = F_rrr_r}; "fmacond", {opcode = "fma.d.s0"; latency = 5; kind = KF; format = F_rrr_r_pred}; "fmas1cond", {opcode = "fma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred}; "fmads1cond", {opcode = "fma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred}; "fmpy.d", {opcode = "fmpy.d"; latency = 5; kind = KF; format = F_rr_r}; "fms.d", {opcode = "fms.d"; latency = 5; kind = KF; format = F_rrr_r}; "fneg", {opcode = "fneg"; latency = 1; kind = KF; format = F_r_r}; "fnma.d", {opcode = "fnma.d"; latency = 5; kind = KF; format = F_rrr_r}; "fnmas1cond", {opcode = "fnma.s1"; latency = 5; kind = KF; format = F_rrr_r_pred}; "fnmads1cond", {opcode = "fnma.d.s1"; latency = 5; kind = KF; format = F_rrr_r_pred}; "fnorm.d", {opcode = "fnorm.d"; latency = 5; kind = KF; format = F_r_r}; "frcpa", {opcode = "frcpa.s0"; latency = 5; kind = KF; format = F_rr_rr}; "fsub.d", {opcode = "fsub.d"; latency = 5; kind = KF; format = F_rr_r}; "getf.sig", {opcode = "getf.sig"; latency = 2; kind = KM; format = F_r_r}; "ld1", {opcode = "ld1"; latency = 2; kind = KM; format = F_ld}; "ld2", {opcode = "ld2"; latency = 2; kind = KM; format = F_ld}; "ld4", {opcode = "ld4"; latency = 2; kind = KM; format = F_ld}; "ld8", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld}; "ld8+", {opcode = "ld8"; latency = 2; kind = KM; format = F_ld_post}; "ldfd", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld}; "ldfd+", {opcode = "ldfd"; latency = 9; kind = KM; format = F_ld_post}; "ldfs", {opcode = "ldfs"; latency = 9; kind = KM; format = F_ld}; "mov", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r}; "movcond", {opcode = "mov"; latency = 1; kind = KA; format = F_r_r_pred}; "movtb", {opcode = "mov"; latency = 9; kind = KI; format = F_r_r}; "movfb", {opcode = "mov"; latency = 2; kind = KI; format = F_r_r}; "movi", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r}; "movicond", {opcode = "mov"; latency = 1; kind = KA; format = F_i_r_pred}; "movil", {opcode = "movl"; latency = 1; kind = KI; format = F_i_r}; "movpr", {opcode = "mov"; latency = 1; kind = KI; format = F_ri_r}; "or", {opcode = "or"; latency = 1; kind = KA; format = F_rr_r}; "ori", {opcode = "or"; latency = 1; kind = KA; format = F_ir_r}; "setf.d", {opcode = "setf.d"; latency = 8; kind = KM; format = F_r_r}; "setf.sig", {opcode = "setf.sig"; latency = 8; kind = KM; format = F_r_r}; "shl", {opcode = "shl"; latency = 2; kind = KI; format = F_rr_r}; "shladd", {opcode = "shladd"; latency = 1; kind = KA; format = F_r_rir}; "shli", {opcode = "shl"; latency = 1; kind = KI; format = F_ri_r}; "shr", {opcode = "shr"; latency = 2; kind = KI; format = F_rr_r}; "shri", {opcode = "shr"; latency = 1; kind = KI; format = F_ri_r}; "shru", {opcode = "shr.u"; latency = 2; kind = KI; format = F_rr_r}; "shrui", {opcode = "shr.u"; latency = 1; kind = KI; format = F_ri_r}; "st1", {opcode = "st1"; latency = 0; kind = KM; format = F_st}; "st2", {opcode = "st2"; latency = 0; kind = KM; format = F_st}; "st4", {opcode = "st4"; latency = 0; kind = KM; format = F_st}; "st8", {opcode = "st8"; latency = 0; kind = KM; format = F_st}; "st8+", {opcode = "st8"; latency = 1; kind = KM; format = F_st_post}; "stfd", {opcode = "stfd"; latency = 0; kind = KM; format = F_st}; "stfd+", {opcode = "stfd"; latency = 1; kind = KM; format = F_st_post}; "stfs", {opcode = "stfs"; latency = 0; kind = KM; format = F_st}; "sub", {opcode = "sub"; latency = 1; kind = KA; format = F_rr_r}; "sub1", {opcode = "sub"; latency = 1; kind = KA; format = F_rri_r}; "subi", {opcode = "sub"; latency = 1; kind = KA; format = F_ir_r}; "sxt1", {opcode = "sxt1"; latency = 1; kind = KI; format = F_r_r}; "sxt2", {opcode = "sxt2"; latency = 1; kind = KI; format = F_r_r}; "sxt4", {opcode = "sxt4"; latency = 1; kind = KI; format = F_r_r}; "tbit.nz", {opcode = "tbit.nz"; latency = 0; kind = KI; format = F_ri_rr}; "tbit.z", {opcode = "tbit.z"; latency = 0; kind = KI; format = F_ri_rr}; "xmpy.l", {opcode = "xmpy.l"; latency = 7; kind = KF; format = F_rr_r}; "xor", {opcode = "xor"; latency = 1; kind = KA; format = F_rr_r}; "xori", {opcode = "xor"; latency = 1; kind = KA; format = F_ir_r}; "#initbarrier", {opcode = "# init barrier"; latency = 0; kind = KI; format = F_i}; ] (* Nodes of the code DAG. Each node represents one instruction to be emitted. *) type code_dag_node = { instr: instruction_descr; (* the instruction *) imm: string; (* its immediate argument, if any *) iarg: resource array; (* arguments *) ires: resource array; (* results *) delay: int; (* how many cycles before result is available *) mutable sons: (code_dag_node * int) list; (* nodes that depend on this node *) mutable date: int; (* start date *) mutable length: int; (* length of longest path to result *) mutable ancestors: int; (* number of ancestors *) mutable emitted_ancestors: int } (* number of emitted ancestors *) (* The code dag itself is represented by two tables from resources to nodes: - "results" maps resources to the instructions that produced them; - "uses" maps resources to the instructions that use them. *) let code_results = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t) let code_uses = (Hashtbl.create 31 : (resource, code_dag_node) Hashtbl.t) let clear_code_dag () = Hashtbl.clear code_results; Hashtbl.clear code_uses (* The ready queue: a list of nodes that can be computed immediately (all arguments are available), kept sorted by decreasing length to results. The in progress queue: a list of nodes whose arguments are being computed, and thus can be computed at a later date, kept sorted by increasing availability date The branch list: a list of all branch instructions (to be emitted last) *) let ready_queue = ref ([] : code_dag_node list) let in_progress_queue = ref ([] : code_dag_node list) let branch_list = ref ([] : code_dag_node list) (* built in reverse order *) let clear_queues () = ready_queue := []; in_progress_queue := []; branch_list := [] let rec insert_queue prio node = function [] -> [node] | hd :: tl as queue -> if prio node hd then node :: queue else hd :: insert_queue prio node tl let length_prio n1 n2 = n1.length > n2.length let date_prio n1 n2 = n1.date < n2.date let add_ready node = ready_queue := insert_queue length_prio node !ready_queue let add_in_progress node = in_progress_queue := insert_queue date_prio node !in_progress_queue let add_branch node = branch_list := node :: !branch_list (* Add an edge to the code DAG *) let add_edge ancestor son delay = ancestor.sons <- (son, delay) :: ancestor.sons; son.ancestors <- son.ancestors + 1 let add_edge_after son ancestor = add_edge ancestor son 0 (* Add an instruction to the code DAG *) let insimm opc arg imm res = let instr = try Hashtbl.find instruction_table opc with Not_found -> fatal_error ("Unknown instruction " ^ opc) in let node = { instr = instr; imm = imm; iarg = arg; ires = res; delay = instr.latency; sons = []; (* to be filled later *) date = 0; (* to be adjusted later *) length = -1; (* to be computed later *) ancestors = 0; (* ditto *) emitted_ancestors = 0 } in (* ditto *) (* RAW dependencies: add edges from all instrs that define one of the resources used *) for i = 0 to Array.length arg - 1 do try let rsrc = arg.(i) in if is_mutable_resource rsrc then begin let anc = Hashtbl.find code_results rsrc in let delay = if is_memory_resource rsrc then 0 else anc.delay in (* Memory accesses are ordered by the hardware, so we can emit a memop 1, then a dependent memop 2 in the same cycle *) add_edge anc node delay end with Not_found -> () done; (* WAR dependencies: add edges from all instrs that use one of the resources defined by this instruction WAW dependencies: add edges from all instrs that define one of the resources defined by this instruction *) for i = 0 to Array.length res - 1 do let rsrc = res.(i) in if is_mutable_resource rsrc then begin (* WAR *) let anc = Hashtbl.find_all code_uses res.(i) in List.iter (add_edge_after node) anc; (* WAW *) try let anc = Hashtbl.find code_results rsrc in let delay = if is_memory_resource rsrc then 0 else 1 in add_edge anc node delay with Not_found -> () end done; (* Remember the results and uses of this instruction *) for i = 0 to Array.length res - 1 do Hashtbl.add code_results res.(i) node done; for i = 0 to Array.length arg - 1 do Hashtbl.add code_uses arg.(i) node done; (* Insert in appropriate queue *) if node.instr.kind = KB then add_branch node else if node.ancestors = 0 then add_ready node let insert opc arg res = insimm opc arg "" res (* Compute length of longest path to a result. *) let rec longest_path node = if node.length < 0 then begin node.length <- List.fold_left (fun len (son, delay) -> max len (longest_path son + delay)) 0 node.sons end; node.length (* Emit the assembly code for a node *) let emit_r = emit_string let emit_instr node = let opc = node.instr.opcode and a = node.iarg and r = node.ires and imm = node.imm in match node.instr.format with F_i -> ` {emit_string opc} {emit_string imm}\n` | F_i_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_string imm}\n` | F_ir_rr -> ` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_string imm}, {emit_r a.(0)}\n` | F_ir_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(0)}\n` | F_ir_r_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}, {emit_r a.(1)}\n` | F_ld -> ` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}]\n` | F_ld_post -> ` {emit_string opc} {emit_r r.(0)} = [{emit_r a.(0)}], {emit_string imm}\n` | F_r -> ` {emit_string opc} {emit_r a.(0)}\n` | F_i_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n` | F_i_r_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_string imm}\n` | F_ri_rr -> ` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_string imm}\n` | F_ri_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}\n` | F_r_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}\n` | F_r_r_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}\n` | F_rr_rr -> ` {emit_string opc} {emit_r r.(0)}, {emit_r r.(1)} = {emit_r a.(0)}, {emit_r a.(1)}\n` | F_r_rir -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_string imm}, {emit_r a.(1)}\n` | F_rr_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}\n` | F_rr_r_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}\n` | F_rri_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_string imm}\n` | F_rrr_r -> ` {emit_string opc} {emit_r r.(0)} = {emit_r a.(0)}, {emit_r a.(1)}, {emit_r a.(2)}\n` | F_rrr_r_pred -> ` ({emit_r a.(0)}) {emit_string opc} {emit_r r.(0)} = {emit_r a.(1)}, {emit_r a.(2)}, {emit_r a.(3)}\n` | F_st -> ` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}\n` | F_st_post -> ` {emit_string opc} [{emit_r a.(0)}] = {emit_r a.(1)}, {emit_string imm}\n` (* Little state machine reflecting how many instructions the chip can issue in one cycle. We roughly follow the Itanium model: 2 int units, 2 mem units, 2 FP units, and 3 branch units, with a maximum of 6 instructions dispatched per clock cycle. *) let num_A = ref 0 let num_I = ref 0 let num_M = ref 0 let num_F = ref 0 let num_B = ref 0 let reset_issue () = num_A := 0; num_I := 0; num_M := 0; num_F := 0; num_B := 0 let can_issue instr = if !num_A + !num_I + !num_M + !num_F + !num_B >= 6 then false else begin match instr.kind with KA -> if !num_A + !num_I + !num_M < 4 then (incr num_A; true) else false | KF -> if !num_F < 2 then (incr num_F; true) else false | KI -> if !num_I < 2 && !num_A + !num_I + !num_M < 4 then (incr num_I; true) else false | KM -> if !num_M < 2 && !num_A + !num_I + !num_M < 4 then (incr num_M; true) else false | _ (* KB | KB_exc *) -> if !num_B < 3 then (incr num_B; true) else false end (* Emit one node, updating the completion date and number of ancestors emitted for all nodes that depend on this node. Enter the nodes that are no longer waiting on anything (all ancestors emitted) in the ready queue or in the in_progress queue, depending on latency. *) let emit_node date node = begin try (*`# Date: {emit_int date}; distance: {emit_int node.length}\n`;*) emit_instr node with x -> fatal_error ("Error while emitting " ^ node.instr.opcode) end; List.iter (fun (son, delay) -> let completion_date = date + delay in if son.date < completion_date then son.date <- completion_date; son.emitted_ancestors <- son.emitted_ancestors + 1; if son.emitted_ancestors = son.ancestors && son.instr.kind <> KB then begin (*`# {emit_string son.instr.opcode} will be ready at {emit_int son.date}\n`;*) if son.date = date then add_ready son else add_in_progress son end) node.sons (* Emit all ready nodes that we can emit given the architectural constraints. *) let rec emit_ready_nodes filter date = match !ready_queue with [] -> [] | node :: rem -> ready_queue := rem; if filter node && can_issue node.instr then begin emit_node date node; emit_ready_nodes filter date end else node :: emit_ready_nodes filter date let filter_MF node = match node.instr.kind with KM -> true | KF -> true | _ -> false let filter_non_MF node = not(filter_MF node) (* Add all instructions with date <= d to the ready queue, and remove them *) let rec extract_ready d = function [] -> [] | node :: rem as queue -> if node.date <= d then (add_ready node; extract_ready d rem) else queue (* Say if a branch is ready to be emitted now *) let branch_is_ready date br = br.emitted_ancestors = br.ancestors && br.date <= date (* Schedule the basic block, emitting all of its instructions *) let rec reschedule date = match (!ready_queue, !in_progress_queue) with ([], []) -> (* We're done with the regular instructions; finish with the branches *) begin match !branch_list with [] -> () | br -> List.iter emit_instr br; emit_string " ;;\n" end | ([], node :: _) -> (* Advance to the time node.date, extracting from in_progress_queue all instructions ready at that time and adding them to the ready queue *) in_progress_queue := extract_ready node.date !in_progress_queue; (* Try again *) reschedule node.date | (_, _) -> ` # time {emit_int date}\n`; (* Emit and remove as many ready instructions as we can *) (* Give priority to M and F instructions *) reset_issue(); ready_queue := emit_ready_nodes filter_MF date; ready_queue := emit_ready_nodes filter_non_MF date; (* Special hack: if the only remaining instructions are branches and they are all ready now, emit them in the current group of instructions *) if !ready_queue = [] && !in_progress_queue = [] && List.for_all (branch_is_ready date) !branch_list then begin List.iter emit_instr !branch_list; branch_list := [] end; (* Emit a stop to pause the processor *) emit_string " ;;\n"; (* Advance to the time date + 1, extracting from in_progress_queue all instructions ready at that time and adding them to the ready queue *) in_progress_queue := extract_ready (date + 1) !in_progress_queue; (* Try again *) reschedule (date + 1) (* Emit the code for the current basic block *) let end_basic_block () = (* Compute critical paths and rebuild ready queue sorted by decreasing criticality *) let r = !ready_queue in ready_queue := []; let max_length = List.fold_left (fun len node -> max len (longest_path node)) 0 r in List.iter add_ready r; branch_list := List.rev !branch_list; (* Emit the instructions by traversing the code DAG *) reschedule 0; if max_length > 0 then ` # basic block length {emit_int max_length}\n`; clear_code_dag (); clear_queues () (************** Part 2: the code emitter *******************) (* Tradeoff between code size and code speed *) let fastcode_flag = ref true (* Translate or output a label *) let label lbl = sprintf ".L%d" lbl let emit_label lbl = emit_string ".L"; emit_int lbl (* Translate or output a symbol *) let symbol s = let b = Buffer.create (String.length s + 1) in for i = 0 to String.length s - 1 do let c = s.[i] in match c with 'A'..'Z' | 'a'..'z' | '0'..'9' | '_' -> Buffer.add_char b c | _ -> Buffer.add_string b (sprintf "$%02x" (Char.code c)) done; Buffer.add_char b '#'; Buffer.contents b let emit_symbol s = Emitaux.emit_symbol '$' s (* Translate a pseudo-register *) let reg r = match r.loc with Reg r -> register_name r | _ -> assert false let regs r = Array.map reg r (* Output a pseudo-register *) let emit_reg r = match r.loc with Reg r -> emit_string (register_name r) | _ -> fatal_error "Emit_ia64.emit_reg" (* Translate a float as a 64-bit integer *) let float_bits f = let b = Buffer.create 18 in let bytes = (Obj.magic f : string) in Buffer.add_string b "0x"; for i = 7 downto 0 do (* little-endian *) Buffer.add_string b (sprintf "%02x" (Char.code (String.unsafe_get bytes i))) done; Buffer.contents b (* Translate an "ltoffset" reference to a global *) let ltoffset s = sprintf "@ltoff(%s)" (symbol s) let ltoffset_fptr s = sprintf "@ltoff(@fptr(%s))" (symbol s) (* Layout of the stack frame. All stack offsets are shifted by 16 to preserve the scratch area at bottom of stack. *) let stack_offset = ref 0 let frame_size () = let size = !stack_offset + 8 * (num_stack_slots.(0) + num_stack_slots.(1)) + (if !contains_calls then 8 else 0) in Misc.align size 16 let slot_offset loc cl = match loc with Incoming n -> frame_size() + n + 16 | Local n -> if cl = 0 then !stack_offset + n * 8 + 16 else !stack_offset + (num_stack_slots.(0) + n) * 8 + 16 | Outgoing n -> n + 16 let slot_offset_reg r = match r.loc with Stack l -> slot_offset l (register_class r) | _ -> assert false (* Record live pointers at call points *) type frame_descr = { fd_lbl: int; (* Return address *) fd_frame_size: int; (* Size of stack frame *) fd_live_offset: int list } (* Offsets/regs of live addresses *) let frame_descriptors = ref([] : frame_descr list) let record_frame_label live = let lbl = new_label() in let live_offset = ref [] in Reg.Set.iter (function {typ = Addr; loc = Reg r} -> live_offset := ((r lsl 1) + 1) :: !live_offset | {typ = Addr; loc = Stack s} as reg -> live_offset := slot_offset s (register_class reg) :: !live_offset | _ -> ()) live; frame_descriptors := { fd_lbl = lbl; fd_frame_size = frame_size(); fd_live_offset = !live_offset } :: !frame_descriptors; lbl let record_frame live = let lbl = record_frame_label live in `{emit_label lbl}:` let emit_frame fd = ` data8 {emit_label fd.fd_lbl}\n`; ` data2 {emit_int fd.fd_frame_size}\n`; ` data2 {emit_int (List.length fd.fd_live_offset)}\n`; List.iter (fun n -> ` data2 {emit_int n}\n`) fd.fd_live_offset; ` .align 8\n` (* Names of various instructions *) let name_for_int_operation = function Iadd -> "add" | Isub -> "sub" | Iand -> "and" | Ior -> "or" | Ixor -> "xor" | Ilsl -> "shl" | Ilsr -> "shru" | Iasr -> "shr" | _ -> Misc.fatal_error "Emit.name_for_int_operation" let name_for_float_operation = function Inegf -> "fneg" | Iabsf -> "fabs" | Iaddf -> "fadd.d" | Isubf -> "fsub.d" | Imulf -> "fmpy.d" | _ -> Misc.fatal_error "Emit.name_for_float_operation" let name_for_specific_operation = function Imultaddf -> "fma.d" | Imultsubf -> "fms.d" | Isubmultf -> "fnma.d" | _ -> Misc.fatal_error "Emit.name_for_specific_operation" let name_for_int_comparison = function Isigned Ceq -> "eq" | Isigned Cne -> "ne" | Isigned Cle -> "le" | Isigned Cgt -> "gt" | Isigned Clt -> "lt" | Isigned Cge -> "ge" | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "leu" | Iunsigned Cgt -> "gtu" | Iunsigned Clt -> "ltu" | Iunsigned Cge -> "geu" let name_for_swapped_int_comparison = function Isigned Ceq -> "eq" | Isigned Cne -> "ne" | Isigned Cle -> "ge" | Isigned Cgt -> "lt" | Isigned Clt -> "gt" | Isigned Cge -> "le" | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "geu" | Iunsigned Cgt -> "ltu" | Iunsigned Clt -> "gtu" | Iunsigned Cge -> "leu" let name_for_float_comparison cmp = match cmp with Ceq -> "eq" | Cne -> "neq" | Cle -> "le" | Cgt -> "gt" | Clt -> "lt" | Cge -> "ge" (* Immediate range for addl (move) and adds (general add) instructions *) let is_immediate_addl n = n >= -0x200000 && n < 0x200000 let is_immediate_addl_nat n = n >= Nativeint.of_int (-0x200000) && n < Nativeint.of_int 0x200000 let is_immediate_adds n = n >= -0x2000 && n < 0x2000 (* Return the positions of all "1" bits in the given integer, most significant bits first *) let ones_pos n = let rec ones p accu = if p >= 63 then accu else ones (p+1) (if n land (1 lsl p) = 0 then accu else p :: accu) in ones 0 [] (* Generate temporary registers *) let temp_generator temporaries = let counter = ref 0 in fun () -> let r = temporaries.(!counter) in incr counter; if !counter >= Array.length temporaries then counter := 0; r let new_temp_reg = temp_generator [| "r2"; "r3"; "r14"; "r15" |] let new_temp_float = temp_generator [| "f64"; "f65"; "f66"; "f67"; "f68"; "f69"; "f70"; "f71" |] let new_pred = temp_generator [| "p2"; "p3"; "p4"; "p5" |] (* Output the assembly code for an instruction *) (* Name of current function *) let function_name = ref "" (* Entry point for tail recursive calls *) let tailrec_entry_point = ref 0 let emit_instr i = match i.desc with Lend -> () | Lop(Imove | Ispill | Ireload) -> let src = i.arg.(0) and dst = i.res.(0) in if src.loc <> dst.loc then begin match (src.loc, dst.loc) with (Reg _, Reg _) -> insert "mov" (regs i.arg) (regs i.res) | (Reg _, Stack _) -> let offset = string_of_int (slot_offset_reg dst) in let r = new_temp_reg() in insimm "addi" [| "sp" |] offset [| r |]; insert (if i.res.(0).typ = Float then "stfd" else "st8") [| r; reg src |] [| "stk" ^ offset |] | (Stack _, Reg _) -> let offset = string_of_int (slot_offset_reg src) in let r = new_temp_reg() in insimm "addi" [| "sp" |] offset [| r |]; insert (if i.arg.(0).typ = Float then "ldfd" else "ld8") [| r; "stk" ^ offset |] (regs i.res) | (_, _) -> assert false end | Lop(Iconst_int n) -> let instr = if is_immediate_addl_nat n then "movi" else "movil" in insimm instr [||] (Nativeint.to_string n) (regs i.res) | Lop(Iconst_float s) -> let f = float_of_string s in begin match Int64.bits_of_float f with | 0x0000_0000_0000_0000L -> (* +0.0 *) insert "mov" [| "f0" |] (regs i.res) | 0x3FF0_0000_0000_0000L -> (* 1.0 *) insert "mov" [| "f1" |] (regs i.res) | _ -> let tmp = new_temp_reg() in insimm "movil" [||] (float_bits f) [| tmp |]; insert "setf.d" [| tmp |] (regs i.res) end | Lop(Iconst_symbol s) -> insimm "addi" [| "gp" |] (ltoffset s) (regs i.res); insert "ld8" (regs i.res) (regs i.res) | Lop(Icall_ind) -> insert "movtb" (regs i.arg) [| "b0" |]; insert "brcallind" [| "b0" |] [| "b0" |]; end_basic_block(); `{record_frame i.live}\n` | Lop(Icall_imm s) -> insimm "brcall" [||] (symbol s) [| "b0" |]; end_basic_block(); `{record_frame i.live}\n` | Lop(Itailcall_ind) -> let n = frame_size() in insert "movtb" (regs i.arg) [| "b6" |]; if !contains_calls then begin let tmp = new_temp_reg() in insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |]; insert "ld8" [| tmp |] [| tmp |]; insert "mov" [| tmp |] [| "b0" |] end; if n > 0 then insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |]; insert "brind" [| "b6" |] [||]; end_basic_block() | Lop(Itailcall_imm s) -> if s = !function_name then begin insimm "br" [||] (label !tailrec_entry_point) [||] end else begin let n = frame_size() in if !contains_calls then begin let tmp = new_temp_reg() in insimm "addi" [| "sp" |] (string_of_int (n + 8)) [| tmp |]; insert "ld8" [| tmp |] [| tmp |]; insert "mov" [| tmp |] [| "b0" |] end; if n > 0 then insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |]; insimm "br" [||] (symbol s) [||] end; end_basic_block() | Lop(Iextcall(s, alloc)) -> if alloc then begin let tmp = new_temp_reg() in insimm "addi" [| "gp" |] (ltoffset_fptr s) [| tmp |]; insert "ld8" [| tmp |] [| "r2" |]; insimm "brcall" [||] "caml_c_call#" [| "b0" |]; end_basic_block(); `{record_frame i.live}\n` end else begin insert "mov" [| "gp" |] [| "r7" |]; insimm "brcall" [||] (symbol s) [| "b0" |]; end_basic_block(); insert "mov" [| "r7" |] [| "gp" |] end | Lop(Istackoffset n) -> end_basic_block(); insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |]; stack_offset := !stack_offset + n | Lop(Iload(chunk, addr)) -> let load_instr = match chunk with | Byte_unsigned -> "ld1" | Byte_signed -> "ld1" | Sixteen_unsigned -> "ld2" | Sixteen_signed -> "ld2" | Thirtytwo_unsigned -> "ld4" | Thirtytwo_signed -> "ld4" | Word -> "ld8" | Single -> "ldfs" | Double -> "ldfd" | Double_u -> "ldfd" in insert load_instr [| reg i.arg.(0); "heap" |] (regs i.res); let sext_instr = match chunk with Byte_signed -> "sxt1" | Sixteen_signed -> "sxt2" | Thirtytwo_signed -> "sxt4" | _ -> "" in if sext_instr <> "" then insert sext_instr (regs i.res) (regs i.res) | Lop(Istore(chunk, addr)) -> let store_instr = match chunk with | Byte_unsigned -> "st1" | Byte_signed -> "st1" | Sixteen_unsigned -> "st2" | Sixteen_signed -> "st2" | Thirtytwo_unsigned -> "st4" | Thirtytwo_signed -> "st4" | Word -> "st8" | Single -> "stfs" | Double -> "stfd" | Double_u -> "stfd" in insert store_instr [| reg i.arg.(1); reg i.arg.(0) |] [| "heap" |] | Lop(Ialloc n) -> if !fastcode_flag then begin insimm "addi" [| "r4" |] (string_of_int (-n)) [| "r4" |]; insert "cmp.ltu" [| "r4"; "r5" |] [| "p6"; "p0" |]; insimm "movi" [||] (string_of_int n) [| "r2" |]; insimm "brcallcond" [| "p6" |] "caml_call_gc#" [| "b0" |]; end_basic_block(); `{record_frame i.live}\n`; insimm "addi" [| "r4" |] "8" (regs i.res) end else begin insimm "movi" [||] (string_of_int n) [| "r2" |]; insimm "brcall" [||] "caml_allocN#" [| "b0" |]; end_basic_block(); `{record_frame i.live}\n`; insimm "addi" [| "r4" |] "8" (regs i.res) end | Lop(Iintop Imul) -> let t1 = new_temp_float() and t2 = new_temp_float() in insert "setf.sig" [|reg i.arg.(0)|] [| t1 |]; insert "setf.sig" [|reg i.arg.(1)|] [| t2 |]; insert "xmpy.l" [| t1; t2 |] [| t1 |]; insert "getf.sig" [| t1 |] (regs i.res) | Lop(Iintop(Icomp cmp)) -> let comp = "cmpp." ^ name_for_int_comparison cmp in let p1 = new_pred() and p2 = new_pred() in insert comp (regs i.arg) [| p1; p2 |]; insimm "movicond" [| p1 |] "1" (regs i.res); insimm "movicond" [| p2 |] "0" (regs i.res) | Lop(Iintop(Icheckbound)) -> insert "cmp.leu" (regs i.arg) [| "p6"; "p0" |]; insimm "brcallcondexc" [| "p6" |] "caml_ml_array_bound_error#" [| "b0"; "heap" |] | Lop(Iintop op) -> let instr = name_for_int_operation op in insert instr (regs i.arg) (regs i.res) | Lop(Iintop_imm(Imul, n)) -> let src = reg i.arg.(0) and dst = reg i.res.(0) in begin match ones_pos n with [] -> insimm "movi" [||] "0" [|dst|] | [n] -> insimm "shli" [|src|] (string_of_int n) [|dst|] | [n; 0] when n <= 4 -> insimm "shladd" [|src; src|] (string_of_int n) [|dst|] | n1::n2::lst -> let acc1 = new_temp_reg() and acc2 = new_temp_reg() and tmp1 = new_temp_reg() and tmp2 = new_temp_reg() in insimm "shli" [|src|] (string_of_int n1) [|acc1|]; insimm "shli" [|src|] (string_of_int n2) [|acc2|]; let rec add_shifts a1 t1 a2 t2 = function [] -> insert "add" [|a1; a2|] [|dst|] | n::rem -> if n = 0 then insert "add" [|src; a1|] [|a1|] else if n <= 4 then insimm "shladd" [|src; a1|] (string_of_int n) [|a1|] else begin insimm "shli" [|src|] (string_of_int n) [|t1|]; insert "add" [|t1; a1|] [|a1|] end; add_shifts a2 t2 a1 t1 rem in add_shifts acc1 tmp1 acc2 tmp2 lst end | Lop(Iintop_imm(Idiv, n)) -> (* n must be a power of 2 *) let src = regs i.arg and dst = regs i.res in let p1 = new_pred() and p2 = new_pred() in let l = Misc.log2 n in insert "cmpp.lt" [| src.(0); "r0" |] [| p1; p2 |]; if is_immediate_adds (n-1) then insimm "addicond" [| p1; src.(0) |] (string_of_int (n-1)) dst else begin let moveop = if is_immediate_addl (n-1) then "movi" else "movil" in insimm moveop [||] (string_of_int (n-1)) [| "r2" |]; insert "addcond" [| p1; src.(0); "r2" |] dst end; insert "movcond" [| p2; src.(0) |] dst; insimm "shri" dst (string_of_int l) dst | Lop(Iintop_imm(Imod, n)) -> (* n must be a power of 2 *) let src = regs i.arg and dst = regs i.res in let p = new_pred() in let l = Misc.log2 n in insert "cmpp.lt" [| src.(0); "r0" |] [| p; "p0" |]; insimm "extr.u" src (sprintf "0, %d" l) dst; insert "cmpp.ne.and" [| dst.(0); "r0"; p |] [| p; "p0" |]; if is_immediate_adds (-n) then insimm "addicond" [| p; dst.(0) |] (string_of_int (-n)) dst else begin let moveop = if is_immediate_addl (-n) then "movi" else "movil" in insimm moveop [||] (string_of_int (-n)) [| "r2" |]; insert "addcond" [| p; dst.(0); "r2" |] dst end | Lop(Iintop_imm(Icomp cmp, n)) -> let comp = "cmppi." ^ name_for_swapped_int_comparison cmp in let p1 = new_pred() and p2 = new_pred() in insimm comp (regs i.arg) (string_of_int n) [| p1; p2 |]; insimm "movicond" [| p1 |] "1" (regs i.res); insimm "movicond" [| p2 |] "0" (regs i.res) | Lop(Iintop_imm(Icheckbound, n)) -> insimm "cmpi.geu" (regs i.arg) (string_of_int n) [| "p6"; "p0" |]; insimm "brcallcondexc" [| "p6" |] "caml_ml_array_bound_error#" [| "b0"; "heap" |] | Lop(Iintop_imm(op, n)) -> let instr = name_for_int_operation op ^ "i" in insimm instr (regs i.arg) (string_of_int n) (regs i.res) | Lop(Inegf | Iabsf | Iaddf | Isubf | Imulf as op) -> let instr = name_for_float_operation op in insert instr (regs i.arg) (regs i.res) | Lop(Idivf) -> (* Straight from the IA64 application developer's architecture guide, section 13.3.3.1. Modified so that the destination may be equal to one of the operands *) let a = reg i.arg.(0) and b = reg i.arg.(1) and r = reg i.res.(0) and t1 = new_temp_float() and t2 = new_temp_float() and t3 = new_temp_float() and t4 = new_temp_float() and p = new_pred() in insert "frcpa" [| a; b |] [| t1; p |]; insert "fmas1cond" [| p; a; t1; "f0" |] [| t2 |]; insert "fnmas1cond" [| p; b; t1; "f1" |] [| t3 |]; insert "fmas1cond" [| p; t3; t3; t2 |] [| t2 |]; insert "fmas1cond" [| p; t3; t3; "f0" |] [| t4 |]; insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |]; insert "fmas1cond" [| p; t4; t2; t2 |] [| t2 |]; insert "fmas1cond" [| p; t4; t4; "f0" |] [| t3 |]; insert "fmas1cond" [| p; t4; t1; t1 |] [| t1 |]; insert "fmads1cond" [| p; t3; t2; t2 |] [| t2 |]; insert "fmas1cond" [| p; t3; t1; t1 |] [| t1 |]; insert "fnmads1cond" [| p; b; t2; a |] [| t3 |]; insert "mov" [| t1 |] [| r |]; insert "fmacond" [| p; t3; t1; t2 |] [| r |] | Lop(Ifloatofint) -> let src = regs i.arg and dst = regs i.res in insert "setf.sig" src dst; insert "fcvt.xf" dst dst; insert "fnorm.d" dst dst | Lop(Iintoffloat) -> let src = regs i.arg and dst = regs i.res and tmp = new_temp_float() in insert "fcvt.fx.trunc" src [| tmp |]; insert "getf.sig" [| tmp |] dst | Lop(Ispecific(Iadd1)) -> let s = if Array.length i.arg >= 2 then 1 else 0 in insimm "add1" [| reg i.arg.(0); reg i.arg.(s) |] "1" (regs i.res) | Lop(Ispecific(Isub1)) -> insimm "sub1" (regs i.arg) "1" (regs i.res) | Lop(Ispecific(Ishladd n)) -> insimm "shladd" (regs i.arg) (string_of_int n) (regs i.res) | Lop(Ispecific(Isignextend n)) -> let op = "sxt" ^ string_of_int n in insert op (regs i.arg) (regs i.res) | Lop(Ispecific (Imultaddf | Imultsubf | Isubmultf as sop)) -> let name = name_for_specific_operation sop in insert name (regs i.arg) (regs i.res) | Lop(Ispecific (Istoreincr n)) -> let op = if i.arg.(1).typ = Float then "stfd+" else "st8+" in insimm op [| reg i.arg.(0); reg i.arg.(1) |] (string_of_int n) [| reg i.res.(0); "heapinit" |] | Lop(Ispecific Iinitbarrier) -> insert "#initbarrier" [| "heapinit" |] [| "heap" |] | Lreloadretaddr -> let n = frame_size() + 8 in let tmp = new_temp_reg() in insimm "addi" [| "sp" |] (string_of_int n) [| tmp |]; insert "ld8" [| tmp |] [| tmp |]; insert "movtb" [| tmp |] [| "b0" |] | Lreturn -> let n = frame_size() in if n > 0 then insimm "addi" [| "sp" |] (string_of_int n) [| "sp" |]; insert "brret" [| "b0" |] [||]; end_basic_block() | Llabel lbl -> end_basic_block(); `{emit_label lbl}:\n` | Lbranch lbl -> insimm "br" [||] (label lbl) [||]; end_basic_block() | Lcondbranch(tst, lbl) -> begin match tst with Itruetest -> insimm "cmpi.ne" (regs i.arg) "0" [| "p6"; "p0" |] | Ifalsetest -> insimm "cmpi.eq" (regs i.arg) "0" [| "p6"; "p0" |] | Iinttest cmp -> let comp = "cmp." ^ name_for_int_comparison cmp in insert comp (regs i.arg) [| "p6"; "p0" |] | Iinttest_imm(cmp, n) -> let comp = "cmpi." ^ name_for_swapped_int_comparison cmp in insimm comp (regs i.arg) (string_of_int n) [| "p6"; "p0" |] | Ifloattest(cmp, neg) -> let comp = "fcmp." ^ name_for_float_comparison cmp in insert comp (regs i.arg) (if neg then [| "p0"; "p6" |] else [| "p6"; "p0" |]) | Ioddtest -> insimm "tbit.nz" (regs i.arg) "0" [| "p6"; "p0" |] | Ieventest -> insimm "tbit.z" (regs i.arg) "0" [| "p6"; "p0" |] end; insimm "brcond" [| "p6" |] (label lbl) [||]; end_basic_block() | Lcondbranch3(lbl0, lbl1, lbl2) -> end_basic_block(); let emit_compare n p = function None -> () | Some lbl -> ` cmp.eq p{emit_int p}, p0 = {emit_int n}, {emit_reg i.arg.(0)}\n` in let emit_branch p = function None -> () | Some lbl -> ` (p{emit_int p}) br {emit_label lbl}\n` in emit_compare 0 5 lbl0; emit_compare 1 6 lbl1; emit_compare 2 7 lbl2; emit_branch 5 lbl0; emit_branch 6 lbl1; emit_branch 7 lbl2; ` ;;\n` | Lswitch jumptbl -> end_basic_block(); let numcases = Array.length jumptbl in if numcases <= 9 then begin for j = 0 to numcases / 3 do let n = j * 3 in for k = 0 to 2 do if n + k < numcases - 1 then ` cmp.eq p{emit_int(k+5)}, p0 = {emit_int (n+k)}, {emit_reg i.arg.(0)}\n` done; for k = 0 to 2 do if n + k < numcases - 1 then ` (p{emit_int(k+5)}) br {emit_label jumptbl.(n+k)}\n` else if n + k = numcases - 1 then ` br {emit_label jumptbl.(n+k)}\n` done; ` ;;\n` done end else if numcases <= 47 then begin ` mov r2 = 1\n`; ` cmp.eq p6, p0 = 0, {emit_reg i.arg.(0)}\n`; ` (p6) br {emit_label jumptbl.(0)} ;;\n`; ` shl r2 = r2, {emit_reg i.arg.(0)}\n`; ` cmp.eq p7, p0 = 1, {emit_reg i.arg.(0)}\n`; ` (p7) br {emit_label jumptbl.(1)} ;;\n`; ` mov pr = r2, -1 ;;\n`; for i = 2 to numcases - 1 do ` (p{emit_int i}) br {emit_label jumptbl.(i)}\n` done; ` ;;\n` end else begin let lbl_jumptbl = new_label() in let lbl_ip = new_label() in `{emit_label lbl_ip}: mov r2 = ip ;;\n`; ` add r2 = {emit_label lbl_jumptbl} - {emit_label lbl_ip}, r2 ;;\n`; ` shladd r3 = {emit_reg i.arg.(0)}, 2, r2 ;;\n`; ` ld4 r3 = [r3] ;;\n`; ` sxt4 r3 = r3 ;;\n`; ` add r2 = r2, r3 ;;\n`; ` mov b6 = r2 ;;\n`; ` br b6 ;;\n`; ` .align 4\n`; `{emit_label lbl_jumptbl}:\n`; for i = 0 to numcases - 1 do ` data4 {emit_label jumptbl.(i)} - {emit_label lbl_jumptbl}\n` done; ` .align 16\n` end | Lsetuptrap lbl -> end_basic_block(); let lbl_ip = new_label() in let lbl_next = new_label() in `{emit_label lbl_ip}: mov r2 = ip ;;\n`; ` add r2 = {emit_label lbl_next} - {emit_label lbl_ip}, r2\n`; ` br.sptk {emit_label lbl} ;;\n`; `{emit_label lbl_next}:\n` | Lpushtrap -> end_basic_block(); stack_offset := !stack_offset + 16; (* Store trap pointer at sp, handler addr at sp+8, and decrement sp by 16. Remember, the bottom 16 bytes of the stack must be left free. *) ` add r3 = 8, sp\n`; ` st8 [sp] = r6, -16 ;;\n`; ` st8 [r3] = r2\n`; ` add r6 = 16, sp ;;\n` | Lpoptrap -> end_basic_block(); ` add sp = 16, sp ;;\n`; ` ld8 r6 = [sp] ;;\n`; stack_offset := !stack_offset - 16 | Lraise -> end_basic_block(); ` mov sp = r6\n`; ` add r2 = 8, r6\n`; ` ld8 r6 = [r6] ;;\n`; ` ld8 r2 = [r2] ;;\n`; ` mov b6 = r2 ;;\n`; ` br b6\n` let rec emit_all i = match i.desc with Lend -> () | _ -> emit_instr i; emit_all i.next (* Check if a function contains a tail call to itself *) let rec is_tailrec i = match i.desc with Lend -> false | Lop(Itailcall_imm s) when s = !function_name -> true | _ -> is_tailrec i.next (* Emission of a function declaration *) let fundecl f = function_name := f.fun_name; fastcode_flag := f.fun_fast; stack_offset := 0; ` .text\n`; ` .align 4\n`; ` .global {emit_symbol f.fun_name}#\n`; ` .proc {emit_symbol f.fun_name}#\n`; `{emit_symbol f.fun_name}:\n`; let n = frame_size() in if !contains_calls then begin insert "movfb" [| "b0" |] [| "r2" |]; insimm "addi" [| "sp" |] "8" [| "r3" |]; insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |]; insert "st8" [| "r3"; "r2" |] [||] end else if n > 0 then insimm "addi" [| "sp" |] (string_of_int (-n)) [| "sp" |]; if is_tailrec f.fun_body then begin tailrec_entry_point := new_label(); end_basic_block(); `{emit_label !tailrec_entry_point}:\n` end; emit_all f.fun_body; end_basic_block(); ` .endp {emit_symbol f.fun_name}#\n` (* Emission of data *) let emit_global_symbol s = ` .global {emit_symbol s}#\n`; ` .type {emit_symbol s}#, @object\n`; ` .size {emit_symbol s}#, 8\n` let emit_define_symbol s = emit_global_symbol s; `{emit_symbol s}:\n` let emit_item = function Cglobal_symbol s -> emit_global_symbol s | Cdefine_symbol s -> `{emit_symbol s}:\n` | Cdefine_label lbl -> `{emit_label (100000 + lbl)}:\n` | Cint8 n -> ` data1 {emit_int n}\n` | Cint16 n -> ` data2 {emit_int n}\n` | Cint32 n -> let n' = Nativeint.shift_right (Nativeint.shift_left n 32) 32 in ` data4 {emit_nativeint n'}\n` | Cint n -> ` data8 {emit_nativeint n}\n` | Csingle f -> emit_float32_directive "data4" f | Cdouble f -> emit_float64_directive "data8" f | Csymbol_address s -> ` data8 {emit_symbol s}#\n` | Clabel_address lbl -> ` data8 {emit_label (100000 + lbl)}\n` | Cstring s -> emit_string_directive " string " s | Cskip n -> if n > 0 then ` .skip {emit_int n}\n` | Calign n -> ` .align {emit_int n}\n` let data l = ` .data\n`; ` .align 8\n`; List.iter emit_item l (* Beginning / end of an assembly file *) let begin_assembly() = ` .data\n`; emit_define_symbol (Compilenv.make_symbol (Some "data_begin")); ` .text\n`; emit_define_symbol (Compilenv.make_symbol (Some "code_begin")) let end_assembly () = ` .data\n`; emit_define_symbol (Compilenv.make_symbol (Some "data_end")); ` .text\n`; emit_define_symbol (Compilenv.make_symbol (Some "code_end")); ` .rodata\n`; ` .align 8\n`; emit_define_symbol (Compilenv.make_symbol (Some "frametable")); ` data8 {emit_int (List.length !frame_descriptors)}\n`; List.iter emit_frame !frame_descriptors; frame_descriptors := []