1 files changed, 620 insertions, 299 deletions
diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index dd995f26be..b0baf86523 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -2,11 +2,12 @@
 (*                                                                     *)
 (*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
@@ -36,16 +37,25 @@ let emit_label lbl =
 let emit_data_label lbl =
   emit_string ".Ld"; emit_int lbl
 
-(* Output a symbol *)
+(* Symbols *)
 
 let emit_symbol s =
   Emitaux.emit_symbol '$' s
 
+let emit_call s =
+  if !Clflags.dlcode || !pic_code
+  then `bl	{emit_symbol s}(PLT)`
+  else `bl	{emit_symbol s}`
+
+let emit_jump s =
+  if !Clflags.dlcode || !pic_code
+  then `b	{emit_symbol s}(PLT)`
+  else `b	{emit_symbol s}`
+
 (* Output a pseudo-register *)
 
-let emit_reg r =
-  match r.loc with
-  | Reg r -> emit_string (register_name r)
+let emit_reg = function
+    {loc = Reg r} -> emit_string (register_name r)
   | _ -> fatal_error "Emit_arm.emit_reg"
 
 (* Layout of the stack frame *)
@@ -56,14 +66,23 @@ let frame_size () =
   let sz =
     !stack_offset +
     4 * num_stack_slots.(0) +
+    8 * num_stack_slots.(1) +
+    8 * num_stack_slots.(2) +
     (if !contains_calls then 4 else 0)
   in Misc.align sz 8
 
 let slot_offset loc cl =
   match loc with
-    Incoming n -> frame_size() + n
-  | Local n -> !stack_offset + n * 4
-  | Outgoing n -> n
+    Incoming n ->
+      assert (n >= 0);
+      frame_size() + n
+  | Local n ->
+      if cl = 0
+      then !stack_offset + n * 4
+      else !stack_offset + num_stack_slots.(0) * 4 + n * 8
+  | Outgoing n ->
+      assert (n >= 0);
+      n
 
 (* Output a stack reference *)
 
@@ -82,20 +101,13 @@ let emit_addressing addr r n =
 
 (* Record live pointers at call points *)
 
-type frame_descr =
-  { fd_lbl: int;                        (* Return address *)
-    fd_frame_size: int;                 (* Size of stack frame *)
-    fd_live_offset: int list }          (* Offsets/regs of live addresses *)
-
-let frame_descriptors = ref([] : frame_descr list)
-
-let record_frame live =
+let record_frame_label live dbg =
   let lbl = new_label() in
   let live_offset = ref [] in
   Reg.Set.iter
     (function
         {typ = Addr; loc = Reg r} ->
-          live_offset := (r lsl 1) + 1 :: !live_offset
+          live_offset := ((r lsl 1) + 1) :: !live_offset
       | {typ = Addr; loc = Stack s} as reg ->
           live_offset := slot_offset s (register_class reg) :: !live_offset
       | _ -> ())
@@ -103,18 +115,57 @@ let record_frame live =
   frame_descriptors :=
     { fd_lbl = lbl;
       fd_frame_size = frame_size();
-      fd_live_offset = !live_offset } :: !frame_descriptors;
-  `{emit_label lbl}:`
-
-let emit_frame fd =
-  `	.word	{emit_label fd.fd_lbl} + 4\n`;
-  `	.short	{emit_int fd.fd_frame_size}\n`;
-  `	.short	{emit_int (List.length fd.fd_live_offset)}\n`;
-  List.iter
-    (fun n ->
-      `	.short	{emit_int n}\n`)
-    fd.fd_live_offset;
-  `	.align	2\n`
+      fd_live_offset = !live_offset;
+      fd_debuginfo = dbg } :: !frame_descriptors;
+  lbl
+
+let record_frame live dbg =
+  let lbl = record_frame_label live dbg in `{emit_label lbl}:`
+
+(* Record calls to the GC -- we've moved them out of the way *)
+
+type gc_call =
+  { gc_lbl: label;                      (* Entry label *)
+    gc_return_lbl: label;               (* Where to branch after GC *)
+    gc_frame_lbl: label }               (* Label of frame descriptor *)
+
+let call_gc_sites = ref ([] : gc_call list)
+
+let emit_call_gc gc =
+  `{emit_label gc.gc_lbl}:	{emit_call "caml_call_gc"}\n`;
+  `{emit_label gc.gc_frame_lbl}:	b	{emit_label gc.gc_return_lbl}\n`
+
+(* Record calls to caml_ml_array_bound_error.
+   In debug mode, we maintain one call to caml_ml_array_bound_error
+   per bound check site. Otherwise, we can share a single call. *)
+
+type bound_error_call =
+  { bd_lbl: label;                    (* Entry label *)
+    bd_frame_lbl: label }             (* Label of frame descriptor *)
+
+let bound_error_sites = ref ([] : bound_error_call list)
+
+let bound_error_label dbg =
+  if !Clflags.debug || !bound_error_sites = [] then begin
+    let lbl_bound_error = new_label() in
+    let lbl_frame = record_frame_label Reg.Set.empty dbg in
+    bound_error_sites :=
+      { bd_lbl = lbl_bound_error;
+        bd_frame_lbl = lbl_frame } :: !bound_error_sites;
+    lbl_bound_error
+  end else begin
+    let bd = List.hd !bound_error_sites in bd.bd_lbl
+  end
+
+let emit_call_bound_error bd =
+  `{emit_label bd.bd_lbl}:	{emit_call "caml_ml_array_bound_error"}\n`;
+  `{emit_label bd.bd_frame_lbl}:\n`
+
+(* Negate a comparison *)
+
+let negate_integer_comparison = function
+    Isigned cmp   -> Isigned(negate_comparison cmp)
+  | Iunsigned cmp -> Iunsigned(negate_comparison cmp)
 
 (* Names of various instructions *)
 
@@ -124,22 +175,13 @@ let name_for_comparison = function
   | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "ls"
   | Iunsigned Cge -> "cs" | Iunsigned Clt -> "cc" | Iunsigned Cgt -> "hi"
 
-let name_for_float_comparison cmp neg =
-  match cmp with
-    Ceq -> if neg then "ne" else "eq"
-  | Cne -> if neg then "eq" else "ne"
-  | Cle -> if neg then "hi" else "ls"
-  | Cge -> if neg then "lt" else "ge"
-  | Clt -> if neg then "pl" else "mi"
-  | Cgt -> if neg then "le" else "gt"
-
 let name_for_int_operation = function
     Iadd -> "add"
   | Isub -> "sub"
   | Imul -> "mul"
-  | Iand  -> "and"
-  | Ior   -> "orr"
-  | Ixor  -> "eor"
+  | Iand -> "and"
+  | Ior  -> "orr"
+  | Ixor -> "eor"
   | _ -> assert false
 
 let name_for_shift_operation = function
@@ -148,193 +190,306 @@ let name_for_shift_operation = function
   | Iasr -> "asr"
   | _ -> assert false
 
-let name_for_shift_int_operation = function
-    Ishiftadd -> "add"
-  | Ishiftsub -> "sub"
-  | Ishiftsubrev -> "rsb"
-
-(* Recognize immediate operands *)
-
-(* Immediate operands are 8-bit immediate values, zero-extended, and rotated
-   right by 0, 2, 4, ... 30 bits.
-   We check only with 8-bit values shifted left 0 to 24 bits. *)
-
-let rec is_immed n shift =
-  shift <= 24 &&
-  (Nativeint.logand n (Nativeint.shift_left (Nativeint.of_int 0xFF) shift) = n
-   || is_immed n (shift + 2))
-
-let is_immediate n = is_immed n 0
-
 (* General functional to decompose a non-immediate integer constant
-   into 8-bit chunks shifted left 0 ... 24 bits *)
+   into 8-bit chunks shifted left 0 ... 30 bits. *)
 
 let decompose_intconst n fn =
   let i = ref n in
   let shift = ref 0 in
   let ninstr = ref 0 in
-  while !i <> 0n do
-    if Nativeint.to_int (Nativeint.shift_right !i !shift) land 3 = 0 then
+  while !i <> 0l do
+    if Int32.logand (Int32.shift_right !i !shift) 3l = 0l then
       shift := !shift + 2
     else begin
-      let mask = Nativeint.shift_left 0xFFn !shift in
-      let bits = Nativeint.logand !i mask in
-      fn bits;
+      let bits = Int32.logand !i (Int32.shift_left 0xffl !shift) in
+      i := Int32.sub !i bits;
       shift := !shift + 8;
-      i := Nativeint.sub !i bits;
-      incr ninstr
+      incr ninstr;
+      fn bits
     end
   done;
   !ninstr
 
 (* Load an integer constant into a register *)
 
-let emit_intconst r n =
-  let nr = Nativeint.lognot n in
+let emit_intconst dst n =
+  let nr = Int32.lognot n in
   if is_immediate n then begin
-    `	mov	{emit_reg r}, #{emit_nativeint n}\n`; 1
+    (* Use movs here to enable 16-bit T1 encoding *)
+    `	movs	{emit_reg dst}, #{emit_int32 n}\n`; 1
   end else if is_immediate nr then begin
-    `	mvn	{emit_reg r}, #{emit_nativeint nr}\n`; 1
+    `	mvn	{emit_reg dst}, #{emit_int32 nr}\n`; 1
+  end else if !arch > ARMv6 then begin
+    let nl = Int32.logand 0xffffl n in
+    let nh = Int32.logand 0xffffl (Int32.shift_right_logical n 16) in
+    if nh = 0l then begin
+      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`; 1
+    end else if Int32.logand nl 0xffl = nl then begin
+      `	movs	{emit_reg dst}, #{emit_int32 nl}\n`;
+      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
+    end else begin
+      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`;
+      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
+    end
   end else begin
     let first = ref true in
     decompose_intconst n
       (fun bits ->
         if !first
-        then `	mov	{emit_reg r}, #{emit_nativeint bits} @ {emit_nativeint n}\n`
-        else `	add	{emit_reg r}, {emit_reg r}, #{emit_nativeint bits}\n`;
+        then `	mov	{emit_reg dst}, #{emit_int32 bits} @ {emit_int32 n}\n`
+        else `	add	{emit_reg dst}, {emit_reg dst}, #{emit_int32 bits}\n`;
         first := false)
   end
 
 (* Adjust sp (up or down) by the given byte amount *)
 
-let emit_stack_adjustment instr n =
-  if n <= 0 then 0 else
-    decompose_intconst (Nativeint.of_int n)
-      (fun bits ->
-        `	{emit_string instr}	sp, sp, #{emit_nativeint bits}\n`)
+let emit_stack_adjustment n =
+  if n = 0 then 0 else begin
+    let instr = if n < 0 then "sub" else "add" in
+    let ninstr = decompose_intconst (Int32.of_int (abs n))
+                   (fun bits ->
+                     `	{emit_string instr}	sp, sp, #{emit_int32 bits}\n`) in
+    cfi_adjust_cfa_offset (-n);
+    ninstr
+  end
+
+(* Deallocate the stack frame before a return or tail call *)
+
+let output_epilogue f =
+  let n = frame_size() in
+  if n > 0 then begin
+    let ninstr = emit_stack_adjustment n in
+    let ninstr = ninstr + f () in
+    (* reset CFA back cause function body may continue *)
+    cfi_adjust_cfa_offset n;
+    ninstr
+  end else
+    f ()
 
 (* Name of current function *)
 let function_name = ref ""
 (* Entry point for tail recursive calls *)
 let tailrec_entry_point = ref 0
-(* Table of symbols referenced *)
-let symbol_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
-(* Table of floating-point literals *)
-let float_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
-(* Total space (in word) occupied by pending literals *)
+(* Pending floating-point literals *)
+let float_literals = ref ([] : (string * label) list)
+(* Pending relative references to the global offset table *)
+let gotrel_literals = ref ([] : (label * label) list)
+(* Pending symbol literals *)
+let symbol_literals = ref ([] : (string * label) list)
+(* Total space (in words) occupied by pending literals *)
 let num_literals = ref 0
 
-(* Label a symbol or float constant *)
-let label_constant tbl s size =
+(* Label a floating-point literal *)
+let float_literal f =
   try
-    Hashtbl.find tbl s
+    List.assoc f !float_literals
   with Not_found ->
     let lbl = new_label() in
-    Hashtbl.add tbl s lbl;
-    num_literals := !num_literals + size;
+    num_literals := !num_literals + 2;
+    float_literals := (f, lbl) :: !float_literals;
     lbl
 
-(* Emit all pending constants *)
-
-let emit_constants () =
-  Hashtbl.iter
-    (fun s lbl ->
-      `{emit_label lbl}:        .word   {emit_symbol s}\n`)
-    symbol_constants;
-  Hashtbl.iter
-    (fun s lbl ->
-      `{emit_label lbl}:        .double {emit_string s}\n`)
-    float_constants;
-  Hashtbl.clear symbol_constants;
-  Hashtbl.clear float_constants;
+(* Label a GOTREL literal *)
+let gotrel_literal l =
+  let lbl = new_label() in
+  num_literals := !num_literals + 1;
+  gotrel_literals := (l, lbl) :: !gotrel_literals;
+  lbl
+
+(* Label a symbol literal *)
+let symbol_literal s =
+  try
+    List.assoc s !symbol_literals
+  with Not_found ->
+    let lbl = new_label() in
+    num_literals := !num_literals + 1;
+    symbol_literals := (s, lbl) :: !symbol_literals;
+    lbl
+
+(* Emit all pending literals *)
+let emit_literals() =
+  if !float_literals <> [] then begin
+    `	.align	3\n`;
+    List.iter
+      (fun (f, lbl) ->
+        `{emit_label lbl}:	.double	{emit_string f}\n`)
+      !float_literals;
+    float_literals := []
+  end;
+  if !symbol_literals <> [] then begin
+    let offset = if !thumb then 4 else 8 in
+    let suffix = if !pic_code then "(GOT)" else "" in
+    `	.align	2\n`;
+    List.iter
+      (fun (l, lbl) ->
+        `{emit_label lbl}:	.word	_GLOBAL_OFFSET_TABLE_-({emit_label l}+{emit_int offset})\n`)
+      !gotrel_literals;
+    List.iter
+      (fun (s, lbl) ->
+        `{emit_label lbl}:	.word	{emit_symbol s}{emit_string suffix}\n`)
+      !symbol_literals;
+    gotrel_literals := [];
+    symbol_literals := []
+  end;
   num_literals := 0
 
+(* Emit code to load the address of a symbol *)
+
+let emit_load_symbol_addr dst s =
+  if !pic_code then begin
+    let lbl_pic = new_label() in
+    let lbl_got = gotrel_literal lbl_pic in
+    let lbl_sym = symbol_literal s in
+    (* Both r3 and r12 are marked as clobbered in PIC mode (cf. proc.ml),
+       so use r12 as temporary scratch register unless the destination is
+       r12, then we use r3 instead. *)
+    let tmp = if dst.loc = Reg 8 (*r12*)
+              then phys_reg 3 (*r3*)
+              else phys_reg 8 (*r12*) in
+    `	ldr	{emit_reg tmp}, {emit_label lbl_got}\n`;
+    `	ldr	{emit_reg dst}, {emit_label lbl_sym}\n`;
+    `{emit_label lbl_pic}:	add	{emit_reg tmp}, pc, {emit_reg tmp}\n`;
+    `	ldr	{emit_reg dst}, [{emit_reg tmp}, {emit_reg dst}] @ {emit_symbol s}\n`;
+    4
+  end else if !arch > ARMv6 && not !Clflags.dlcode && !fastcode_flag then begin
+    `	movw	{emit_reg dst}, #:lower16:{emit_symbol s}\n`;
+    `	movt	{emit_reg dst}, #:upper16:{emit_symbol s}\n`;
+    2
+  end else begin
+    let lbl = symbol_literal s in
+    `	ldr	{emit_reg dst}, {emit_label lbl} @ {emit_symbol s}\n`;
+    1
+  end
+
 (* Output the assembly code for an instruction *)
 
 let emit_instr i =
+    emit_debug_info i.dbg;
     match i.desc with
       Lend -> 0
     | Lop(Imove | Ispill | Ireload) ->
         let src = i.arg.(0) and dst = i.res.(0) in
         if src.loc = dst.loc then 0 else begin
-          match (src, dst) with
-            {loc = Reg rs; typ = Int|Addr}, {loc = Reg rd; typ = Int|Addr} ->
-              `	mov	{emit_reg dst}, {emit_reg src}\n`; 1
-          | {loc = Reg rs; typ = Int|Addr}, {loc = Stack sd} ->
-              `	str	{emit_reg src}, {emit_stack dst}\n`; 1
-          | {loc = Stack ss; typ = Int|Addr}, {loc = Reg rd} ->
-              `	ldr	{emit_reg dst}, {emit_stack src}\n`; 1
+          begin match (src, dst) with
+            {loc = Reg _; typ = Float}, {loc = Reg _} ->
+              `	fcpyd	{emit_reg dst}, {emit_reg src}\n`
+          | {loc = Reg _}, {loc = Reg _} ->
+              `	mov	{emit_reg dst}, {emit_reg src}\n`
+          | {loc = Reg _; typ = Float}, _ ->
+              `	fstd	{emit_reg src}, {emit_stack dst}\n`
+          | {loc = Reg _}, _ ->
+              `	str	{emit_reg src}, {emit_stack dst}\n`
+          | {typ = Float}, _ ->
+              `	fldd	{emit_reg dst}, {emit_stack src}\n`
           | _ ->
-              assert false
+              `	ldr	{emit_reg dst}, {emit_stack src}\n`
+          end; 1
         end
     | Lop(Iconst_int n) ->
-        emit_intconst i.res.(0) n
-    | Lop(Iconst_float s) ->
-        let bits = Int64.bits_of_float (float_of_string s) in
-        let high_bits = Int64.to_nativeint (Int64.shift_right_logical bits 32)
-        and low_bits = Int64.to_nativeint bits in
-        if is_immediate low_bits && is_immediate high_bits then begin
-          `	mov	{emit_reg i.res.(0)}, #{emit_nativeint low_bits} @ {emit_string s}\n`;
-          `	mov	{emit_reg i.res.(1)}, #{emit_nativeint high_bits}\n`;
-          2
+        emit_intconst i.res.(0) (Nativeint.to_int32 n)
+    | Lop(Iconst_float f) when !fpu = Soft ->
+        `	@ {emit_string f}\n`;
+        let bits = Int64.bits_of_float (float_of_string f) in
+        let high_bits = Int64.to_int32 (Int64.shift_right_logical bits 32)
+        and low_bits = Int64.to_int32 bits in
+        if is_immediate low_bits || is_immediate high_bits then begin
+          let ninstr_low = emit_intconst i.res.(0) low_bits
+          and ninstr_high = emit_intconst i.res.(1) high_bits in
+          ninstr_low + ninstr_high
         end else begin
-          let lbl = label_constant float_constants s 2 in
-          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string s}\n`;
+          let lbl = float_literal f in
+          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl}\n`;
           `	ldr	{emit_reg i.res.(1)}, {emit_label lbl} + 4\n`;
           2
         end
+    | Lop(Iconst_float f) ->
+        let encode imm =
+          let sg = Int64.to_int (Int64.shift_right_logical imm 63) in
+          let ex = Int64.to_int (Int64.shift_right_logical imm 52) in
+          let ex = (ex land 0x7ff) - 1023 in
+          let mn = Int64.logand imm 0xfffffffffffffL in
+          if Int64.logand mn 0xffffffffffffL <> 0L || ex < -3 || ex > 4
+          then
+            None
+          else begin
+            let mn = Int64.to_int (Int64.shift_right_logical mn 48) in
+            if mn land 0x0f <> mn then
+              None
+            else
+              let ex = ((ex + 3) land 0x07) lxor 0x04 in
+              Some((sg lsl 7) lor (ex lsl 4) lor mn)
+          end in
+        begin match encode (Int64.bits_of_float (float_of_string f)) with
+          None ->
+            let lbl = float_literal f in
+            `	fldd	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string f}\n`
+        | Some imm8 ->
+            `	fconstd	{emit_reg i.res.(0)}, #{emit_int imm8} @ {emit_string f}\n`
+        end; 1
     | Lop(Iconst_symbol s) ->
-        let lbl = label_constant symbol_constants s 1 in
-        `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_symbol s}\n`; 1
+        emit_load_symbol_addr i.res.(0) s
     | Lop(Icall_ind) ->
-        `	mov	lr, pc\n`;
-        `{record_frame i.live}  bx	{emit_reg i.arg.(0)}\n`; 2
+        if !arch >= ARMv5 then begin
+          `	blx	{emit_reg i.arg.(0)}\n`;
+          `{record_frame i.live i.dbg}\n`; 1
+        end else begin
+          `	mov	lr, pc\n`;
+          `	bx	{emit_reg i.arg.(0)}\n`;
+          `{record_frame i.live i.dbg}\n`; 2
+        end
     | Lop(Icall_imm s) ->
-        `{record_frame i.live}  bl      {emit_symbol s}\n`; 1
+        `	{emit_call s}\n`;
+        `{record_frame i.live i.dbg}\n`; 1
     | Lop(Itailcall_ind) ->
-        let n = frame_size() in
-        if !contains_calls then
-          `	ldr	lr, [sp, #{emit_int (n-4)}]\n`;
-        let ninstr = emit_stack_adjustment "add" n in
-        `	bx	{emit_reg i.arg.(0)}\n`;
-        2 + ninstr
+        output_epilogue begin fun () ->
+          if !contains_calls then
+            `	ldr	lr, [sp, #{emit_int (-4)}]\n`;
+          `	bx	{emit_reg i.arg.(0)}\n`; 2
+        end
     | Lop(Itailcall_imm s) ->
         if s = !function_name then begin
           `	b	{emit_label !tailrec_entry_point}\n`; 1
         end else begin
-          let n = frame_size() in
-          if !contains_calls then
-            `	ldr	lr, [sp, #{emit_int (n-4)}]\n`;
-          let ninstr = emit_stack_adjustment "add" n in
-          `	b	{emit_symbol s}\n`;
-          2 + ninstr
-        end
-    | Lop(Iextcall(s, alloc)) ->
-        if alloc then begin
-          let lbl = label_constant symbol_constants s 1 in
-          `	ldr	r12, {emit_label lbl} @ {emit_symbol s}\n`;
-          `{record_frame i.live}	bl	caml_c_call\n`; 2
-        end else begin
-          `	bl	{emit_symbol s}\n`; 1
+          output_epilogue begin fun () ->
+            if !contains_calls then
+              `	ldr	lr, [sp, #{emit_int (-4)}]\n`;
+            `	{emit_jump s}\n`; 2
+          end
         end
+    | Lop(Iextcall(s, false)) ->
+        `	{emit_call s}\n`; 1
+    | Lop(Iextcall(s, true)) ->
+        let ninstr = emit_load_symbol_addr (phys_reg 7 (* r7 *)) s in
+        `	{emit_call "caml_c_call"}\n`;
+        `{record_frame i.live i.dbg}\n`;
+        1 + ninstr
     | Lop(Istackoffset n) ->
         assert (n mod 8 = 0);
-        let ninstr =
-          if n >= 0
-          then emit_stack_adjustment "sub" n
-          else emit_stack_adjustment "add" (-n) in
+        let ninstr = emit_stack_adjustment (-n) in
         stack_offset := !stack_offset + n;
         ninstr
-    | Lop(Iload((Double | Double_u), addr)) ->
-        let addr' = offset_addressing addr 4 in
-        if i.res.(0).loc <> i.arg.(0).loc then begin
-          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
-          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
-        end else begin
-          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
-          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
-        end;
-        2
+    | Lop(Iload(Single, addr)) when !fpu >= VFPv3_D16 ->
+        `	flds	s14, {emit_addressing addr i.arg 0}\n`;
+        `	fcvtds	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
+        (* Use LDM or LDRD if possible *)
+        begin match i.res.(0), i.res.(1), addr with
+          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
+          when rt < rt2 ->
+            `	ldm	{emit_reg i.arg.(0)}, \{{emit_reg i.res.(0)}, {emit_reg i.res.(1)}}\n`; 1
+        | {loc = Reg rt}, {loc = Reg rt2}, addr
+          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
+            `	ldrd	{emit_reg i.res.(0)}, {emit_reg i.res.(1)}, {emit_addressing addr i.arg 0}\n`; 1
+        | _ ->
+            let addr' = offset_addressing addr 4 in
+            if i.res.(0).loc <> i.arg.(0).loc then begin
+              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
+              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
+            end else begin
+              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
+              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
+            end; 2
+        end
     | Lop(Iload(size, addr)) ->
         let r = i.res.(0) in
         let instr =
@@ -343,65 +498,114 @@ let emit_instr i =
           | Byte_signed -> "ldrsb"
           | Sixteen_unsigned -> "ldrh"
           | Sixteen_signed -> "ldrsh"
+          | Double
+          | Double_u -> "fldd"
           | _ (* 32-bit quantities *) -> "ldr" in
-        `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 0}\n`;
-        1
-    | Lop(Istore((Double | Double_u), addr)) ->
-        let addr' = offset_addressing addr 4 in
-        `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
-        `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`;
-        2
+        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 0}\n`; 1
+    | Lop(Istore(Single, addr)) when !fpu >= VFPv3_D16 ->
+        `	fcvtsd	s14, {emit_reg i.arg.(0)}\n`;
+        `	fsts	s14, {emit_addressing addr i.arg 1}\n`; 2
+    | Lop(Istore((Double | Double_u), addr)) when !fpu = Soft ->
+        (* Use STM or STRD if possible *)
+        begin match i.arg.(0), i.arg.(1), addr with
+          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
+          when rt < rt2 ->
+            `	stm	{emit_reg i.arg.(2)}, \{{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}}\n`; 1
+        | {loc = Reg rt}, {loc = Reg rt2}, addr
+          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
+            `	strd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_addressing addr i.arg 2}\n`; 1
+        | _ ->
+            let addr' = offset_addressing addr 4 in
+            `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
+            `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`; 2
+        end
     | Lop(Istore(size, addr)) ->
         let r = i.arg.(0) in
         let instr =
           match size with
-            Byte_unsigned | Byte_signed -> "strb"
-          | Sixteen_unsigned | Sixteen_signed -> "strh"
+            Byte_unsigned
+          | Byte_signed -> "strb"
+          | Sixteen_unsigned
+          | Sixteen_signed -> "strh"
+          | Double
+          | Double_u -> "fstd"
           | _ (* 32-bit quantities *) -> "str" in
-        `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 1}\n`;
-        1
+        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 1}\n`; 1
     | Lop(Ialloc n) ->
+        let lbl_frame = record_frame_label i.live i.dbg in
         if !fastcode_flag then begin
-          let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in
-          `	sub	alloc_ptr, alloc_ptr, r12\n`;
+          let lbl_redo = new_label() in
+          `{emit_label lbl_redo}:`;
+          let ninstr = decompose_intconst
+                         (Int32.of_int n)
+                         (fun i ->
+                           `	sub	alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in
           `	cmp	alloc_ptr, alloc_limit\n`;
-          `{record_frame i.live}	blcc    caml_call_gc\n`;
           `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
-          4 + ni
-        end else if n = 8 || n = 12 || n = 16 then begin
-          `{record_frame i.live}	bl	caml_alloc{emit_int ((n-4)/4)}\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`; 2
+          let lbl_call_gc = new_label() in
+          `	bcc	{emit_label lbl_call_gc}\n`;
+          call_gc_sites :=
+            { gc_lbl = lbl_call_gc;
+              gc_return_lbl = lbl_redo;
+              gc_frame_lbl = lbl_frame } :: !call_gc_sites;
+          3 + ninstr
         end else begin
-          let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in
-          `{record_frame i.live}	bl	caml_allocN\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
-          2 + ni
+          let ninstr =
+            begin match n with
+               8 -> `	{emit_call "caml_alloc1"}\n`; 1
+            | 12 -> `	{emit_call "caml_alloc2"}\n`; 1
+            | 16 -> `	{emit_call "caml_alloc3"}\n`; 1
+            |  _ -> let ninstr = emit_intconst (phys_reg 7) (Int32.of_int n) in
+                    `	{emit_call "caml_allocN"}\n`; 1 + ninstr
+            end in
+          `{emit_label lbl_frame}:	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
+          1 + ninstr
         end
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         let shift = name_for_shift_operation op in
         `	mov	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} {emit_reg i.arg.(1)}\n`; 1
     | Lop(Iintop(Icomp cmp)) ->
-        let comp = name_for_comparison cmp in
+        let compthen = name_for_comparison cmp in
+        let compelse = name_for_comparison (negate_integer_comparison cmp) in
         `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
-        `	mov	{emit_reg i.res.(0)}, #0\n`;
-        `	mov{emit_string	comp}   {emit_reg i.res.(0)}, #1\n`; 3
-    | Lop(Iintop(Icheckbound)) ->
+        `	ite	{emit_string compthen}\n`;
+        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
+        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
+    | Lop(Iintop_imm(Icomp cmp, n)) ->
+        let compthen = name_for_comparison cmp in
+        let compelse = name_for_comparison (negate_integer_comparison cmp) in
+        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
+        `	ite	{emit_string compthen}\n`;
+        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
+        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
+    | Lop(Iintop Icheckbound) ->
+        let lbl = bound_error_label i.dbg in
         `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
-        `	blls	caml_ml_array_bound_error\n`; 2
+        `	bls	{emit_label lbl}\n`; 2
+    | Lop(Iintop_imm(Icheckbound, n)) ->
+        let lbl = bound_error_label i.dbg in
+        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
+        `	bls	{emit_label lbl}\n`; 2
+    | Lop(Ispecific(Ishiftcheckbound shift)) ->
+        let lbl = bound_error_label i.dbg in
+        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
+        `	bcs	{emit_label lbl}\n`; 2
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
+        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
     | Lop(Iintop_imm(Idiv, n)) -> (* n is a power of 2 *)
         let l = Misc.log2 n in
         let r = i.res.(0) in
         `	movs	{emit_reg r}, {emit_reg i.arg.(0)}\n`;
-        if n <= 256 then
+        if n <= 256 then begin
+          `	it	lt\n`;
           `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n`
-        else begin
+        end else begin
+          `	itt	lt\n`;
           `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
           `	sublt	{emit_reg r}, {emit_reg r}, #1\n`
         end;
-        `	mov	{emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 4
+        `	mov	{emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5
     | Lop(Iintop_imm(Imod, n)) -> (* n is a power of 2 *)
         let l = Misc.log2 n in
         let a = i.arg.(0) in
@@ -412,47 +616,78 @@ let emit_instr i =
         `	mov	{emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`;
         `	bpl	{emit_label lbl}\n`;
         `	cmp	{emit_reg r}, #0\n`;
+        `	it	ne\n`;
         `	subne	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-        `{emit_label lbl}:\n`; 6
+        `{emit_label lbl}:\n`; 7
     | Lop(Iintop_imm((Ilsl | Ilsr | Iasr as op), n)) ->
         let shift = name_for_shift_operation op in
         `	mov	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} #{emit_int n}\n`; 1
-    | Lop(Iintop_imm(Icomp cmp, n)) ->
-        let comp = name_for_comparison cmp in
-        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
-        `	mov	{emit_reg i.res.(0)}, #0\n`;
-        `	mov{emit_string	comp}   {emit_reg i.res.(0)}, #1\n`; 3
-    | Lop(Iintop_imm(Icheckbound, n)) ->
-        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
-        `	blls	caml_ml_array_bound_error\n`; 2
     | Lop(Iintop_imm(op, n)) ->
         let instr = name_for_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
-    | Lop(Inegf) -> (* argument and result in (r0, r1) *)
-        `	eor     r1, r1, #0x80000000\n`; 1
-    | Lop(Iabsf) -> (* argument and result in (r0, r1) *)
-        `	bic     r1, r1, #0x80000000\n`; 1
-    | Lop(Ifloatofint | Iintoffloat | Iaddf | Isubf | Imulf | Idivf) ->
-        assert false
+        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
+    | Lop(Iabsf | Inegf as op) when !fpu = Soft ->
+        let instr = (match op with
+                       Iabsf -> "bic"
+                     | Inegf -> "eor"
+                     | _     -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(1)}, {emit_reg i.arg.(1)}, #0x80000000\n`; 1
+    | Lop(Iabsf | Inegf | Ispecific Isqrtf as op) ->
+        let instr = (match op with
+                       Iabsf            -> "fabsd"
+                     | Inegf            -> "fnegd"
+                     | Ispecific Isqrtf -> "fsqrtd"
+                     | _                -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1
+    | Lop(Ifloatofint) ->
+        `	fmsr	s14, {emit_reg i.arg.(0)}\n`;
+        `	fsitod	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iintoffloat) ->
+        `	ftosizd	s14, {emit_reg i.arg.(0)}\n`;
+        `	fmrs	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf as op) ->
+        let instr = (match op with
+                       Iaddf              -> "faddd"
+                     | Isubf              -> "fsubd"
+                     | Imulf              -> "fmuld"
+                     | Idivf              -> "fdivd"
+                     | Ispecific Inegmulf -> "fnmuld"
+                     | _                  -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
+        1
+    | Lop(Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf as op)) ->
+        let instr = (match op with
+                       Imuladdf    -> "fmacd"
+                     | Inegmuladdf -> "fnmacd"
+                     | Imulsubf    -> "fmscd"
+                     | Inegmulsubf -> "fnmscd"
+                     | _ -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`;
+        1
     | Lop(Ispecific(Ishiftarith(op, shift))) ->
-        let instr = name_for_shift_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
+        let instr = (match op with
+                       Ishiftadd    -> "add"
+                     | Ishiftsub    -> "sub"
+                     | Ishiftsubrev -> "rsb") in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
         if shift >= 0
         then `, lsl #{emit_int shift}\n`
         else `, asr #{emit_int (-shift)}\n`;
         1
-    | Lop(Ispecific(Ishiftcheckbound shift)) ->
-        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
-        `	blcs	caml_ml_array_bound_error\n`; 2
     | Lop(Ispecific(Irevsubimm n)) ->
         `	rsb	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
+    | Lop(Ispecific(Imuladd | Imulsub as op)) ->
+        let instr = (match op with
+                       Imuladd -> "mla"
+                     | Imulsub -> "mls"
+                     | _ -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1
     | Lreloadretaddr ->
         let n = frame_size() in
         `	ldr	lr, [sp, #{emit_int(n-4)}]\n`; 1
     | Lreturn ->
-        let ninstr = emit_stack_adjustment "add" (frame_size()) in
-        `	bx	lr\n`;
-        ninstr + 1
+        output_epilogue begin fun () ->
+          `	bx	lr\n`; 1
+        end
     | Llabel lbl ->
         `{emit_label lbl}:\n`; 0
     | Lbranch lbl ->
@@ -461,29 +696,41 @@ let emit_instr i =
         begin match tst with
           Itruetest ->
             `	cmp	{emit_reg i.arg.(0)}, #0\n`;
-            `	bne	{emit_label lbl}\n`
+            `	bne	{emit_label lbl}\n`; 2
         | Ifalsetest ->
             `	cmp	{emit_reg i.arg.(0)}, #0\n`;
-            `	beq	{emit_label lbl}\n`
+            `	beq	{emit_label lbl}\n`; 2
         | Iinttest cmp ->
             `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string comp}	{emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`; 2
         | Iinttest_imm(cmp, n) ->
             `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string comp}	{emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`; 2
         | Ifloattest(cmp, neg) ->
-            assert false
+            let comp = (match (cmp, neg) with
+                          (Ceq, false) | (Cne, true) -> "eq"
+                        | (Cne, false) | (Ceq, true) -> "ne"
+                        | (Clt, false) -> "cc"
+                        | (Clt, true)  -> "cs"
+                        | (Cle, false) -> "ls"
+                        | (Cle, true)  -> "hi"
+                        | (Cgt, false) -> "gt"
+                        | (Cgt, true)  -> "le"
+                        | (Cge, false) -> "ge"
+                        | (Cge, true)  -> "lt") in
+            `	fcmpd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
+            `	fmstat\n`;
+            `	b{emit_string comp}	{emit_label lbl}\n`; 3
         | Ioddtest ->
             `	tst	{emit_reg i.arg.(0)}, #1\n`;
-            `	bne	{emit_label lbl}\n`
+            `	bne	{emit_label lbl}\n`; 2
         | Ieventest ->
             `	tst	{emit_reg i.arg.(0)}, #1\n`;
-            `	beq	{emit_label lbl}\n`
-        end;
-        2
-  | Lcondbranch3(lbl0, lbl1, lbl2) ->
+            `	beq	{emit_label lbl}\n`; 2
+        end
+    | Lcondbranch3(lbl0, lbl1, lbl2) ->
         `	cmp	{emit_reg i.arg.(0)}, #1\n`;
         begin match lbl0 with
           None -> ()
@@ -498,108 +745,151 @@ let emit_instr i =
         | Some lbl -> `	bgt	{emit_label lbl}\n`
         end;
         4
-  | Lswitch jumptbl ->
-        `	ldr	pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`;
-        `	mov	r0, r0\n`;      (* nop *)
-        for i = 0 to Array.length jumptbl - 1 do
-          `	.word	{emit_label jumptbl.(i)}\n`
-        done;
+    | Lswitch jumptbl ->
+        if !arch > ARMv6 && !thumb then begin
+          (* The Thumb-2 TBH instruction supports only forward branches,
+             so we need to generate appropriate trampolines for all labels
+             that appear before this switch instruction (PR#5623) *)
+          let tramtbl = Array.copy jumptbl in
+          `	tbh	[pc, {emit_reg i.arg.(0)}, lsl #1]\n`;
+          for j = 0 to Array.length tramtbl - 1 do
+            let rec label i =
+              match i.desc with
+                Lend -> new_label()
+              | Llabel lbl when lbl = tramtbl.(j) -> lbl
+              | _ -> label i.next in
+            tramtbl.(j) <- label i.next;
+            `	.short	({emit_label tramtbl.(j)}-.)/2+{emit_int j}\n`
+          done;
+          (* Generate the necessary trampolines *)
+          for j = 0 to Array.length tramtbl - 1 do
+            if tramtbl.(j) <> jumptbl.(j) then
+              `{emit_label tramtbl.(j)}:	b	{emit_label jumptbl.(j)}\n`
+          done
+        end else if not !pic_code then begin
+          `	ldr	pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`;
+          `	nop\n`;
+          for j = 0 to Array.length jumptbl - 1 do
+            `	.word	{emit_label jumptbl.(j)}\n`
+          done
+        end else begin
+          (* Slightly slower, but position-independent *)
+          `	add	pc, pc, {emit_reg i.arg.(0)}, lsl #2\n`;
+          `	nop\n`;
+          for j = 0 to Array.length jumptbl - 1 do
+            `	b	{emit_label jumptbl.(j)}\n`
+          done
+        end;
         2 + Array.length jumptbl
     | Lsetuptrap lbl ->
         `	bl	{emit_label lbl}\n`; 1
     | Lpushtrap ->
         stack_offset := !stack_offset + 8;
-        `	stmfd	sp!, \{trap_ptr, lr}\n`;
+        `	push	\{trap_ptr, lr}\n`;
+        cfi_adjust_cfa_offset 8;
         `	mov	trap_ptr, sp\n`; 2
     | Lpoptrap ->
-        `	ldmfd	sp!, \{trap_ptr, lr}\n`;
+        `	pop	\{trap_ptr, lr}\n`;
+        cfi_adjust_cfa_offset (-8);
         stack_offset := !stack_offset - 8; 1
     | Lraise ->
-        `	mov	sp, trap_ptr\n`;
-        `	ldmfd	sp!, \{trap_ptr, pc}\n`; 2
+        if !Clflags.debug then begin
+          `	{emit_call "caml_raise_exn"}\n`;
+          `{record_frame Reg.Set.empty i.dbg}\n`; 1
+        end else begin
+          `	mov	sp, trap_ptr\n`;
+          `	pop	\{trap_ptr, pc}\n`; 2
+        end
 
 (* Emission of an instruction sequence *)
 
-let no_fallthrough = function
-    Lop(Itailcall_ind | Itailcall_imm _) -> true
-  | Lreturn -> true
-  | Lbranch _ -> true
-  | Lswitch _ -> true
-  | Lraise -> true
-  | _ -> false
-
 let rec emit_all ninstr i =
   if i.desc = Lend then () else begin
     let n = emit_instr i in
     let ninstr' = ninstr + n in
-    let limit = 511 - !num_literals in
-    if ninstr' >= limit - 64 && no_fallthrough i.desc then begin
-      emit_constants();
+    (* fldd can address up to +/-1KB, ldr can address up to +/-4KB *)
+    let limit = (if !fpu >= VFPv3_D16 && !float_literals <> []
+                 then 127
+                 else 511) in
+    let limit = limit - !num_literals in
+    if ninstr' >= limit - 64 && not(has_fallthrough i.desc) then begin
+      emit_literals();
       emit_all 0 i.next
-    end else
-    if ninstr' >= limit then begin
+    end else if !num_literals != 0 && ninstr' >= limit then begin
       let lbl = new_label() in
       `	b	{emit_label lbl}\n`;
-      emit_constants();
+      emit_literals();
       `{emit_label lbl}:\n`;
       emit_all 0 i.next
     end else
       emit_all ninstr' i.next
   end
 
+(* Emission of the profiling prelude *)
+
+let emit_profile() =
+  match Config.system with
+    "linux_eabi" | "linux_eabihf" ->
+      `	push	\{lr}\n`;
+      `	{emit_call "__gnu_mcount_nc"}\n`
+  | _ -> ()
+
 (* Emission of a function declaration *)
 
 let fundecl fundecl =
   function_name := fundecl.fun_name;
   fastcode_flag := fundecl.fun_fast;
   tailrec_entry_point := new_label();
+  float_literals := [];
+  gotrel_literals := [];
+  symbol_literals := [];
   stack_offset := 0;
-  Hashtbl.clear symbol_constants;
-  Hashtbl.clear float_constants;
+  call_gc_sites := [];
+  bound_error_sites := [];
   `	.text\n`;
   `	.align	2\n`;
-  `	.global	{emit_symbol fundecl.fun_name}\n`;
+  `	.globl	{emit_symbol fundecl.fun_name}\n`;
+  if !arch > ARMv6 && !thumb then
+    `	.thumb\n`
+  else
+    `	.arm\n`;
   `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
   `{emit_symbol fundecl.fun_name}:\n`;
+  emit_debug_info fundecl.fun_dbg;
+  cfi_startproc();
+  if !Clflags.gprofile then emit_profile();
   let n = frame_size() in
-  ignore(emit_stack_adjustment "sub" n);
-  if !contains_calls then
-    `	str	lr, [sp, #{emit_int(n - 4)}]\n`;
+  if n > 0 then begin
+    ignore(emit_stack_adjustment (-n));
+    if !contains_calls then
+      `	str	lr, [sp, #{emit_int(n - 4)}]\n`
+  end;
   `{emit_label !tailrec_entry_point}:\n`;
   emit_all 0 fundecl.fun_body;
-  emit_constants()
+  emit_literals();
+  List.iter emit_call_gc !call_gc_sites;
+  List.iter emit_call_bound_error !bound_error_sites;
+  cfi_endproc();
+  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
+  `	.size	{emit_symbol fundecl.fun_name}, .-{emit_symbol fundecl.fun_name}\n`
 
 (* Emission of data *)
 
 let emit_item = function
-    Cglobal_symbol s ->
-      `	.global	{emit_symbol s}\n`;
-  | Cdefine_symbol s ->
-      `{emit_symbol s}:\n`
-  | Cdefine_label lbl ->
-      `{emit_data_label lbl}:\n`
-  | Cint8 n ->
-      `	.byte	{emit_int n}\n`
-  | Cint16 n ->
-      `	.short	{emit_int n}\n`
-  | Cint32 n ->
-      `	.word	{emit_nativeint	n}\n`
-  | Cint n ->
-      `	.word	{emit_nativeint n}\n`
-  | Csingle f ->
-      emit_float32_directive ".long" f
-  | Cdouble f ->
-      emit_float64_split_directive ".long" f
-  | Csymbol_address s ->
-      `	.word	{emit_symbol s}\n`
-  | Clabel_address lbl ->
-      `	.word	{emit_data_label lbl}\n`
-  | Cstring s ->
-      emit_string_directive "	.ascii  " s
-  | Cskip n ->
-      if n > 0 then `	.space	{emit_int n}\n`
-  | Calign n ->
-      `	.align	{emit_int(Misc.log2 n)}\n`
+    Cglobal_symbol s -> `	.globl	{emit_symbol s}\n`;
+  | Cdefine_symbol s -> `{emit_symbol s}:\n`
+  | Cdefine_label lbl -> `{emit_data_label lbl}:\n`
+  | Cint8 n -> `	.byte	{emit_int n}\n`
+  | Cint16 n -> `	.short	{emit_int n}\n`
+  | Cint32 n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
+  | Cint n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
+  | Csingle f -> `	.single	{emit_string f}\n`
+  | Cdouble f -> `	.double	{emit_string f}\n`
+  | Csymbol_address s -> `	.word	{emit_symbol s}\n`
+  | Clabel_address lbl -> `	.word	{emit_data_label lbl}\n`
+  | Cstring s -> emit_string_directive "	.ascii  " s
+  | Cskip n -> if n > 0 then `	.space	{emit_int n}\n`
+  | Calign n -> `	.align	{emit_int(Misc.log2 n)}\n`
 
 let data l =
   `	.data\n`;
@@ -608,32 +898,63 @@ let data l =
 (* Beginning / end of an assembly file *)
 
 let begin_assembly() =
-  `trap_ptr     .req    r11\n`;
-  `alloc_ptr    .req    r8\n`;
-  `alloc_limit  .req    r10\n`;
+  reset_debug_info();
+  `	.syntax	unified\n`;
+  begin match !arch with
+  | ARMv4   -> `	.arch	armv4t\n`
+  | ARMv5   -> `	.arch	armv5t\n`
+  | ARMv5TE -> `	.arch	armv5te\n`
+  | ARMv6   -> `	.arch	armv6\n`
+  | ARMv6T2 -> `	.arch	armv6t2\n`
+  | ARMv7   -> `	.arch	armv7-a\n`
+  end;
+  begin match !fpu with
+    Soft      -> `	.fpu	softvfp\n`
+  | VFPv3_D16 -> `	.fpu	vfpv3-d16\n`
+  | VFPv3     -> `	.fpu	vfpv3\n`
+  end;
+  `trap_ptr	.req	r8\n`;
+  `alloc_ptr	.req	r10\n`;
+  `alloc_limit	.req	r11\n`;
   let lbl_begin = Compilenv.make_symbol (Some "data_begin") in
   `	.data\n`;
-  `	.global	{emit_symbol lbl_begin}\n`;
+  `	.globl	{emit_symbol lbl_begin}\n`;
   `{emit_symbol lbl_begin}:\n`;
   let lbl_begin = Compilenv.make_symbol (Some "code_begin") in
   `	.text\n`;
-  `	.global	{emit_symbol lbl_begin}\n`;
+  `	.globl	{emit_symbol lbl_begin}\n`;
   `{emit_symbol lbl_begin}:\n`
 
 let end_assembly () =
   let lbl_end = Compilenv.make_symbol (Some "code_end") in
   `	.text\n`;
-  `	.global	{emit_symbol lbl_end}\n`;
+  `	.globl	{emit_symbol lbl_end}\n`;
   `{emit_symbol lbl_end}:\n`;
   let lbl_end = Compilenv.make_symbol (Some "data_end") in
   `	.data\n`;
-  `	.global	{emit_symbol lbl_end}\n`;
+  `	.globl	{emit_symbol lbl_end}\n`;
   `{emit_symbol lbl_end}:\n`;
-  `	.word	0\n`;
+  `	.long	0\n`;
   let lbl = Compilenv.make_symbol (Some "frametable") in
-  `	.data\n`;
-  `	.global	{emit_symbol lbl}\n`;
+  `	.globl	{emit_symbol lbl}\n`;
   `{emit_symbol lbl}:\n`;
-  `	.word	{emit_int (List.length !frame_descriptors)}\n`;
-  List.iter emit_frame !frame_descriptors;
-  frame_descriptors := []
+  emit_frames
+    { efa_label = (fun lbl ->
+                       `	.type	{emit_label lbl}, %function\n`;
+                       `	.word	{emit_label lbl}\n`);
+      efa_16 = (fun n -> `	.short	{emit_int n}\n`);
+      efa_32 = (fun n -> `	.long	{emit_int32 n}\n`);
+      efa_word = (fun n -> `	.word	{emit_int n}\n`);
+      efa_align = (fun n -> `	.align	{emit_int(Misc.log2 n)}\n`);
+      efa_label_rel = (fun lbl ofs ->
+                           `	.word	{emit_label lbl} - . + {emit_int32 ofs}\n`);
+      efa_def_label = (fun lbl -> `{emit_label lbl}:\n`);
+      efa_string = (fun s -> emit_string_directive "	.asciz	" s) };
+  `	.type	{emit_symbol lbl}, %object\n`;
+  `	.size	{emit_symbol lbl}, .-{emit_symbol lbl}\n`;
+  begin match Config.system with
+    "linux_eabihf" | "linux_eabi" ->
+      (* Mark stack as non-executable *)
+      `	.section	.note.GNU-stack,\"\",%progbits\n`
+  | _ -> ()
+  end