1 files changed, 2359 insertions, 220 deletions
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp
index 06873cd709..d7e8f70d83 100644
--- a/erts/emulator/beam/jit/arm/instr_bs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_bs.cpp
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2020-2022. All Rights Reserved.
+ * Copyright Ericsson AB 2020-2023. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
  */
 
 #include "beam_asm.hpp"
+#include <numeric>
 
 extern "C"
 {
@@ -29,8 +30,6 @@ extern "C"
 
 /* Clobbers TMP1+TMP2
  *
- * If max_size > 0, we jump to the fail label when Size > max_size
- *
  * Returns -1 when the field check always fails, 1 if it may fail, and 0 if it
  * never fails. */
 int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size,
@@ -55,18 +54,40 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size,
         return -1;
     } else {
         auto size_reg = load_source(Size, TMP2);
+        bool can_fail = true;
+
+        if (always_small(Size)) {
+            auto [min, max] = getClampedRange(Size);
+            can_fail =
+                    !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0);
+        }
 
         /* Negating the tag bits lets us guard against non-smalls, negative
          * numbers, and overflow with a single `tst` instruction. */
         ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK);
         ASSERT(unit <= 1024);
 
-        a.eor(out, size_reg.reg, imm(_TAG_IMMED1_SMALL));
-        a.tst(out, imm(0xFFF0000000000000UL | _TAG_IMMED1_MASK));
+        if (!can_fail) {
+            comment("simplified segment size checks because "
+                    "the types are known");
+        }
+
+        if (unit == 1 && !can_fail) {
+            a.lsr(out, size_reg.reg, imm(_TAG_IMMED1_SIZE));
+        } else {
+            a.eor(out, size_reg.reg, imm(_TAG_IMMED1_SMALL));
+        }
+
+        if (can_fail) {
+            a.tst(out, imm(0xFFF0000000000000UL | _TAG_IMMED1_MASK));
+        }
 
         if (unit == 0) {
             /* Silly but legal.*/
             mov_imm(out, 0);
+        } else if (unit == 1 && !can_fail) {
+            /* The result is already in the out register. */
+            ;
         } else if (Support::isPowerOf2(unit)) {
             int trailing_bits = Support::ctz<Eterm>(unit);
 
@@ -88,9 +109,11 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size,
             a.mul(out, out, TMP1);
         }
 
-        a.b_ne(fail);
+        if (can_fail) {
+            a.b_ne(fail);
+        }
 
-        return 1;
+        return can_fail;
     }
 }
 
@@ -102,7 +125,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size,
     mov_arg(ARG5, Heap);
     mov_arg(ARG6, Live);
 
-    emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get());
 
     a.mov(ARG1, c_p);
@@ -110,7 +133,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size,
     load_erl_bits_state(ARG3);
     runtime_call<6>(beam_jit_bs_init);
 
-    emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get());
 
     mov_arg(Dst, ARG1);
@@ -148,7 +171,7 @@ void BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size,
         mov_arg(ARG5, Heap);
         mov_arg(ARG6, Live);
 
-        emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get());
 
         a.mov(ARG1, c_p);
@@ -156,7 +179,7 @@ void BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size,
         load_erl_bits_state(ARG3);
         runtime_call<6>(beam_jit_bs_init);
 
-        emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get());
 
         mov_arg(Dst, ARG1);
@@ -207,7 +230,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits,
     mov_arg(ARG5, Alloc);
     mov_arg(ARG6, Live);
 
-    emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get());
 
     a.mov(ARG1, c_p);
@@ -215,7 +238,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits,
     load_erl_bits_state(ARG3);
     runtime_call<6>(beam_jit_bs_init_bits);
 
-    emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get());
 
     mov_arg(Dst, ARG1);
@@ -248,7 +271,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap(
         mov_arg(ARG5, Alloc);
         mov_arg(ARG6, Live);
 
-        emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get());
 
         a.mov(ARG1, c_p);
@@ -256,7 +279,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap(
         load_erl_bits_state(ARG3);
         runtime_call<6>(beam_jit_bs_init_bits);
 
-        emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get());
 
         mov_arg(Dst, ARG1);
@@ -541,19 +564,18 @@ void BeamModuleAssembler::emit_i_bs_start_match3(const ArgRegister &Src,
 
     a.bind(is_binary);
     {
-        /* Src is not guaranteed to be inside the live range, so we need to
-         * stash it during GC. */
-        emit_gc_test_preserve(ArgVal(ArgVal::Word, ERL_BIN_MATCHSTATE_SIZE(0)),
+        emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)),
                               Live,
+                              Src,
                               ARG2);
 
-        emit_enter_runtime<Update::eStack | Update::eHeap>(Live.get());
+        emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         a.mov(ARG1, c_p);
         /* ARG2 was set above */
         runtime_call<2>(erts_bs_start_match_3);
 
-        emit_leave_runtime<Update::eStack | Update::eHeap>(Live.get());
+        emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         a.add(ARG2, ARG1, imm(TAG_PRIMARY_BOXED));
     }
@@ -585,9 +607,8 @@ void BeamModuleAssembler::emit_i_bs_match_string(const ArgRegister &Ctx,
         a.and_(ARG4, TMP2, imm(7));
 
         /* ARG3 = mb->base + (mb->offset >> 3) */
-        a.lsr(TMP2, TMP2, imm(3));
         a.ldur(TMP1, emit_boxed_val(ctx_reg.reg, base_offset));
-        a.add(ARG3, TMP1, TMP2);
+        a.add(ARG3, TMP1, TMP2, arm::lsr(3));
     }
 
     emit_enter_runtime();
@@ -624,77 +645,89 @@ void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx,
     flush_var(dst_reg);
 }
 
-void BeamModuleAssembler::emit_i_bs_get_fixed_integer(const ArgRegister &Ctx,
-                                                      const ArgLabel &Fail,
-                                                      const ArgWord &Live,
-                                                      const ArgWord &Flags,
-                                                      const ArgWord &Bits,
-                                                      const ArgRegister &Dst) {
-    auto ctx = load_source(Ctx, TMP1);
-    int flags, bits;
-
-    flags = Flags.get();
-    bits = Bits.get();
+void BeamModuleAssembler::emit_bs_get_integer2(const ArgLabel &Fail,
+                                               const ArgRegister &Ctx,
+                                               const ArgWord &Live,
+                                               const ArgSource &Sz,
+                                               const ArgWord &Unit,
+                                               const ArgWord &Flags,
+                                               const ArgRegister &Dst) {
+    Uint size;
+    Uint flags = Flags.get();
 
-    if (bits >= SMALL_BITS) {
-        emit_gc_test_preserve(ArgVal(ArgVal::Word, BIG_NEED_FOR_BITS(bits)),
-                              Live,
-                              ctx.reg);
+    if (flags & BSF_NATIVE) {
+        flags &= ~BSF_NATIVE;
+        flags |= BSF_LITTLE;
     }
 
-    lea(ARG4, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb)));
-
-    if (bits >= SMALL_BITS) {
-        emit_enter_runtime<Update::eHeap>(Live.get());
+    if (Sz.isSmall() && Sz.as<ArgSmall>().getUnsigned() < 8 * sizeof(Uint) &&
+        (size = Sz.as<ArgSmall>().getUnsigned() * Unit.get()) <
+                8 * sizeof(Uint)) {
+        /* Segment of a fixed size supported by bs_match. */
+        const ArgVal match[] = {ArgAtom(am_ensure_at_least),
+                                ArgWord(size),
+                                ArgWord(1),
+                                ArgAtom(am_integer),
+                                Live,
+                                ArgWord(flags),
+                                ArgWord(size),
+                                ArgWord(1),
+                                Dst};
+
+        const Span<ArgVal> args(match, sizeof(match) / sizeof(match[0]));
+        emit_i_bs_match(Fail, Ctx, args);
     } else {
-        emit_enter_runtime(Live.get());
-    }
-
-    a.mov(ARG1, c_p);
-    a.mov(ARG2, bits);
-    a.mov(ARG3, flags);
-    /* ARG4 set above. */
-    runtime_call<4>(erts_bs_get_integer_2);
-
-    if (bits >= SMALL_BITS) {
-        emit_leave_runtime<Update::eHeap>(Live.get());
-    } else {
-        emit_leave_runtime(Live.get());
-    }
-
-    emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
-    mov_arg(Dst, ARG1);
-}
-
-void BeamModuleAssembler::emit_i_bs_get_integer(const ArgRegister &Ctx,
-                                                const ArgLabel &Fail,
-                                                const ArgWord &Live,
-                                                const ArgWord &FlagsAndUnit,
-                                                const ArgSource &Sz,
-                                                const ArgRegister &Dst) {
-    Label fail;
-    int unit;
-
-    fail = resolve_beam_label(Fail, dispUnknown);
-    unit = FlagsAndUnit.get() >> 3;
-
-    if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) {
-        mov_arg(ARG3, Ctx);
-        mov_arg(ARG4, FlagsAndUnit);
-        mov_arg(ARG6, Live);
+        Label fail = resolve_beam_label(Fail, dispUnknown);
+        int unit = Unit.get();
+
+        if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) {
+            /* This operation can be expensive if a bignum can be
+             * created because there can be a garbage collection. */
+            auto max = std::get<1>(getClampedRange(Sz));
+            bool potentially_expensive =
+                    max >= SMALL_BITS || (max * Unit.get()) >= SMALL_BITS;
+
+            mov_arg(ARG3, Ctx);
+            mov_imm(ARG4, flags);
+            if (potentially_expensive) {
+                mov_arg(ARG6, Live);
+            } else {
+#ifdef DEBUG
+                /* Never actually used. */
+                mov_imm(ARG6, 1023);
+#endif
+            }
 
-        emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
-                           Update::eReductions>(Live.get());
+            if (potentially_expensive) {
+                emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
+                                   Update::eReductions>(Live.get());
+            } else {
+                comment("simplified entering runtime because result is always "
+                        "small");
+                emit_enter_runtime(Live.get());
+            }
 
-        a.mov(ARG1, c_p);
-        load_x_reg_array(ARG2);
-        runtime_call<6>(beam_jit_bs_get_integer);
+            a.mov(ARG1, c_p);
+            if (potentially_expensive) {
+                load_x_reg_array(ARG2);
+            } else {
+#ifdef DEBUG
+                /* Never actually used. */
+                mov_imm(ARG2, 0);
+#endif
+            }
+            runtime_call<6>(beam_jit_bs_get_integer);
 
-        emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
-                           Update::eReductions>(Live.get());
+            if (potentially_expensive) {
+                emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
+                                   Update::eReductions>(Live.get());
+            } else {
+                emit_leave_runtime(Live.get());
+            }
 
-        emit_branch_if_not_value(ARG1, fail);
-        mov_arg(Dst, ARG1);
+            emit_branch_if_not_value(ARG1, fail);
+            mov_arg(Dst, ARG1);
+        }
     }
 }
 
@@ -738,11 +771,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx,
 
     mov_arg(ARG1, Ctx);
 
-    /* Ctx is not guaranteed to be inside the live range, so we need to stash
-     * it during GC. */
-    emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED),
-                          Live,
-                          ARG1);
+    emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1);
 
     /* Make field fetching slightly more compact by pre-loading the match
      * buffer into the right argument slot for `erts_bs_get_binary_all_2`. */
@@ -770,13 +799,13 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx,
         }
     }
 
-    emit_enter_runtime<Update::eHeap>(Live.get());
+    emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
     a.mov(ARG1, c_p);
     /* ARG2 was set above. */
     runtime_call<2>(erts_bs_get_binary_all_2);
 
-    emit_leave_runtime<Update::eHeap>(Live.get());
+    emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
     mov_arg(Dst, ARG1);
 }
@@ -796,11 +825,11 @@ void BeamGlobalAssembler::emit_bs_get_tail_shared() {
     a.sub(ARG5, TMP1, ARG4);
 
     emit_enter_runtime_frame();
-    emit_enter_runtime<Update::eHeap>();
+    emit_enter_runtime<Update::eHeapOnlyAlloc>();
 
     runtime_call<5>(erts_extract_sub_binary);
 
-    emit_leave_runtime<Update::eHeap>();
+    emit_leave_runtime<Update::eHeapOnlyAlloc>();
     emit_leave_runtime_frame();
 
     a.ret(a64::x30);
@@ -811,11 +840,7 @@ void BeamModuleAssembler::emit_bs_get_tail(const ArgRegister &Ctx,
                                            const ArgWord &Live) {
     mov_arg(ARG1, Ctx);
 
-    /* Ctx is not guaranteed to be inside the live range, so we need to stash
-     * it during GC. */
-    emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED),
-                          Live,
-                          ARG1);
+    emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1);
 
     fragment_call(ga->get_bs_get_tail_shared());
 
@@ -841,12 +866,34 @@ void BeamModuleAssembler::emit_bs_skip_bits(const ArgLabel &Fail,
 }
 
 void BeamModuleAssembler::emit_i_bs_skip_bits2(const ArgRegister &Ctx,
-                                               const ArgRegister &Bits,
+                                               const ArgRegister &Size,
                                                const ArgLabel &Fail,
                                                const ArgWord &Unit) {
     Label fail = resolve_beam_label(Fail, dispUnknown);
 
-    if (emit_bs_get_field_size(Bits, Unit.get(), fail, ARG1) >= 0) {
+    bool can_fail = true;
+
+    if (always_small(Size)) {
+        auto [min, max] = getClampedRange(Size);
+        can_fail = !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0);
+    }
+
+    if (!can_fail && Unit.get() == 1) {
+        comment("simplified skipping because the types are known");
+
+        const int position_offset = offsetof(ErlBinMatchState, mb.offset);
+        const int size_offset = offsetof(ErlBinMatchState, mb.size);
+        auto [ctx, size] = load_sources(Ctx, TMP1, Size, TMP2);
+
+        a.ldur(TMP3, emit_boxed_val(ctx.reg, position_offset));
+        a.ldur(TMP4, emit_boxed_val(ctx.reg, size_offset));
+
+        a.add(TMP3, TMP3, size.reg, arm::lsr(_TAG_IMMED1_SIZE));
+        a.cmp(TMP3, TMP4);
+        a.b_hi(resolve_beam_label(Fail, disp1MB));
+
+        a.stur(TMP3, emit_boxed_val(ctx.reg, position_offset));
+    } else if (emit_bs_get_field_size(Size, Unit.get(), fail, ARG1) >= 0) {
         emit_bs_skip_bits(Fail, Ctx);
     }
 }
@@ -875,22 +922,21 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx,
 
         mov_arg(ARG4, Ctx);
 
-        /* Ctx is not guaranteed to be inside the live range, so we need to
-         * stash it during GC. */
-        emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED),
+        emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED),
                               Live,
+                              Ctx,
                               ARG4);
 
         lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb)));
 
-        emit_enter_runtime<Update::eHeap>(Live.get());
+        emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         a.mov(ARG1, c_p);
         a.ldr(ARG2, TMP_MEM1q);
         mov_imm(ARG3, Flags.get());
         runtime_call<4>(erts_bs_get_binary_2);
 
-        emit_leave_runtime<Update::eHeap>(Live.get());
+        emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         emit_branch_if_not_value(ARG1, fail);
 
@@ -912,20 +958,18 @@ void BeamModuleAssembler::emit_i_bs_get_float2(const ArgRegister &Ctx,
 
     mov_arg(ARG4, Ctx);
 
-    /* Ctx is not guaranteed to be inside the live range, so we need to stash
-     * it during GC. */
-    emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, ARG4);
+    emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, Ctx, ARG4);
 
     if (emit_bs_get_field_size(Sz, unit, fail, ARG2) >= 0) {
         lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb)));
 
-        emit_enter_runtime<Update::eHeap>(Live.get());
+        emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         a.mov(ARG1, c_p);
         mov_imm(ARG3, Flags.get());
         runtime_call<4>(erts_bs_get_float_2);
 
-        emit_leave_runtime<Update::eHeap>(Live.get());
+        emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get());
 
         emit_branch_if_not_value(ARG1, fail);
 
@@ -983,18 +1027,283 @@ void BeamModuleAssembler::emit_i_bs_put_utf8(const ArgLabel &Fail,
     }
 }
 
+/*
+ * ARG1 = pointer to the match buffer (the `mb` field of the match state)
+ * ARG2 = number of bits left in binary (< 32)
+ * ARG3 = position in binary in bits
+ * ARG4 = base pointer to binary data
+ *
+ * See the comment for emit_bs_get_utf8_shared() for details about the
+ * return value.
+ */
+void BeamGlobalAssembler::emit_bs_get_utf8_short_shared() {
+    const int position_offset = offsetof(ErlBinMatchBuffer, offset);
+
+    const arm::Gp match_state = ARG1;
+    const arm::Gp bitdata = ARG2;
+    const arm::Gp bin_position = ARG3;
+    const arm::Gp bin_base = ARG4;
+
+    Label two = a.newLabel();
+    Label three_or_more = a.newLabel();
+    Label four = a.newLabel();
+    Label read_done = a.newLabel();
+    Label ascii = a.newLabel();
+    Label error = a.newLabel();
+
+    /* Calculate the number of bytes remaining in the binary and error
+     * out if less than one. */
+    a.lsr(bitdata, bitdata, imm(3));
+    a.cbz(bitdata, error);
+
+    /* Calculate a byte mask so we can zero out trailing garbage. */
+    a.neg(TMP5, bitdata, arm::lsl(3));
+    mov_imm(TMP4, -1);
+    a.lsl(TMP4, TMP4, TMP5);
+
+    /* If the position in the binary is not byte-aligned, we'll need
+     * to read one more byte. */
+    a.ands(TMP1, bin_position, imm(7));
+    a.cinc(bitdata, bitdata, imm(arm::CondCode::kNE));
+
+    /* Set up pointer to the first byte to read. */
+    a.add(TMP2, bin_base, bin_position, arm::lsr(3));
+
+    a.cmp(bitdata, 2);
+    a.b_eq(two);
+    a.b_hi(three_or_more);
+
+    /* Read one byte (always byte-aligned). */
+    a.ldrb(bitdata.w(), arm::Mem(TMP2));
+    a.b(read_done);
+
+    /* Read two bytes. */
+    a.bind(two);
+    a.ldrh(bitdata.w(), arm::Mem(TMP2));
+    a.b(read_done);
+
+    a.bind(three_or_more);
+    a.cmp(bitdata, 3);
+    a.b_ne(four);
+
+    /* Read three bytes. */
+    a.ldrh(bitdata.w(), arm::Mem(TMP2));
+    a.ldrb(TMP3.w(), arm::Mem(TMP2, 2));
+    a.orr(bitdata, bitdata, TMP3, arm::lsl(16));
+    a.b(read_done);
+
+    /* Read four bytes (always unaligned). */
+    a.bind(four);
+    a.ldr(bitdata.w(), arm::Mem(TMP2));
+
+    /* Handle the bytes read. */
+    a.bind(read_done);
+    a.rev64(bitdata, bitdata);
+    a.lsl(bitdata, bitdata, TMP1);
+    a.and_(bitdata, bitdata, TMP4);
+    a.tbz(bitdata, imm(63), ascii);
+    a.b(labels[bs_get_utf8_shared]);
+
+    /* Handle plain old ASCII (code point < 128). */
+    a.bind(ascii);
+    a.add(bin_position, bin_position, imm(8));
+    a.str(bin_position, arm::Mem(match_state, position_offset));
+    a.mov(ARG1, imm(_TAG_IMMED1_SMALL));
+    a.orr(ARG1, ARG1, bitdata, arm::lsr(56 - _TAG_IMMED1_SIZE));
+    a.ret(a64::x30);
+
+    /* Signal error. */
+    a.bind(error);
+    mov_imm(ARG1, 0);
+    a.ret(a64::x30);
+}
+
+/*
+ * ARG1 = pointer to the match buffer (the `mb` field of the match state)
+ * ARG2 = 4 bytes from the binary, big-endian, first byte in bits 63..56
+ * ARG3 = position in binary in bits
+ *
+ * On successful return, the extracted code point is a term tagged
+ * small in ARG1 and the position in the match state has been updated. On
+ * failure, ARG1 contains an invalid term where the tag bits are zero.
+ */
+void BeamGlobalAssembler::emit_bs_get_utf8_shared() {
+    const int position_offset = offsetof(ErlBinMatchBuffer, offset);
+
+    const arm::Gp match_state = ARG1;
+    const arm::Gp bitdata = ARG2;
+    const arm::Gp bin_position = ARG3;
+
+    const arm::Gp byte_count = ARG4;
+
+    const arm::Gp shift = TMP4;
+    const arm::Gp control_mask = TMP5;
+    const arm::Gp error_mask = TMP6;
+
+    /* UTF-8 has the following layout, where 'x' are data bits:
+     *
+     * 1 byte:  0xxxxxxx (not handled by this path)
+     * 2 bytes: 110xxxxx, 10xxxxxx
+     * 3 bytes: 1110xxxx, 10xxxxxx 10xxxxxx
+     * 4 bytes: 11110xxx, 10xxxxxx 10xxxxxx 10xxxxxx
+     *
+     * Note that the number of leading one bits in the first byte equals the
+     * number of bytes, which makes it very easy to create masks for
+     * extraction and error checking. */
+
+    /* Calculate the number of bytes. */
+    a.cls(byte_count, bitdata);
+    a.add(byte_count, byte_count, imm(1));
+
+    /* Get rid of the prefix bits. */
+    a.lsl(bitdata, bitdata, byte_count);
+    a.lsr(bitdata, bitdata, byte_count);
+
+    /* Calculate the bit shift now before we start to corrupt the
+     * byte_count. */
+    mov_imm(shift, 64);
+    a.sub(shift, shift, byte_count, arm::lsl(3));
+
+    /* Shift down the value to the least significant part of the word. */
+    a.lsr(bitdata, bitdata, shift);
+
+    /* Matches the '10xxxxxx' components, leaving the header byte alone. */
+    mov_imm(error_mask, 0x00808080ull << 32);
+    a.lsr(error_mask, error_mask, shift);
+
+    /* Construct the control mask '0x00C0C0C0' (already shifted). */
+    a.orr(control_mask, error_mask, error_mask, arm::lsr(1));
+
+    /* Assert that the header bits of each '10xxxxxx' component are correct,
+     * signaling errors by trashing the byte count with an illegal
+     * value (0). */
+    a.and_(TMP3, bitdata, control_mask);
+    a.cmp(TMP3, error_mask);
+
+    a.ubfx(TMP1, bitdata, imm(8), imm(6));
+    a.ubfx(TMP2, bitdata, imm(16), imm(6));
+    a.ubfx(TMP3, bitdata, imm(24), imm(3));
+    a.ubfx(bitdata, bitdata, imm(0), imm(6));
+
+    a.orr(bitdata, bitdata, TMP1, arm::lsl(6));
+    a.orr(bitdata, bitdata, TMP2, arm::lsl(12));
+    a.orr(bitdata, bitdata, TMP3, arm::lsl(18));
+
+    /* Check for too large code point. */
+    mov_imm(TMP1, 0x10FFFF);
+    a.ccmp(bitdata, TMP1, imm(NZCV::kCF), arm::CondCode::kEQ);
+
+    /* Check for the illegal range 16#D800 - 16#DFFF. */
+    a.lsr(TMP1, bitdata, imm(11));
+    a.ccmp(TMP1, imm(0xD800 >> 11), imm(NZCV::kZF), arm::CondCode::kLS);
+    a.csel(byte_count, byte_count, ZERO, imm(arm::CondCode::kNE));
+
+    /* Test for overlong UTF-8 sequence. That can be done by testing
+     * that the bits marked y below are all zero.
+     *
+     * 1 byte:  0xxxxxxx (not handled by this path)
+     * 2 bytes: 110yyyyx, 10xxxxxx
+     * 3 bytes: 1110yyyy, 10yxxxxx 10xxxxxx
+     * 4 bytes: 11110yyy, 10yyxxxx 10xxxxxx 10xxxxxx
+     *
+     * 1 byte:                   xx'xxxxx
+     * 2 bytes:             y'yyyxx'xxxxx
+     * 3 bytes:       y'yyyyx'xxxxx'xxxxx
+     * 4 bytes: y'yyyyx'xxxxx'xxxxx'xxxxx
+     *
+     * The y bits can be isolated by shifting down by the number of bits
+     * shown in this table:
+     *
+     * 2:  7    (byte_count * 4 - 1)
+     * 3: 11    (byte_count * 4 - 1)
+     * 4: 16    (byte_count * 4)
+     */
+
+    /* Calculate number of bits to shift. */
+    a.lsl(TMP1, byte_count, imm(2));
+    a.cmp(byte_count, imm(4));
+    a.csetm(TMP2, imm(arm::CondCode::kNE));
+    a.add(TMP1, TMP1, TMP2);
+
+    /* Pre-fill the tag bits so that we can clear them on error. */
+    mov_imm(TMP2, _TAG_IMMED1_SMALL);
+
+    /* Now isolate the y bits and compare to zero. This check will
+     * be used in a CCMP further down. */
+    a.lsr(TMP1, bitdata, TMP1);
+    a.cmp(TMP1, 0);
+
+    /* Byte count must be 2, 3, or 4. */
+    a.sub(TMP1, byte_count, imm(2));
+    a.ccmp(TMP1, imm(2), imm(NZCV::kCF), imm(arm::CondCode::kNE));
+
+    /* If we have failed, we set byte_count to zero to ensure that the
+     * position update is a no-op, and set the pre-tagged result to zero
+     * so that we can check for error in module code by testing the tag
+     * bits. */
+    a.csel(byte_count, byte_count, ZERO, imm(arm::CondCode::kLS));
+    a.csel(TMP2, TMP2, ZERO, imm(arm::CondCode::kLS));
+
+    a.add(bin_position, bin_position, byte_count, arm::lsl(3));
+    a.str(bin_position, arm::Mem(match_state, position_offset));
+    a.orr(ARG1, TMP2, bitdata, arm::lsl(_TAG_IMMED1_SIZE));
+
+    a.ret(a64::x30);
+}
+
 void BeamModuleAssembler::emit_bs_get_utf8(const ArgRegister &Ctx,
                                            const ArgLabel &Fail) {
-    mov_arg(ARG1, Ctx);
-    lea(ARG1, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb)));
+    const int base_offset = offsetof(ErlBinMatchBuffer, base);
+    const int position_offset = offsetof(ErlBinMatchBuffer, offset);
 
-    emit_enter_runtime();
+    const arm::Gp match_state = ARG1;
+    const arm::Gp bitdata = ARG2;
+    const arm::Gp bin_position = ARG3;
+    const arm::Gp bin_base = ARG4;
+    const arm::Gp bin_size = ARG5;
 
-    runtime_call<1>(erts_bs_get_utf8);
+    auto ctx = load_source(Ctx, ARG6);
 
-    emit_leave_runtime();
+    Label non_ascii = a.newLabel();
+    Label fallback = a.newLabel();
+    Label check = a.newLabel();
+    Label done = a.newLabel();
 
-    emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
+    lea(match_state, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb)));
+    ERTS_CT_ASSERT_FIELD_PAIR(ErlBinMatchBuffer, offset, size);
+    a.ldp(bin_position, bin_size, arm::Mem(ARG1, position_offset));
+    a.ldr(bin_base, arm::Mem(ARG1, base_offset));
+    a.sub(bitdata, bin_size, bin_position);
+    a.cmp(bitdata, imm(32));
+    a.b_lo(fallback);
+
+    emit_read_bits(32, bin_base, bin_position, bitdata);
+    a.tbnz(bitdata, imm(63), non_ascii);
+
+    /* Handle plain old ASCII (code point < 128). */
+    a.add(bin_position, bin_position, imm(8));
+    a.str(bin_position, arm::Mem(ARG1, position_offset));
+    a.mov(ARG1, imm(_TAG_IMMED1_SMALL));
+    a.orr(ARG1, ARG1, bitdata, arm::lsr(56 - _TAG_IMMED1_SIZE));
+    a.b(done);
+
+    /* Handle code point >= 128. */
+    a.bind(non_ascii);
+    fragment_call(ga->get_bs_get_utf8_shared());
+    a.b(check);
+
+    /*
+     * Handle the case that there are not 4 bytes available in the binary.
+     */
+
+    a.bind(fallback);
+    fragment_call(ga->get_bs_get_utf8_short_shared());
+
+    a.bind(check);
+    ERTS_CT_ASSERT((_TAG_IMMED1_SMALL & 1) != 0);
+    a.tbz(ARG1, imm(0), resolve_beam_label(Fail, disp32K));
+
+    a.bind(done);
 }
 
 void BeamModuleAssembler::emit_i_bs_get_utf8(const ArgRegister &Ctx,
@@ -1291,14 +1600,14 @@ void BeamModuleAssembler::emit_i_bs_append(const ArgLabel &Fail,
 
     mov_arg(ArgXRegister(Live.get()), Bin);
 
-    emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get() + 1);
 
     a.mov(ARG1, c_p);
     load_x_reg_array(ARG2);
     runtime_call<6>(erts_bs_append);
 
-    emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+    emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                        Update::eReductions>(Live.get() + 1);
 
     if (Fail.get() != 0) {
@@ -1355,11 +1664,11 @@ void BeamModuleAssembler::emit_bs_init_writable() {
 
     /* We have an implicit liveness of 0, so we don't need to stash X
      * registers. */
-    emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(0);
+    emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(0);
 
     runtime_call<2>(erts_bs_init_writable);
 
-    emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(0);
+    emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(0);
 
     a.mov(XREG0, ARG1);
 }
@@ -1367,7 +1676,7 @@ void BeamModuleAssembler::emit_bs_init_writable() {
 void BeamGlobalAssembler::emit_bs_create_bin_error_shared() {
     a.mov(XREG0, a64::x30);
 
-    emit_enter_runtime<Update::eStack | Update::eHeap>(0);
+    emit_enter_runtime<Update::eHeapAlloc>(0);
 
     /* ARG3 is already set by the caller */
     a.mov(ARG2, ARG4);
@@ -1375,7 +1684,7 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() {
     a.mov(ARG1, c_p);
     runtime_call<4>(beam_jit_bs_construct_fail_info);
 
-    emit_leave_runtime<Update::eStack | Update::eHeap>(0);
+    emit_leave_runtime<Update::eHeapAlloc>(0);
 
     a.mov(ARG4, ZERO);
     a.mov(ARG2, XREG0);
@@ -1429,10 +1738,49 @@ void BeamGlobalAssembler::emit_bs_bit_size_shared() {
     a.ret(a64::x30);
 }
 
+/*
+ * ARG1 = tagged bignum term
+ */
+void BeamGlobalAssembler::emit_get_sint64_shared() {
+    Label success = a.newLabel();
+    Label fail = a.newLabel();
+
+    emit_is_boxed(fail, ARG1);
+    arm::Gp boxed_ptr = emit_ptr_val(TMP3, ARG1);
+    a.ldr(TMP1, emit_boxed_val(boxed_ptr));
+    a.ldr(TMP2, emit_boxed_val(boxed_ptr, sizeof(Eterm)));
+    a.and_(TMP1, TMP1, imm(_TAG_HEADER_MASK));
+    a.cmp(TMP1, imm(POS_BIG_SUBTAG));
+    a.b_eq(success);
+
+    a.cmp(TMP1, imm(NEG_BIG_SUBTAG));
+    a.b_ne(fail);
+
+    a.neg(TMP2, TMP2);
+
+    a.bind(success);
+    {
+        a.mov(ARG1, TMP2);
+        /* Clear Z flag.
+         *
+         * TMP1 is known to be POS_BIG_SUBTAG or NEG_BIG_SUBTAG at this point.
+         */
+        ERTS_CT_ASSERT(POS_BIG_SUBTAG != 0 && NEG_BIG_SUBTAG != 0);
+        a.tst(TMP1, TMP1);
+        a.ret(a64::x30);
+    }
+
+    a.bind(fail);
+    {
+        a.tst(ZERO, ZERO);
+        a.ret(a64::x30);
+    }
+}
+
 struct BscSegment {
     BscSegment()
             : type(am_false), unit(1), flags(0), src(ArgNil()), size(ArgNil()),
-              error_info(0), effectiveSize(-1) {
+              error_info(0), effectiveSize(-1), action(action::DIRECT) {
     }
 
     Eterm type;
@@ -1443,19 +1791,443 @@ struct BscSegment {
 
     Uint error_info;
     Sint effectiveSize;
+
+    /* Here are sub actions for storing integer segments.
+     *
+     * We use the ACCUMULATE_FIRST and ACCUMULATE actions to shift the
+     * values of segments with known, small sizes (no more than 64 bits)
+     * into an accumulator register.
+     *
+     * When no more segments can be accumulated, the STORE action is
+     * used to store the value of the accumulator into the binary.
+     *
+     * The DIRECT action is used when it is not possible to use the
+     * accumulator (for unknown or too large sizes).
+     */
+    enum class action { DIRECT, ACCUMULATE_FIRST, ACCUMULATE, STORE } action;
 };
 
+/* Decides how each segment of a binary construction is to be stored.
+ *
+ * Integer segments with a known size of 1 through 64 bits are grouped
+ * into runs that are shifted into an accumulator register: the first
+ * segment of a run gets the ACCUMULATE_FIRST action and the following
+ * ones ACCUMULATE. Each run is followed by a STORE entry, a copy of the
+ * first segment whose effectiveSize is the total size of the run. A run
+ * never exceeds 64 bits, and a little-endian segment is always alone in
+ * its run.
+ *
+ * All other segments are copied to the result unchanged. */
+static std::vector<BscSegment> bs_combine_segments(
+        const std::vector<BscSegment> &segments) {
+    std::vector<BscSegment> segs;
+
+    /* Starts a new run: ACCUMULATE_FIRST directly followed by STORE. */
+    auto start_run = [&segs](BscSegment first) {
+        first.action = BscSegment::action::ACCUMULATE_FIRST;
+        segs.push_back(first);
+        first.action = BscSegment::action::STORE;
+        segs.push_back(first);
+    };
+
+    for (auto seg : segments) {
+        if (seg.type != am_integer ||
+            !(0 < seg.effectiveSize && seg.effectiveSize <= 64)) {
+            /* Not an integer segment, or an integer segment of unknown
+             * or too large size. Handle using the default DIRECT
+             * action. */
+            segs.push_back(seg);
+            continue;
+        }
+
+        if ((seg.flags & BSF_LITTLE) || segs.empty() ||
+            segs.back().action == BscSegment::action::DIRECT ||
+            (segs.back().flags & BSF_LITTLE)) {
+            /* There is no previous compatible run: nothing has been
+             * emitted yet, the previous segment is stored directly, or
+             * one of the segments is little-endian. Little-endian
+             * segments cannot be combined with other segments. */
+            start_run(seg);
+            continue;
+        }
+
+        /* The last entry is the STORE action of a compatible run. */
+        auto store = segs.back();
+        if (store.effectiveSize + seg.effectiveSize > 64) {
+            /* The size exceeds 64 bits. Can't combine. */
+            start_run(seg);
+            continue;
+        }
+
+        /* The combined values of the segments fit in the accumulator.
+         * Insert an ACCUMULATE action for the current segment before
+         * the pre-existing STORE action. */
+        segs.pop_back();
+        store.effectiveSize += seg.effectiveSize;
+        seg.action = BscSegment::action::ACCUMULATE;
+        segs.push_back(seg);
+        segs.push_back(store);
+    }
+
+    return segs;
+}
+
+/*
+ * Advances the stored bit offset of the binary under construction past
+ * one segment and computes a pointer to the start of that segment.
+ *
+ * In:
+ *    bin_offset = register to store the bit offset into the binary
+ *    bit_offset = current bit offset into binary, or -1 if unknown
+ *    size = size of segment to be constructed, in bits
+ *           (ignored if size_reg is a valid register)
+ *    size_reg = if a valid register, it contains the size of the segment
+ * Out:
+ *    bin_offset register = the bit offset before the segment, loaded only
+ *          if bit_offset is unknown or not byte-aligned
+ *    TMP1 = pointer to the current byte in the binary
+ *    May clobber TMP2. Preserves all other ARG* registers.
+ */
+void BeamModuleAssembler::update_bin_state(arm::Gp bin_offset,
+                                           Sint bit_offset,
+                                           Sint size,
+                                           arm::Gp size_reg) {
+    int cur_bin_offset = offsetof(ErtsSchedulerRegisters,
+                                  aux_regs.d.erl_bits_state.erts_current_bin_);
+    arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
+    arm::Mem mem_bin_offset =
+            arm::Mem(scheduler_registers, cur_bin_offset + sizeof(Eterm));
+
+    if (bit_offset % 8 != 0) {
+        /* The bit offset is unknown or not byte-aligned. */
+        ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
+                                  erts_current_bin_,
+                                  erts_bin_offset_);
+        a.ldp(TMP2, bin_offset, mem_bin_base); /* base and bit offset */
+
+        if (size_reg.isValid()) {
+            a.add(TMP1, bin_offset, size_reg);
+        } else {
+            add(TMP1, bin_offset, size);
+        }
+        a.str(TMP1, mem_bin_offset); /* offset after this segment */
+
+        a.add(TMP1, TMP2, bin_offset, arm::lsr(3)); /* base + offset / 8 */
+    } else { /* The bit offset is known and byte-aligned. */
+        comment("optimized updating of binary construction state");
+        ASSERT(size >= 0 || size_reg.isValid());
+        ASSERT(bit_offset % 8 == 0);
+        a.ldr(TMP1, mem_bin_base);
+        if (size_reg.isValid()) {
+            if (bit_offset == 0) {
+                a.str(size_reg, mem_bin_offset);
+            } else {
+                add(TMP2, size_reg, bit_offset);
+                a.str(TMP2, mem_bin_offset);
+            }
+        } else {
+            mov_imm(TMP2, bit_offset + size); /* new offset is a constant */
+            a.str(TMP2, mem_bin_offset);
+        }
+        if (bit_offset != 0) {
+            add(TMP1, TMP1, bit_offset >> 3); /* skip bytes already built */
+        }
+    }
+}
+
+/* Zero-fills a segment of the binary under construction. The size of the
+ * segment in bits must be in ARG3; effectiveSize is the same size if it is
+ * known when emitting, otherwise negative. Clobbers ARG2-ARG3, TMP1-TMP2. */
+void BeamModuleAssembler::set_zero(Sint effectiveSize) {
+    Label store_units = a.newLabel();
+    Label less_than_a_store_unit = a.newLabel();
+    Sint store_unit = 1; /* 64-bit words zeroed per loop iteration */
+
+    update_bin_state(ARG2, -1, -1, ARG3); /* TMP1 = start of the segment */
+
+    if (effectiveSize >= 256) {
+        /* Store four 64-bit machine words at a time when the size is
+         * known and at least 256 bits. (d31 is zeroed; q31 is stored.) */
+        store_unit = 4;
+        a.movi(a64::d31, 0);
+    } else if (effectiveSize >= 128) {
+        /* Store two 64-bit machine words at a time when the size is
+         * known and at least 128 bits. */
+        store_unit = 2;
+    }
+
+    if (effectiveSize < Sint(store_unit * 8 * sizeof(Eterm))) {
+        /* The size is either not known or smaller than a word. */
+        a.cmp(ARG3, imm(store_unit * 8 * sizeof(Eterm)));
+        a.b_lt(less_than_a_store_unit);
+    }
+
+    a.bind(store_units);
+    if (store_unit == 4) {
+        a.stp(a64::q31, a64::q31, arm::Mem(TMP1).post(sizeof(Eterm[4])));
+    } else if (store_unit == 2) {
+        a.stp(ZERO, ZERO, arm::Mem(TMP1).post(sizeof(Eterm[2])));
+    } else {
+        a.str(ZERO, arm::Mem(TMP1).post(sizeof(Eterm)));
+    }
+    a.sub(ARG3, ARG3, imm(store_unit * 8 * sizeof(Eterm)));
+
+    a.cmp(ARG3, imm(store_unit * 8 * sizeof(Eterm)));
+    a.b_ge(store_units);
+
+    a.bind(less_than_a_store_unit);
+    if (effectiveSize < 0) {
+        /* Unknown size. Zero the rest one byte at a time. */
+        Label byte_loop = a.newLabel();
+        Label done = a.newLabel();
+
+        ASSERT(store_unit == 1);
+
+        a.cbz(ARG3, done);
+
+        a.bind(byte_loop);
+        a.strb(ZERO.w(), arm::Mem(TMP1).post(1));
+        a.subs(ARG3, ARG3, imm(8));
+        a.b_gt(byte_loop);
+
+        a.bind(done);
+    } else if (effectiveSize % (store_unit * 8 * sizeof(Eterm)) != 0) {
+        /* The size is known, and we know that there are less than
+         * 256 bits to initialize. */
+        if (store_unit == 4 && (effectiveSize & 255) >= 128) {
+            a.stp(ZERO, ZERO, arm::Mem(TMP1).post(16));
+        }
+        /* With store_unit == 1 the loop has stored all complete words. */
+        if (store_unit > 1 && (effectiveSize & 127) >= 64) {
+            a.str(ZERO, arm::Mem(TMP1).post(8));
+        }
+
+        if ((effectiveSize & 63) >= 32) {
+            a.str(ZERO.w(), arm::Mem(TMP1).post(4));
+        }
+
+        if ((effectiveSize & 31) >= 16) {
+            a.strh(ZERO.w(), arm::Mem(TMP1).post(2));
+        }
+
+        if ((effectiveSize & 15) >= 8) {
+            a.strb(ZERO.w(), arm::Mem(TMP1).post(1));
+        }
+
+        if ((effectiveSize & 7) > 0) {
+            a.strb(ZERO.w(), arm::Mem(TMP1));
+        }
+    }
+}
+
+/*
+ * In:
+ *
+ *   ARG1 = valid Unicode code point (>= 0x80) to encode
+ *
+ * Out:
+ *
+ *   ARG1 = the code point encoded in UTF-8, first byte in bits 0-7.
+ *   ARG4 = number of bits of result (16, 24, or 32)
+ *
+ *   Preserves other ARG* registers, clobbers TMP1 and TMP2.
+ */
+void BeamGlobalAssembler::emit_construct_utf8_shared() {
+    Label more_than_two_bytes = a.newLabel();
+    Label four_bytes = a.newLabel();
+    const arm::Gp value = ARG1;
+    const arm::Gp num_bits = ARG4;
+
+    a.cmp(value, imm(0x800));
+    a.b_hs(more_than_two_bytes);
+
+    /* Encode Unicode code point in two bytes. */
+    a.ubfiz(TMP1, value, imm(8), imm(6)); /* bits 0-5 -> second byte */
+    mov_imm(TMP2, 0x80c0); /* 0xc0 in the first byte, 0x80 in the second */
+    a.orr(TMP1, TMP1, value, arm::lsr(6)); /* bits 6+ -> first byte */
+    mov_imm(num_bits, 16);
+    a.orr(value, TMP1, TMP2);
+    a.ret(a64::x30);
+
+    /* Test whether the value should be encoded in four bytes. */
+    a.bind(more_than_two_bytes);
+    a.lsr(TMP1, value, imm(16));
+    a.cbnz(TMP1, four_bytes);
+
+    /* Encode Unicode code point in three bytes. */
+    a.lsl(TMP1, value, imm(2));
+    a.ubfiz(TMP2, value, imm(16), imm(6)); /* bits 0-5 -> third byte */
+    a.and_(TMP1, TMP1, imm(0x3f00)); /* bits 6-11 -> second byte */
+    mov_imm(num_bits, 24);
+    a.orr(TMP1, TMP1, value, arm::lsr(12)); /* bits 12+ -> first byte */
+    a.orr(TMP1, TMP1, TMP2);
+    mov_imm(TMP2, 0x8080e0);
+    a.orr(value, TMP1, TMP2);
+    a.ret(a64::x30);
+
+    /* Encode Unicode code point in four bytes. */
+    a.bind(four_bytes);
+    a.lsl(TMP1, value, imm(10));
+    a.lsr(TMP2, value, imm(4));
+    a.and_(TMP1, TMP1, imm(0x3f0000)); /* bits 6-11 -> third byte */
+    a.and_(TMP2, TMP2, imm(0x3f00)); /* bits 12-17 -> second byte */
+    a.bfxil(TMP1, value, imm(18), imm(14)); /* bits 18-31 -> bits 0-13 */
+    mov_imm(num_bits, 32);
+    a.bfi(TMP1, value, imm(24), imm(6)); /* bits 0-5 -> fourth byte */
+    a.orr(TMP1, TMP1, TMP2);
+    mov_imm(TMP2, 0x808080f0);
+    a.orr(value, TMP1, TMP2);
+    a.ret(a64::x30);
+}
+
+void BeamModuleAssembler::emit_construct_utf8(const ArgVal &Src,
+                                              Sint bit_offset, /* or -1 */
+                                              bool is_byte_aligned) {
+    Label prepare_store = a.newLabel();
+    Label store = a.newLabel();
+    Label next = a.newLabel();
+    /* Emits code storing the code point in Src UTF-8 encoded in the binary. */
+    comment("construct utf8 segment");
+    auto src = load_source(Src, ARG1);
+
+    a.lsr(ARG1, src.reg, imm(_TAG_IMMED1_SIZE)); /* shift out the tag bits */
+    mov_imm(ARG4, 8); /* values below 0x80 are stored as one byte */
+    a.cmp(ARG1, imm(0x80));
+    a.b_lo(prepare_store);
+
+    fragment_call(ga->get_construct_utf8_shared()); /* sets ARG1 and ARG4 */
+
+    a.bind(prepare_store);
+    arm::Gp bin_offset = ARG3;
+    update_bin_state(bin_offset, bit_offset, -1, ARG4); /* TMP1 = target */
+
+    if (!is_byte_aligned) {
+        /* Not known to be byte-aligned. Must test alignment. */
+        a.ands(TMP2, bin_offset, imm(7)); /* bit position within the byte */
+        a.b_eq(store);
+
+        /* We must combine the last partial byte with the UTF-8
+         * encoded code point. */
+        a.ldrb(TMP5.w(), arm::Mem(TMP1));
+        /* Move the encoded bits TMP2 positions further into the binary. */
+        a.rev64(TMP4, ARG1);
+        a.lsr(TMP4, TMP4, TMP2);
+        a.rev64(TMP4, TMP4);
+        /* Keep only the first TMP2 bits of the byte already stored. */
+        a.lsl(TMP5, TMP5, TMP2);
+        a.and_(TMP5, TMP5, imm(~0xff));
+        a.lsr(TMP5, TMP5, TMP2);
+
+        a.orr(ARG1, TMP4, TMP5);
+
+        a.add(ARG4, ARG4, imm(8)); /* the shifted data spans one more byte */
+    }
+
+    a.bind(store);
+    if (bit_offset % (4 * 8) == 0) {
+        /* This segment is aligned on a 4-byte boundary. This implies
+         * that a 4-byte write will be inside the allocated binary. */
+        a.str(ARG1.w(), arm::Mem(TMP1));
+    } else {
+        Label do_store_1 = a.newLabel();
+        Label do_store_2 = a.newLabel();
+
+        /* Unsuitable or unknown alignment. We must be careful not
+         * to write beyond the allocated end of the binary. */
+        a.cmp(ARG4, imm(8));
+        a.b_ne(do_store_1);
+
+        a.strb(ARG1.w(), arm::Mem(TMP1)); /* exactly one byte */
+        a.b(next);
+
+        a.bind(do_store_1);
+        a.cmp(ARG4, imm(24));
+        a.b_hi(do_store_2);
+
+        a.strh(ARG1.w(), arm::Mem(TMP1)); /* two or three bytes */
+        a.cmp(ARG4, imm(16));
+        a.b_eq(next);
+
+        a.lsr(ARG1, ARG1, imm(16));
+        a.strb(ARG1.w(), arm::Mem(TMP1, 2));
+        a.b(next);
+
+        a.bind(do_store_2);
+        a.str(ARG1.w(), arm::Mem(TMP1)); /* four or five bytes */
+
+        if (!is_byte_aligned) {
+            a.cmp(ARG4, imm(32));
+            a.b_eq(next);
+
+            a.lsr(ARG1, ARG1, imm(32));
+            a.strb(ARG1.w(), arm::Mem(TMP1, 4));
+        }
+    }
+
+    a.bind(next);
+}
+
+/*
+ * Stores data at a bit position that is not on a byte boundary.
+ * In:
+ *   TMP1 = pointer to current byte (advanced past the bytes written)
+ *   ARG3 = bit offset in the current byte (presumably 1-7); earlier bits kept
+ *   ARG4 = number of bits to write (clobbered)
+ *   ARG8 = data, first byte in bits 0-7 (clobbered). Clobbers TMP4-TMP6. */
+void BeamGlobalAssembler::emit_store_unaligned() {
+    Label loop = a.newLabel();
+    Label done = a.newLabel();
+    const arm::Gp left_bit_offset = ARG3;
+    const arm::Gp right_bit_offset = TMP6;
+    const arm::Gp num_bits = ARG4;
+    const arm::Gp bitdata = ARG8;
+    /* Merge the start of the data into the partly filled current byte. */
+    a.ldrb(TMP5.w(), arm::Mem(TMP1));
+
+    a.and_(TMP4, bitdata, imm(0xff));
+    a.lsr(TMP4, TMP4, left_bit_offset);
+
+    a.lsl(TMP5, TMP5, left_bit_offset);
+    a.and_(TMP5, TMP5, imm(~0xff));
+    a.lsr(TMP5, TMP5, left_bit_offset);
+
+    a.orr(TMP5, TMP4, TMP5);
+
+    a.strb(TMP5.w(), arm::Mem(TMP1).post(1));
+    /* right_bit_offset = number of data bits that fit in the first byte. */
+    mov_imm(right_bit_offset, 8);
+    a.sub(right_bit_offset, right_bit_offset, left_bit_offset);
+    /* Drop the bits just stored; the rest starts at the top of the word. */
+    a.rev64(bitdata, bitdata);
+    a.lsl(bitdata, bitdata, right_bit_offset);
+
+    a.subs(num_bits, num_bits, right_bit_offset);
+    a.b_le(done);
+    /* Store the remaining bits, one byte per iteration. */
+    a.bind(loop);
+    a.ror(bitdata, bitdata, imm(56)); /* next 8 bits into the low byte */
+    a.strb(bitdata.w(), arm::Mem(TMP1).post(1));
+    a.subs(num_bits, num_bits, imm(8));
+    a.b_gt(loop);
+
+    a.bind(done);
+    a.ret(a64::x30);
+}
+
 void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                                                const ArgWord &Alloc,
                                                const ArgWord &Live0,
                                                const ArgRegister &Dst,
                                                const Span<ArgVal> &args) {
     Uint num_bits = 0;
+    Uint estimated_num_bits = 0;
     std::size_t n = args.size();
     std::vector<BscSegment> segments;
-    Label error;
+    Label error; /* Intentionally uninitialized */
     ArgWord Live = Live0;
     arm::Gp sizeReg;
+    Sint allocated_size = -1;
+    bool need_error_handler = false;
 
     /*
      * Collect information about each segment and calculate sizes of
@@ -1501,17 +2273,67 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         seg.error_info = beam_jit_set_bsc_segment_op(bsc_segment, bsc_op);
 
         /*
+         * Test whether we can omit the code for the error handler.
+         */
+        switch (seg.type) {
+        case am_append:
+            if (!(exact_type<BeamTypeId::Bitstring>(seg.src) &&
+                  std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) {
+                need_error_handler = true;
+            }
+            break;
+        case am_binary:
+            if (!(seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all &&
+                  exact_type<BeamTypeId::Bitstring>(seg.src) &&
+                  std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) {
+                need_error_handler = true;
+            }
+            break;
+        case am_integer:
+            if (!exact_type<BeamTypeId::Integer>(seg.src)) {
+                need_error_handler = true;
+            }
+            break;
+        case am_private_append:
+        case am_string:
+            break;
+        default:
+            need_error_handler = true;
+            break;
+        }
+
+        /*
          * Attempt to calculate the effective size of this segment.
-         * Give up is variable or invalid.
+         * Give up if variable or invalid.
          */
         if (seg.size.isSmall() && seg.unit != 0) {
             Uint unsigned_size = seg.size.as<ArgSmall>().getUnsigned();
 
-            if ((unsigned_size >> (sizeof(Eterm) - 1) * 8) == 0) {
+            if ((unsigned_size >> (sizeof(Eterm) - 1) * 8) != 0) {
+                /* Suppress creation of heap binary. */
+                estimated_num_bits += (ERL_ONHEAP_BIN_LIMIT + 1) * 8;
+            } else {
                 /* This multiplication cannot overflow. */
                 Uint seg_size = seg.unit * unsigned_size;
                 seg.effectiveSize = seg_size;
                 num_bits += seg_size;
+                estimated_num_bits += seg_size;
+            }
+        } else if (seg.unit > 0) {
+            auto max = std::min(std::get<1>(getClampedRange(seg.size)),
+                                Sint((ERL_ONHEAP_BIN_LIMIT + 1) * 8));
+            estimated_num_bits += max * seg.unit;
+        } else {
+            switch (seg.type) {
+            case am_utf8:
+            case am_utf16:
+            case am_utf32:
+                estimated_num_bits += 32;
+                break;
+            default:
+                /* Suppress creation of heap binary. */
+                estimated_num_bits += (ERL_ONHEAP_BIN_LIMIT + 1) * 8;
+                break;
             }
         }
 
@@ -1520,14 +2342,15 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             /* At least one segment will need a dynamic size
              * calculation. */
             sizeReg = ARG8;
+            need_error_handler = true;
         }
 
         segments.insert(segments.end(), seg);
     }
 
-    if (Fail.get() != 0) {
+    if (need_error_handler && Fail.get() != 0) {
         error = resolve_beam_label(Fail, dispUnknown);
-    } else {
+    } else if (need_error_handler) {
         Label past_error = a.newLabel();
 
         a.b(past_error);
@@ -1550,6 +2373,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         }
 
         a.bind(past_error);
+    } else {
+        comment("(cannot fail)");
     }
 
     /* We count the total number of bits in an unsigned integer. To
@@ -1575,13 +2400,49 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         if (seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all &&
             seg.type == am_binary) {
             comment("size of an entire binary");
-            mov_arg(ARG1, seg.src);
-            a.mov(ARG3, ARG1);
-            fragment_call(ga->get_bs_bit_size_shared());
-            if (exact_type(seg.src, BEAM_TYPE_BITSTRING)) {
-                comment("skipped check for success since the source "
-                        "is always a bit string");
+            if (exact_type<BeamTypeId::Bitstring>(seg.src)) {
+                auto src = load_source(seg.src, ARG1);
+                arm::Gp boxed_ptr = emit_ptr_val(ARG1, src.reg);
+                auto unit = getSizeUnit(seg.src);
+                bool is_bitstring = unit == 0 || std::gcd(unit, 8) != 8;
+
+                if (is_bitstring) {
+                    comment("inlined size code because the value is always "
+                            "a bitstring");
+                } else {
+                    comment("inlined size code because the value is always "
+                            "a binary");
+                }
+
+                a.ldur(TMP2, emit_boxed_val(boxed_ptr, sizeof(Eterm)));
+
+                if (is_bitstring) {
+                    a.ldur(TMP1, emit_boxed_val(boxed_ptr));
+                }
+
+                a.add(sizeReg, sizeReg, TMP2, arm::lsl(3));
+
+                if (is_bitstring) {
+                    Label not_sub_bin = a.newLabel();
+                    const int bit_number = 3;
+                    ERTS_CT_ASSERT(
+                            (_TAG_HEADER_SUB_BIN & (1 << bit_number)) != 0 &&
+                            (_TAG_HEADER_REFC_BIN & (1 << bit_number)) == 0 &&
+                            (_TAG_HEADER_HEAP_BIN & (1 << bit_number)) == 0);
+
+                    a.tbz(TMP1, imm(bit_number), not_sub_bin);
+
+                    a.ldurb(TMP2.w(),
+                            emit_boxed_val(boxed_ptr,
+                                           offsetof(ErlSubBin, bitsize)));
+                    a.add(sizeReg, sizeReg, TMP2);
+
+                    a.bind(not_sub_bin);
+                }
             } else {
+                mov_arg(ARG1, seg.src);
+                a.mov(ARG3, ARG1);
+                fragment_call(ga->get_bs_bit_size_shared());
                 if (Fail.get() == 0) {
                     mov_imm(ARG4,
                             beam_jit_update_bsc_reason_info(seg.error_info,
@@ -1590,14 +2451,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                                                             BSC_VALUE_ARG3));
                 }
                 a.b_mi(resolve_label(error, disp1MB));
+                a.add(sizeReg, sizeReg, ARG1);
             }
-            a.add(sizeReg, sizeReg, ARG1);
         } else if (seg.unit != 0) {
             bool can_fail = true;
             comment("size binary/integer/float/string");
 
             if (always_small(seg.size)) {
-                auto [min, _] = getIntRange(seg.size);
+                auto min = std::get<0>(getClampedRange(seg.size));
                 if (min >= 0) {
                     can_fail = false;
                 }
@@ -1615,8 +2476,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
 
             if (always_small(seg.size)) {
                 comment("skipped test for small size since it is always small");
-            } else if (always_one_of(seg.size,
-                                     BEAM_TYPE_FLOAT | BEAM_TYPE_INTEGER)) {
+            } else if (always_one_of<BeamTypeId::Number>(seg.size)) {
                 comment("simplified test for small size since it is a number");
                 emit_is_not_boxed(error, ARG3);
             } else {
@@ -1627,10 +2487,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             if (can_fail) {
                 a.tbnz(ARG3, 63, resolve_label(error, disp32K));
             }
-            a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE));
             if (seg.unit == 1) {
-                a.add(sizeReg, sizeReg, TMP1);
+                a.add(sizeReg, sizeReg, ARG3, arm::asr(_TAG_IMMED1_SIZE));
             } else {
+                a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE));
                 if (Fail.get() == 0) {
                     mov_imm(ARG4,
                             beam_jit_update_bsc_reason_info(
@@ -1639,7 +2499,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                                     BSC_INFO_SIZE,
                                     BSC_VALUE_ARG3));
                 }
-                a.tst(TMP1, imm(0xffful << 52));
+                a.tst(TMP1, imm(0xffful << (SMALL_BITS - ERL_UNIT_BITS)));
                 a.b_ne(resolve_label(error, disp1MB));
                 mov_imm(TMP2, seg.unit);
                 a.madd(sizeReg, TMP1, TMP2, sizeReg);
@@ -1649,24 +2509,60 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             case am_utf8: {
                 comment("size utf8");
                 Label next = a.newLabel();
-                auto src_reg = load_source(seg.src, TMP1);
 
-                a.lsr(TMP1, src_reg.reg, imm(_TAG_IMMED1_SIZE));
-                mov_imm(TMP2, 1 * 8);
+                mov_arg(ARG3, seg.src);
+
+                if (Fail.get() == 0) {
+                    mov_imm(ARG4,
+                            beam_jit_update_bsc_reason_info(seg.error_info,
+                                                            BSC_REASON_BADARG,
+                                                            BSC_INFO_TYPE,
+                                                            BSC_VALUE_ARG3));
+                }
+
+                if (always_small(seg.src)) {
+                    comment("skipped test for small value since it is always "
+                            "small");
+                } else if (always_one_of<BeamTypeId::Integer,
+                                         BeamTypeId::AlwaysBoxed>(seg.src)) {
+                    comment("simplified test for small operand since other "
+                            "types are boxed");
+                    emit_is_not_boxed(resolve_label(error, dispUnknown), ARG3);
+                } else {
+                    a.and_(TMP1, ARG3, imm(_TAG_IMMED1_MASK));
+                    a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+                    a.b_ne(resolve_label(error, disp1MB));
+                }
+
+                a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE));
+                mov_imm(TMP2, 1);
                 a.cmp(TMP1, imm(0x7F));
                 a.b_ls(next);
 
-                mov_imm(TMP2, 2 * 8);
+                mov_imm(TMP2, 2);
                 a.cmp(TMP1, imm(0x7FFUL));
                 a.b_ls(next);
 
+                /* Ensure that the value is not in the invalid range
+                 * 0xD800 through 0xDFFF. */
+                a.lsr(TMP3, TMP1, imm(11));
+                a.cmp(TMP3, 0x1b);
+                a.b_eq(resolve_label(error, disp1MB));
+
                 a.cmp(TMP1, imm(0x10000UL));
-                mov_imm(TMP2, 3 * 8);
-                mov_imm(TMP3, 4 * 8);
-                a.csel(TMP2, TMP2, TMP3, arm::CondCode::kLO);
+                a.cset(TMP2, arm::CondCode::kHS);
+                a.add(TMP2, TMP2, imm(3));
+
+                auto [min, max] = getClampedRange(seg.src);
+                if (0 <= min && max < 0x110000) {
+                    comment("skipped range check for unicode code point");
+                } else {
+                    a.cmp(TMP1, 0x110000);
+                    a.b_hs(resolve_label(error, disp1MB));
+                }
 
                 a.bind(next);
-                a.add(sizeReg, sizeReg, TMP2);
+                a.add(sizeReg, sizeReg, TMP2, arm::lsl(3));
                 break;
             }
             case am_utf16: {
@@ -1742,21 +2638,28 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         a.mov(ARG1, c_p);
         load_x_reg_array(ARG2);
 
-        emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get() + 1);
         runtime_call<6>(erts_bs_append_checked);
-        emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
+        emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs |
                            Update::eReductions>(Live.get() + 1);
 
-        if (Fail.get() == 0) {
-            mov_arg(ARG3, ArgXRegister(Live.get()));
-            mov_imm(ARG4,
-                    beam_jit_update_bsc_reason_info(seg.error_info,
-                                                    BSC_REASON_BADARG,
-                                                    BSC_INFO_FVALUE,
-                                                    BSC_VALUE_ARG3));
+        if (exact_type<BeamTypeId::Bitstring>(seg.src) &&
+            std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) {
+            /* There is no way the call can fail with a system_limit
+             * exception on a 64-bit architecture. */
+            comment("skipped test for success because units are compatible");
+        } else {
+            if (Fail.get() == 0) {
+                mov_arg(ARG3, ArgXRegister(Live.get()));
+                mov_imm(ARG4,
+                        beam_jit_update_bsc_reason_info(seg.error_info,
+                                                        BSC_REASON_BADARG,
+                                                        BSC_INFO_FVALUE,
+                                                        BSC_VALUE_ARG3));
+            }
+            emit_branch_if_not_value(ARG1, resolve_label(error, dispUnknown));
         }
-        emit_branch_if_not_value(ARG1, resolve_label(error, dispUnknown));
     } else if (segments[0].type == am_private_append) {
         BscSegment seg = segments[0];
         comment("private append to binary");
@@ -1773,6 +2676,82 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         runtime_call<4>(erts_bs_private_append_checked);
         emit_leave_runtime(Live.get());
         /* There is no way the call can fail on a 64-bit architecture. */
+    } else if (estimated_num_bits % 8 == 0 &&
+               estimated_num_bits / 8 <= ERL_ONHEAP_BIN_LIMIT) {
+        static constexpr auto cur_bin_offset =
+                offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) +
+                offsetof(struct erl_bits_state, erts_current_bin_);
+        Uint need;
+
+        arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset);
+
+        if (sizeReg.isValid()) {
+            Label after_gc_check = a.newLabel();
+
+            comment("allocate heap binary of dynamic size (=< %ld bits)",
+                    estimated_num_bits);
+
+            /* Calculate number of bytes to allocate. */
+            need = (heap_bin_size(0) + Alloc.get() + S_RESERVED);
+            a.lsr(sizeReg, sizeReg, imm(3));
+            a.add(TMP3, sizeReg, imm(7));
+            a.and_(TMP3, TMP3, imm(-8));
+            a.add(TMP1, TMP3, imm(need * sizeof(Eterm)));
+
+            /* Do a GC test. */
+            a.add(ARG3, HTOP, TMP1);
+            a.cmp(ARG3, E);
+            a.b_ls(after_gc_check);
+
+            a.stp(sizeReg, TMP3, TMP_MEM1q);
+
+            mov_imm(ARG4, Live.get());
+            fragment_call(ga->get_garbage_collect());
+
+            a.ldp(sizeReg, TMP3, TMP_MEM1q);
+
+            a.bind(after_gc_check);
+
+            mov_imm(TMP1, header_heap_bin(0));
+            a.lsr(TMP4, TMP3, imm(3));
+            a.add(TMP1, TMP1, TMP4, arm::lsl(_HEADER_ARITY_OFFS));
+
+            /* Create the heap binary. */
+            a.add(ARG1, HTOP, imm(TAG_PRIMARY_BOXED));
+            a.stp(TMP1, sizeReg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+
+            /* Initialize the erl_bin_state struct. */
+            a.stp(HTOP, ZERO, mem_bin_base);
+
+            /* Update HTOP. */
+            a.add(HTOP, HTOP, TMP3);
+        } else {
+            Uint num_bytes = num_bits / 8;
+
+            comment("allocate heap binary of static size");
+
+            allocated_size = (num_bytes + 7) & (-8);
+
+            /* Ensure that there is sufficient room on the heap. */
+            need = heap_bin_size(num_bytes) + Alloc.get();
+            emit_gc_test(ArgWord(0), ArgWord(need), Live);
+
+            mov_imm(TMP1, header_heap_bin(num_bytes));
+            mov_imm(TMP2, num_bytes);
+
+            /* Create the heap binary. */
+            a.add(ARG1, HTOP, imm(TAG_PRIMARY_BOXED));
+            a.stp(TMP1, TMP2, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+
+            /* Initialize the erl_bin_state struct. */
+            ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state,
+                                      erts_current_bin_,
+                                      erts_bin_offset_);
+            a.stp(HTOP, ZERO, mem_bin_base);
+
+            /* Update HTOP. */
+            a.add(HTOP, HTOP, imm(allocated_size));
+        }
     } else {
         comment("allocate binary");
         mov_arg(ARG5, Alloc);
@@ -1780,30 +2759,43 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
         load_erl_bits_state(ARG3);
         load_x_reg_array(ARG2);
         a.mov(ARG1, c_p);
-        emit_enter_runtime<Update::eReductions | Update::eStack |
-                           Update::eHeap | Update::eXRegs>(Live.get());
+        emit_enter_runtime<Update::eReductions | Update::eHeapAlloc |
+                           Update::eXRegs>(Live.get());
         if (sizeReg.isValid()) {
             comment("(size in bits)");
             a.mov(ARG4, sizeReg);
             runtime_call<6>(beam_jit_bs_init_bits);
-        } else if (num_bits % 8 == 0) {
-            comment("(size in bytes)");
-            mov_imm(ARG4, num_bits / 8);
-            runtime_call<6>(beam_jit_bs_init);
         } else {
+            allocated_size = (num_bits + 7) / 8;
+            if (allocated_size <= ERL_ONHEAP_BIN_LIMIT) {
+                allocated_size = (allocated_size + 7) & (-8);
+            }
             mov_imm(ARG4, num_bits);
             runtime_call<6>(beam_jit_bs_init_bits);
         }
-        emit_leave_runtime<Update::eReductions | Update::eStack |
-                           Update::eHeap | Update::eXRegs>(Live.get());
+        emit_leave_runtime<Update::eReductions | Update::eHeapAlloc |
+                           Update::eXRegs>(Live.get());
     }
     a.str(ARG1, TMP_MEM1q);
 
+    segments = bs_combine_segments(segments);
+
+    /* Keep track of the bit offset from the beginning of the binary.
+     * Set to -1 if offset is not known (when a segment of unknown
+     * size has been seen). */
+    Sint bit_offset = 0;
+
+    /* Keep track of whether the current segment is byte-aligned.  (A
+     * segment can be known to be byte-aligned even if the bit offset
+     * is unknown.) */
+    bool is_byte_aligned = true;
+
     /* Build each segment of the binary. */
     for (auto seg : segments) {
         switch (seg.type) {
         case am_append:
         case am_private_append:
+            bit_offset = -1;
             break;
         case am_binary: {
             Uint error_info;
@@ -1838,8 +2830,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                                                              BSC_REASON_BADARG,
                                                              BSC_INFO_UNIT,
                                                              BSC_VALUE_FVALUE);
-                if (seg.unit == 1) {
-                    comment("skipped test for success because unit =:= 1");
+                if (exact_type<BeamTypeId::Bitstring>(seg.src) &&
+                    std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) {
+                    comment("skipped test for success because units are "
+                            "compatible");
                     can_fail = false;
                 }
             } else {
@@ -1847,8 +2841,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
                  * the value is a non-negative small in the
                  * appropriate range. Multiply the size with the
                  * unit. */
-                mov_arg(ARG3, seg.size);
-                a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG3);
+                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
                     a.mul(ARG3, ARG3, TMP1);
@@ -1879,8 +2873,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             if (seg.effectiveSize >= 0) {
                 mov_imm(ARG3, seg.effectiveSize);
             } else {
-                mov_arg(ARG3, seg.size);
-                a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE));
+                auto r = load_source(seg.size, ARG3);
+                a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE));
                 if (seg.unit != 1) {
                     mov_imm(TMP1, seg.unit);
                     a.mul(ARG3, ARG3, TMP1);
@@ -1904,38 +2898,281 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             emit_branch_if_value(ARG1, resolve_label(error, dispUnknown));
             break;
         case am_integer:
-            comment("construct integer segment");
-            if (seg.effectiveSize >= 0) {
-                mov_imm(ARG3, seg.effectiveSize);
-            } else {
-                mov_arg(ARG3, seg.size);
-                a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE));
-                if (seg.unit != 1) {
-                    mov_imm(TMP1, seg.unit);
-                    a.mul(ARG3, ARG3, TMP1);
+            switch (seg.action) {
+            case BscSegment::action::ACCUMULATE_FIRST:
+            case BscSegment::action::ACCUMULATE: {
+                /* Shift an integer of known size (no more than 64 bits)
+                 * into a word-size accumulator. */
+                Label value_is_small = a.newLabel();
+                Label done = a.newLabel();
+
+                comment("accumulate value for integer segment");
+                auto src = load_source(seg.src, ARG1);
+                if (seg.effectiveSize < 64 &&
+                    seg.action == BscSegment::action::ACCUMULATE) {
+                    a.lsl(ARG8, ARG8, imm(seg.effectiveSize));
+                }
+
+                if (!always_small(seg.src)) {
+                    if (always_one_of<BeamTypeId::Integer,
+                                      BeamTypeId::AlwaysBoxed>(seg.src)) {
+                        comment("simplified small test since all other types "
+                                "are boxed");
+                        emit_is_boxed(value_is_small, seg.src, src.reg);
+                    } else {
+                        a.and_(TMP1, src.reg, imm(_TAG_IMMED1_MASK));
+                        a.cmp(TMP1, imm(_TAG_IMMED1_SMALL));
+                        a.b_eq(value_is_small);
+                    }
+
+                    /* The value is boxed. If it is a bignum, extract the
+                     * least significant 64 bits. */
+                    mov_var(ARG1, src);
+                    fragment_call(ga->get_get_sint64_shared());
+                    if (seg.effectiveSize == 64) {
+                        a.mov(ARG8, ARG1);
+                    } else {
+                        a.bfxil(ARG8,
+                                ARG1,
+                                arm::lsr(0),
+                                imm(seg.effectiveSize));
+                    }
+
+                    if (exact_type<BeamTypeId::Integer>(seg.src)) {
+                        a.b(done);
+                    } else {
+                        a.b_ne(done);
+
+                        /* Not a bignum. Signal error. */
+                        if (Fail.get() == 0) {
+                            mov_imm(ARG4,
+                                    beam_jit_update_bsc_reason_info(
+                                            seg.error_info,
+                                            BSC_REASON_BADARG,
+                                            BSC_INFO_TYPE,
+                                            BSC_VALUE_ARG1));
+                        }
+                        a.b(resolve_label(error, disp128MB));
+                    }
+                }
+
+                a.bind(value_is_small);
+                if (seg.effectiveSize == 64) {
+                    a.asr(ARG8, src.reg, imm(_TAG_IMMED1_SIZE));
+                } else if (seg.effectiveSize + _TAG_IMMED1_SIZE > 64) {
+                    a.asr(TMP1, src.reg, imm(_TAG_IMMED1_SIZE));
+                    a.bfxil(ARG8, TMP1, arm::lsr(0), imm(seg.effectiveSize));
+                } else {
+                    a.bfxil(ARG8,
+                            src.reg,
+                            arm::lsr(_TAG_IMMED1_SIZE),
+                            imm(seg.effectiveSize));
                 }
+
+                a.bind(done);
+                break;
             }
-            mov_arg(ARG2, seg.src);
-            mov_imm(ARG4, seg.flags);
-            load_erl_bits_state(ARG1);
+            case BscSegment::action::STORE: {
+                /* The accumulator is now full or the next segment is
+                 * not possible to accumulate, so it's time to store
+                 * the accumulator to the current position in the
+                 * binary. */
+                Label store = a.newLabel();
+                Label done = a.newLabel();
+
+                comment("construct integer segment from accumulator");
+
+                /* First we'll need to ensure that the value in the
+                 * accumulator is in little endian format. */
+                ASSERT(seg.effectiveSize >= 0);
+                if (seg.effectiveSize % 8) {
+                    Uint complete_bytes = 8 * (seg.effectiveSize / 8);
+                    Uint num_partial = seg.effectiveSize % 8;
+                    if (seg.flags & BSF_LITTLE) {
+                        a.ubfx(TMP1,
+                               ARG8,
+                               imm(complete_bytes),
+                               imm(num_partial));
+                        a.bfc(ARG8,
+                              arm::lsr(complete_bytes),
+                              imm(64 - complete_bytes));
+                        a.bfi(ARG8,
+                              TMP1,
+                              imm(complete_bytes + 8 - num_partial),
+                              imm(num_partial));
+                    } else {
+                        a.lsl(ARG8, ARG8, imm(64 - seg.effectiveSize));
+                        a.rev64(ARG8, ARG8);
+                    }
+                } else if ((seg.flags & BSF_LITTLE) == 0) {
+                    switch (seg.effectiveSize) {
+                    case 8:
+                        break;
+                    case 16:
+                        a.rev16(ARG8, ARG8);
+                        break;
+                    case 32:
+                        a.rev32(ARG8, ARG8);
+                        break;
+                    case 64:
+                        a.rev64(ARG8, ARG8);
+                        break;
+                    default:
+                        a.rev64(ARG8, ARG8);
+                        a.lsr(ARG8, ARG8, imm(64 - seg.effectiveSize));
+                    }
+                }
 
-            emit_enter_runtime(Live.get());
-            runtime_call<4>(erts_new_bs_put_integer);
-            emit_leave_runtime(Live.get());
+                arm::Gp bin_offset = ARG3;
+                arm::Gp bin_data = ARG8;
+
+                update_bin_state(bin_offset,
+                                 bit_offset,
+                                 seg.effectiveSize,
+                                 arm::Gp());
+
+                if (!is_byte_aligned) {
+                    if (bit_offset < 0) {
+                        /* Bit offset is unknown. Must test alignment. */
+                        a.ands(bin_offset, bin_offset, imm(7));
+                        a.b_eq(store);
+                    } else if (bit_offset >= 0) {
+                        /* Alignment is known to be unaligned. */
+                        mov_imm(bin_offset, bit_offset & 7);
+                    }
+
+                    /* Bit offset is tested or known to be unaligned. */
+                    mov_imm(ARG4, seg.effectiveSize);
+                    fragment_call(ga->get_store_unaligned());
+
+                    if (bit_offset < 0) {
+                        /* The bit offset is unknown, which implies that
+                         * there exists store code that we will need to
+                         * branch past. */
+                        a.b(done);
+                    }
+                }
 
-            if (exact_type(seg.src, BEAM_TYPE_INTEGER)) {
-                comment("skipped test for success because construction can't "
-                        "fail");
-            } else {
-                if (Fail.get() == 0) {
-                    mov_arg(ARG3, seg.src);
-                    mov_imm(ARG4,
-                            beam_jit_update_bsc_reason_info(seg.error_info,
-                                                            BSC_REASON_BADARG,
-                                                            BSC_INFO_TYPE,
-                                                            BSC_VALUE_ARG3));
+                a.bind(store);
+
+                if (bit_offset < 0 || is_byte_aligned) {
+                    /* Bit offset is tested or known to be
+                     * byte-aligned. Emit inline code to store the
+                     * value of the accumulator into the binary. */
+                    int num_bytes = (seg.effectiveSize + 7) / 8;
+
+                    /* If more than one instruction is required for
+                     * doing the store, test whether it would be safe
+                     * to do a single 32 or 64 bit store. */
+                    switch (num_bytes) {
+                    case 3:
+                        if (bit_offset >= 0 &&
+                            allocated_size * 8 - bit_offset >= 32) {
+                            comment("simplified complicated store");
+                            num_bytes = 4;
+                        }
+                        break;
+                    case 5:
+                    case 6:
+                    case 7:
+                        if (bit_offset >= 0 &&
+                            allocated_size * 8 - bit_offset >= 64) {
+                            comment("simplified complicated store");
+                            num_bytes = 8;
+                        }
+                        break;
+                    }
+
+                    do {
+                        switch (num_bytes) {
+                        case 1:
+                            a.strb(bin_data.w(), arm::Mem(TMP1));
+                            break;
+                        case 2:
+                            a.strh(bin_data.w(), arm::Mem(TMP1));
+                            break;
+                        case 3:
+                            a.strh(bin_data.w(), arm::Mem(TMP1));
+                            a.lsr(bin_data, bin_data, imm(16));
+                            a.strb(bin_data.w(), arm::Mem(TMP1, 2));
+                            break;
+                        case 4:
+                            a.str(bin_data.w(), arm::Mem(TMP1));
+                            break;
+                        case 5:
+                        case 6:
+                        case 7:
+                            a.str(bin_data.w(), arm::Mem(TMP1).post(4));
+                            a.lsr(bin_data, bin_data, imm(32));
+                            break;
+                        case 8:
+                            a.str(bin_data, arm::Mem(TMP1));
+                            num_bytes = 0;
+                            break;
+                        }
+                        num_bytes -= 4;
+                    } while (num_bytes > 0);
+                }
+
+                a.bind(done);
+                break;
+            }
+            case BscSegment::action::DIRECT:
+                /* This segment either has a size exceeding the maximum
+                 * accumulator size of 64 bits or has a variable size.
+                 *
+                 * First load the effective size (size * unit) into ARG3.
+                 */
+                comment("construct integer segment");
+                if (seg.effectiveSize >= 0) {
+                    mov_imm(ARG3, seg.effectiveSize);
+                } else {
+                    auto size = load_source(seg.size, TMP1);
+                    a.lsr(ARG3, size.reg, imm(_TAG_IMMED1_SIZE));
+                    if (Support::isPowerOf2(seg.unit)) {
+                        Uint trailing_bits = Support::ctz<Eterm>(seg.unit);
+                        if (trailing_bits) {
+                            a.lsl(ARG3, ARG3, imm(trailing_bits));
+                        }
+                    } else {
+                        mov_imm(TMP1, seg.unit);
+                        a.mul(ARG3, ARG3, TMP1);
+                    }
+                }
+
+                if (is_byte_aligned && seg.src.isSmall() &&
+                    seg.src.as<ArgSmall>().getSigned() == 0) {
+                    /* Optimize the special case of setting a known
+                     * byte-aligned segment to zero. */
+                    comment("optimized setting segment to 0");
+                    set_zero(seg.effectiveSize);
+                } else {
+                    /* Call the helper function to fetch and store the
+                     * integer into the binary. */
+                    mov_arg(ARG2, seg.src);
+                    mov_imm(ARG4, seg.flags);
+                    load_erl_bits_state(ARG1);
+
+                    emit_enter_runtime(Live.get());
+                    runtime_call<4>(erts_new_bs_put_integer);
+                    emit_leave_runtime(Live.get());
+
+                    if (exact_type<BeamTypeId::Integer>(seg.src)) {
+                        comment("skipped test for success because construction "
+                                "can't fail");
+                    } else {
+                        if (Fail.get() == 0) {
+                            mov_arg(ARG3, seg.src);
+                            mov_imm(ARG4,
+                                    beam_jit_update_bsc_reason_info(
+                                            seg.error_info,
+                                            BSC_REASON_BADARG,
+                                            BSC_INFO_TYPE,
+                                            BSC_VALUE_ARG3));
+                        }
+                        a.cbz(ARG1, resolve_label(error, disp1MB));
+                    }
                 }
-                a.cbz(ARG1, resolve_label(error, disp1MB));
             }
             break;
         case am_string: {
@@ -1953,27 +3190,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             emit_leave_runtime(Live.get());
             break;
         }
-        case am_utf8:
-            comment("construct utf8 segment");
-            mov_arg(ARG2, seg.src);
-            load_erl_bits_state(ARG1);
-
-            emit_enter_runtime(Live.get());
-            runtime_call<2>(erts_bs_put_utf8);
-
-            emit_leave_runtime(Live.get());
-            if (Fail.get() == 0) {
-                mov_arg(ARG3, seg.src);
-                mov_imm(ARG4,
-                        beam_jit_update_bsc_reason_info(seg.error_info,
-                                                        BSC_REASON_BADARG,
-                                                        BSC_INFO_TYPE,
-                                                        BSC_VALUE_ARG3));
-            }
-            a.cbz(ARG1, resolve_label(error, disp1MB));
+        case am_utf8: {
+            emit_construct_utf8(seg.src, bit_offset, is_byte_aligned);
             break;
+        }
         case am_utf16:
-            comment("construct utf8 segment");
+            comment("construct utf16 segment");
             mov_arg(ARG2, seg.src);
             a.mov(ARG3, seg.flags);
             load_erl_bits_state(ARG1);
@@ -2016,8 +3238,925 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
             ASSERT(0);
             break;
         }
+
+        /* Try to keep track of the bit offset. */
+        if (bit_offset >= 0 && (seg.action == BscSegment::action::DIRECT ||
+                                seg.action == BscSegment::action::STORE)) {
+            if (seg.effectiveSize >= 0) {
+                bit_offset += seg.effectiveSize;
+            } else {
+                bit_offset = -1;
+            }
+        }
+
+        /* Try to keep track whether the next segment is byte
+         * aligned. */
+        if (seg.type == am_append || seg.type == am_private_append) {
+            if (!exact_type<BeamTypeId::Bitstring>(seg.src) ||
+                std::gcd(getSizeUnit(seg.src), 8) != 8) {
+                is_byte_aligned = false;
+            }
+        } else if (bit_offset % 8 == 0) {
+            is_byte_aligned = true;
+        } else if (seg.effectiveSize >= 0) {
+            if (seg.effectiveSize % 8 != 0) {
+                is_byte_aligned = false;
+            }
+        } else if (std::gcd(seg.unit, 8) != 8) {
+            is_byte_aligned = false;
+        }
     }
 
     comment("done");
     mov_arg(Dst, TMP_MEM1q);
 }
+
+/*
+ * Here follows the bs_match instruction and friends.
+ */
+
+/* One step of a matching sequence: an action tag together with the
+ * operands that the action may use.
+ *
+ * A default-constructed segment is a TEST_HEAP step with live = nil,
+ * size = 0, unit = 1, flags = 0 and dst = x0. */
+struct BsmSegment {
+    BsmSegment()
+            : action(action::TEST_HEAP),
+              live(ArgNil()),
+              size(0),
+              unit(1),
+              flags(0),
+              dst(ArgXRegister(0)) {
+    }
+
+    /* The kind of step. Note that the member below deliberately shares
+     * its name with the enum type. */
+    enum class action {
+        TEST_HEAP,
+        ENSURE_AT_LEAST,
+        ENSURE_EXACTLY,
+        READ,
+        EXTRACT_BINARY,
+        EXTRACT_INTEGER,
+        GET_INTEGER,
+        GET_BINARY,
+        SKIP,
+        DROP,
+        GET_TAIL,
+        EQ
+    } action;
+
+    /* Operands. Which of them are meaningful depends on `action`. */
+    ArgVal live;
+    Uint size;
+    Uint unit;
+    Uint flags;
+    ArgRegister dst;
+};
+
+/* Emits code that reads `bits` bits (1..64) from a binary into `bitdata`.
+ *
+ * bin_base holds the address of the first byte of the binary data and
+ * bin_offset the position, in bits, at which to start reading. On exit
+ * the segment is left-justified in `bitdata`, that is, its first bit is
+ * bit 63 of the register. The bits below the segment are NOT cleared
+ * and may contain data that follows the segment.
+ *
+ * Depending on the segment size and its alignment, the segment spans
+ * either N or N+1 bytes. Each branch below loads as few bytes as
+ * possible, converts them to big-endian order with `rev64`, and finally
+ * shifts out the `bin_offset % 8` leading bits that precede the segment.
+ *
+ * Clobbers TMP2, TMP4, TMP5 and the condition flags. */
+void BeamModuleAssembler::emit_read_bits(Uint bits,
+                                         const arm::Gp bin_base,
+                                         const arm::Gp bin_offset,
+                                         const arm::Gp bitdata) {
+    Label handle_partial = a.newLabel();
+    Label rev64 = a.newLabel();
+    Label shift = a.newLabel();
+    Label read_done = a.newLabel();
+
+    /* Set by the branches that have NOT yet byte-reversed the data when
+     * they reach the `rev64` label. */
+    bool need_rev64 = false;
+
+    const arm::Gp bin_byte_ptr = TMP2;
+    const arm::Gp bit_offset = TMP4;
+    const arm::Gp tmp = TMP5;
+
+    auto num_partial = bits % 8;
+
+    ASSERT(1 <= bits && bits <= 64);
+
+    /* Address of the byte containing the first bit of the segment. */
+    a.add(bin_byte_ptr, bin_base, bin_offset, arm::lsr(3));
+
+    /* In every branch below, `ands` leaves the bit offset within the
+     * first byte in bit_offset and sets the Z flag if the segment is
+     * byte-aligned. The loads and `rev64` do not touch the flags, so the
+     * flags can be tested after the data has been loaded. */
+    if (bits <= 8) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        if (num_partial == 0) {
+            /* Byte-sized segment. If bit_offset is not byte-aligned,
+             * this segment always spans two bytes. */
+            a.b_ne(handle_partial);
+        } else if (num_partial > 1) {
+            /* The segment is smaller than one byte but more than one
+             * bit. Test whether it fits within the current byte. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_gt(handle_partial);
+        }
+
+        /* The segment fits in the current byte. (A one-bit segment
+         * always does, so no test is emitted for it.) */
+        a.ldrb(bitdata.w(), arm::Mem(bin_byte_ptr));
+        if (num_partial == 0) {
+            a.rev64(bitdata, bitdata);
+            a.b(read_done);
+        } else if (num_partial > 1) {
+            a.b(rev64);
+        }
+
+        /* The segment is unaligned and spans two bytes. */
+        a.bind(handle_partial);
+        if (num_partial != 1) {
+            a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr));
+        }
+        need_rev64 = true;
+    } else if (bits <= 16) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        /* We always need to read at least two bytes. */
+        a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr));
+        a.rev64(bitdata, bitdata);
+        a.b_eq(read_done); /* Done if segment is byte-aligned. */
+
+        /* The segment is unaligned. If its size is 9, it always fits
+         * in two bytes and we fall through to the shift instruction. */
+        a.bind(handle_partial);
+        if (num_partial > 1) {
+            /* If the segment size is 15 bits or less, it is possible
+             * that it fits into two bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_le(shift);
+        }
+
+        if (num_partial != 1) {
+            /* The segment spans three bytes. Read an additional byte and
+             * shift into place (right below the already read two bytes
+             * at the top of the word). */
+            a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2));
+            a.orr(bitdata, bitdata, tmp, arm::lsl(40));
+        }
+    } else if (bits <= 24) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        if (num_partial == 0) {
+            /* Byte-sized segment. If bit_offset is not byte-aligned,
+             * this segment always spans four bytes. */
+            a.b_ne(handle_partial);
+        } else if (num_partial > 1) {
+            /* The segment is smaller than three bytes. Test whether
+             * it spans three or four bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_gt(handle_partial);
+        }
+
+        /* This segment spans three bytes. */
+        a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr));
+        a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2));
+        a.orr(bitdata, bitdata, tmp, arm::lsl(16));
+        if (num_partial == 0) {
+            a.rev64(bitdata, bitdata);
+            a.b(read_done);
+        } else if (num_partial > 1) {
+            a.b(rev64);
+        }
+
+        /* This segment spans four bytes. */
+        a.bind(handle_partial);
+        if (num_partial != 1) {
+            a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr));
+        }
+        need_rev64 = true;
+    } else if (bits <= 32) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        /* We always need to read at least four bytes. */
+        a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr));
+        a.rev64(bitdata, bitdata);
+        a.b_eq(read_done); /* Done if segment is byte-aligned. */
+
+        /* The segment is unaligned. If its size is 25, it always fits
+         * in four bytes (25 + 7 = 32) and we fall through to the shift
+         * instruction without emitting a test. */
+        a.bind(handle_partial);
+        if (num_partial > 1) {
+            /* If the segment size is 31 bits or less, it is possible
+             * that it fits into four bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_le(shift);
+        }
+
+        if (num_partial != 1) {
+            /* The segment spans five bytes. Read an additional byte and
+             * shift into place. */
+            a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4));
+            a.orr(bitdata, bitdata, tmp, arm::lsl(24));
+        }
+    } else if (bits <= 40) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        /* We always need to read four bytes. */
+        a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr));
+        a.rev64(bitdata, bitdata);
+
+        if (num_partial == 0) {
+            /* Byte-sized segment. If bit_offset is not byte-aligned,
+             * this segment always spans six bytes. */
+            a.b_ne(handle_partial);
+        } else if (num_partial > 1) {
+            /* The segment is smaller than five bytes. Test whether it
+             * spans five or six bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_gt(handle_partial);
+        }
+
+        /* This segment spans five bytes. Read an additional byte. */
+        a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4));
+        a.orr(bitdata, bitdata, tmp, arm::lsl(24));
+        if (num_partial == 0) {
+            a.b(read_done);
+        } else if (num_partial > 1) {
+            a.b(shift);
+        }
+
+        a.bind(handle_partial);
+        if (num_partial != 1) {
+            /* This segment spans six bytes. Read two additional bytes. */
+            a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4));
+            a.rev16(tmp.w(), tmp.w());
+            a.orr(bitdata, bitdata, tmp, arm::lsl(16));
+        }
+    } else if (bits <= 48) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        /* We always need to read at least six bytes. */
+        a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr));
+        a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4));
+        a.orr(bitdata, bitdata, tmp, arm::lsl(32));
+        a.rev64(bitdata, bitdata);
+        a.b_eq(read_done); /* Done if segment is byte-aligned. */
+
+        /* The segment is unaligned. If its size is 41, it always fits
+         * in six bytes and we fall through to the shift instruction. */
+        a.bind(handle_partial);
+        if (num_partial > 1) {
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_le(shift);
+        }
+
+        if (num_partial != 1) {
+            /* The segment spans seven bytes. Read an additional byte
+             * and shift into place. */
+            a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 6));
+            a.orr(bitdata, bitdata, tmp, arm::lsl(8));
+        }
+    } else if (bits <= 56) {
+        a.ands(bit_offset, bin_offset, imm(7));
+
+        if (num_partial == 0) {
+            /* Byte-sized segment. If bit_offset is not byte-aligned,
+             * this segment always spans 8 bytes. */
+            a.b_ne(handle_partial);
+        } else if (num_partial > 1) {
+            /* The segment is smaller than 8 bytes. Test whether it
+             * spans 7 or 8 bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_gt(handle_partial);
+        }
+
+        /* This segment spans 7 bytes. Load 8 bytes starting one byte
+         * before the segment and discard that extra leading byte. */
+        a.ldr(bitdata, arm::Mem(bin_byte_ptr, -1));
+        a.lsr(bitdata, bitdata, imm(8));
+        a.b(rev64);
+
+        /* This segment spans 8 bytes. */
+        a.bind(handle_partial);
+        if (num_partial != 1) {
+            a.ldr(bitdata, arm::Mem(bin_byte_ptr));
+        }
+        need_rev64 = true;
+    } else if (bits <= 64) {
+        a.ands(bit_offset, bin_offset, imm(7));
+        a.ldr(bitdata, arm::Mem(bin_byte_ptr));
+        a.rev64(bitdata, bitdata);
+
+        if (num_partial == 0) {
+            /* Byte-sized segment. If it is aligned it spans 8 bytes
+             * and we are done. */
+            a.b_eq(read_done);
+        } else if (num_partial == 1) {
+            /* This segment is 57 bits wide. It always spans 8 bytes. */
+            a.b(shift);
+        } else {
+            /* The segment is smaller than 8 bytes. Test whether it
+             * spans 8 or 9 bytes. */
+            a.cmp(bit_offset, imm(8 - num_partial));
+            a.b_le(shift);
+        }
+
+        /* This segment spans 9 bytes. Read an additional byte. Since
+         * the ninth byte does not fit in the word, shift both parts
+         * separately before combining them. */
+        a.bind(handle_partial);
+        if (num_partial != 1) {
+            a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 8));
+            a.lsl(bitdata, bitdata, bit_offset);
+            a.lsl(tmp, tmp, bit_offset);
+            a.orr(bitdata, bitdata, tmp, arm::lsr(8));
+            a.b(read_done);
+        }
+    }
+
+    /* Convert to big-endian order, unless the branch taken above has
+     * already done so. */
+    a.bind(rev64);
+    if (need_rev64) {
+        a.rev64(bitdata, bitdata);
+    }
+
+    /* Shift the read data into the most significant bits of the
+     * word. */
+    a.bind(shift);
+    a.lsl(bitdata, bitdata, bit_offset);
+
+    a.bind(read_done);
+}
+
+/* Emits code that converts a bit field into an integer term and stores
+ * it in Dst.
+ *
+ * bitdata holds the field right-justified, that is, in its `bits` least
+ * significant bits. `flags` may contain BSF_LITTLE (the field is
+ * little-endian and must be byte-swapped) and BSF_SIGNED (the field is
+ * to be sign-extended). `bits` is at most 64.
+ *
+ * If `bits` is less than SMALL_BITS the result is always a small.
+ * Otherwise a run-time test is emitted, and values that do not fit in a
+ * small are written as a one-word bignum at HTOP, which is advanced by
+ * two words. No heap-space test is emitted here.
+ * NOTE(review): presumably the caller has already reserved the heap
+ * space -- confirm against the call sites.
+ *
+ * Clobbers TMP1 (if used for the destination), TMP2, TMP3 and the
+ * condition flags. */
+void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata,
+                                               Uint flags,
+                                               Uint bits,
+                                               const ArgRegister &Dst) {
+    Label big = a.newLabel();
+    Label done = a.newLabel();
+    arm::Gp data_reg;
+    auto dst = init_destination(Dst, TMP1);
+    Uint num_partial = bits % 8;
+    Uint num_complete = 8 * (bits / 8);
+
+    if (bits <= 8) {
+        /* Endian does not matter for values that fit in a byte. */
+        flags &= ~BSF_LITTLE;
+    }
+
+    /* If this segment is little-endian, reverse endianness. */
+    if ((flags & BSF_LITTLE) != 0) {
+        comment("reverse endian for a little-endian segment");
+    }
+    data_reg = TMP2;
+    if ((flags & BSF_LITTLE) == 0) {
+        /* Big-endian; the data can be used as is. */
+        data_reg = bitdata;
+    } else if (bits == 16) {
+        a.rev16(TMP2, bitdata);
+    } else if (bits == 32) {
+        a.rev32(TMP2, bitdata);
+    } else if (num_partial == 0) {
+        /* A whole number of bytes. Reverse the entire word and move the
+         * result back down to the least significant bits. */
+        a.rev64(TMP2, bitdata);
+        a.lsr(TMP2, TMP2, imm(64 - bits));
+    } else {
+        /* The size is not a multiple of 8. Move the `num_partial` low
+         * bits above the complete bytes (TMP3), and reverse the
+         * complete bytes separately (TMP2) before combining them. */
+        a.ubfiz(TMP3, bitdata, imm(num_complete), imm(num_partial));
+        a.ubfx(TMP2, bitdata, imm(num_partial), imm(num_complete));
+        a.rev64(TMP2, TMP2);
+        a.orr(TMP2, TMP3, TMP2, arm::lsr(64 - num_complete));
+    }
+
+    /* Sign-extend the number if the segment is signed. */
+    if ((flags & BSF_SIGNED) != 0) {
+        if (0 < bits && bits < 64) {
+            comment("sign extend extracted value");
+            a.lsl(TMP2, data_reg, imm(64 - bits));
+            a.asr(TMP2, TMP2, imm(64 - bits));
+            data_reg = TMP2;
+        }
+    }
+
+    /* Handle segments whose values might not fit in a small integer. */
+    if (bits >= SMALL_BITS) {
+        comment("test whether it fits in a small");
+        if (bits < 64 && (flags & BSF_SIGNED) == 0) {
+            /* Clear any bits above the segment so that they do not end
+             * up in the bignum. */
+            a.and_(TMP2, data_reg, imm((1ull << bits) - 1));
+            data_reg = TMP2;
+        }
+        if ((flags & BSF_SIGNED) != 0) {
+            /* Signed segment. The value fits in a small if the bits
+             * from bit SMALL_BITS - 1 and upwards are either all zeros
+             * or all ones. */
+            a.adds(TMP3, ZERO, data_reg, arm::lsr(SMALL_BITS - 1));
+            a.ccmp(TMP3,
+                   imm(_TAG_IMMED1_MASK << 1 | 1),
+                   imm(NZCV::kEqual),
+                   imm(arm::CondCode::kNE));
+            a.b_ne(big);
+        } else {
+            /* Unsigned segment. The value fits in a small if the bits
+             * from bit SMALL_BITS - 1 and upwards are all zeros. */
+            a.lsr(TMP3, data_reg, imm(SMALL_BITS - 1));
+            a.cbnz(TMP3, big);
+        }
+    }
+
+    /* Tag and store the extracted small integer. */
+    comment("store extracted integer as a small");
+    mov_imm(dst.reg, _TAG_IMMED1_SMALL);
+    if ((flags & BSF_SIGNED) != 0) {
+        a.orr(dst.reg, dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE));
+    } else {
+        /* Unsigned. Insert only the bits belonging to the value, since
+         * there could be garbage above them in data_reg. */
+        if (bits >= SMALL_BITS) {
+            a.bfi(dst.reg,
+                  data_reg,
+                  arm::lsl(_TAG_IMMED1_SIZE),
+                  imm(SMALL_BITS));
+        } else if (bits != 0) {
+            a.bfi(dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE), imm(bits));
+        }
+    }
+
+    if (bits >= SMALL_BITS) {
+        /* Skip the code that builds a bignum. */
+        a.b(done);
+    }
+
+    /* Handle a bignum (up to 64 bits). */
+    a.bind(big);
+    if (bits >= SMALL_BITS) {
+        comment("store extracted integer as a bignum");
+        a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED));
+        mov_imm(TMP3, make_pos_bignum_header(1));
+        if ((flags & BSF_SIGNED) == 0) {
+            /* Unsigned. */
+            a.stp(TMP3, data_reg, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        } else {
+            /* Signed. A negative value is stored as a negative bignum
+             * header followed by the magnitude. */
+            Label store = a.newLabel();
+            a.adds(TMP2, data_reg, ZERO);
+            a.b_pl(store);
+
+            mov_imm(TMP3, make_neg_bignum_header(1));
+            a.neg(TMP2, TMP2);
+
+            a.bind(store);
+            a.stp(TMP3, TMP2, arm::Mem(HTOP).post(sizeof(Eterm[2])));
+        }
+    }
+
+    a.bind(done);
+    flush_var(dst);
+}
+
+/*
+ * Emits code that builds a heap binary of `bits / 8` bytes at HTOP from
+ * the contents of `bitdata` and stores a boxed pointer to it in `Dst`.
+ *
+ * The emitted code writes TMP2, TMP3 and TMP4 and advances HTOP past the
+ * words it stores. TMP1 is handed to init_destination() as the fallback
+ * register for `Dst`.
+ */
+void BeamModuleAssembler::emit_extract_binary(const arm::Gp bitdata,
+                                              Uint bits,
+                                              const ArgRegister &Dst) {
+    const Uint byte_count = bits / 8;
+    const bool has_payload = byte_count != 0;
+    auto target = init_destination(Dst, TMP1);
+
+    /* The result term is a boxed pointer to the current heap top. */
+    a.add(target.reg, HTOP, imm(TAG_PRIMARY_BOXED));
+
+    /* Prepare the two leading words: the header and the size in bytes. */
+    mov_imm(TMP2, header_heap_bin(byte_count));
+    mov_imm(TMP3, byte_count);
+
+    /* Byte-reverse the bit data into TMP4 before it is written out. */
+    a.rev64(TMP4, bitdata);
+
+    a.stp(TMP2, TMP3, arm::Mem(HTOP).post(sizeof(Eterm) * 2));
+    if (has_payload) {
+        /* An empty binary has no data word. */
+        a.str(TMP4, arm::Mem(HTOP).post(sizeof(Eterm)));
+    }
+
+    flush_var(target);
+}
+
+/*
+ * Rewrites a decoded bs_match command list into the sequence that the
+ * code emitter consumes.
+ *
+ *   - The heap words needed by all segments (plus `Need`) are summed up
+ *     and a single TEST_HEAP action is placed before the first segment
+ *     whose `live` operand is a word. If no such segment exists, the
+ *     TEST_HEAP is appended at the end using `Live`.
+ *
+ *   - Integer and binary segments of at most 64 bits (binaries must also
+ *     be a whole number of bytes) become EXTRACT_INTEGER/EXTRACT_BINARY
+ *     preceded by a READ action. Consecutive segments share one READ as
+ *     long as the combined size does not exceed 64 bits; a little-endian
+ *     segment always starts a new READ.
+ *
+ *   - EQ segments also read through a READ action, and adjacent EQ
+ *     segments are merged while their combined size is at most 64 bits.
+ *
+ *   - A SKIP that fits into the current READ is folded into it and
+ *     becomes a DROP.
+ *
+ * `segments` is only read; the rewritten list is returned by value.
+ */
+static std::vector<BsmSegment> opt_bsm_segments(
+        const std::vector<BsmSegment> &segments,
+        const ArgWord &Need,
+        const ArgWord &Live) {
+    std::vector<BsmSegment> segs;
+
+    Uint heap_need = Need.get();
+
+    /*
+     * First calculate the total number of heap words needed for
+     * bignums and binaries.
+     */
+    for (const auto &seg : segments) {
+        switch (seg.action) {
+        case BsmSegment::action::GET_INTEGER:
+            if (seg.size >= SMALL_BITS) {
+                heap_need += BIG_NEED_FOR_BITS(seg.size);
+            }
+            break;
+        case BsmSegment::action::GET_BINARY:
+            heap_need += heap_bin_size((seg.size + 7) / 8);
+            break;
+        case BsmSegment::action::GET_TAIL:
+            heap_need += EXTRACT_SUB_BIN_HEAP_NEED;
+            break;
+        default:
+            break;
+        }
+    }
+
+    /* `index` is the position that the next element pushed onto `segs`
+     * will get. `read_action_pos` is the position in `segs` of the READ
+     * action that can still be extended, or -1 when there is none. */
+    int index = 0;
+    int read_action_pos = -1;
+
+    /* `seg` is deliberately a copy: its action may be rewritten before
+     * it is pushed onto `segs`. */
+    for (auto seg : segments) {
+        if (heap_need != 0 && seg.live.isWord()) {
+            BsmSegment s = seg;
+
+            /* Emit the one and only heap test in front of this segment.
+             * A READ must not be shared across it. */
+            read_action_pos = -1;
+            s.action = BsmSegment::action::TEST_HEAP;
+            s.size = heap_need;
+            segs.push_back(s);
+            index++;
+            heap_need = 0;
+        }
+
+        switch (seg.action) {
+        case BsmSegment::action::GET_INTEGER:
+        case BsmSegment::action::GET_BINARY:
+            if (seg.size > 64) {
+                /* Too wide for a READ; keep the generic action. */
+                read_action_pos = -1;
+            } else if (seg.action == BsmSegment::action::GET_BINARY &&
+                       seg.size % 8 != 0) {
+                /* Binaries that are not whole bytes keep the generic
+                 * action as well. */
+                read_action_pos = -1;
+            } else {
+                if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 ||
+                    seg.size + segs.at(read_action_pos).size > 64) {
+                    BsmSegment s;
+
+                    /* Create a new READ action. */
+                    read_action_pos = index;
+                    s.action = BsmSegment::action::READ;
+                    s.size = seg.size;
+                    segs.push_back(s);
+                    index++;
+                } else {
+                    /* Reuse previous READ action. */
+                    segs.at(read_action_pos).size += seg.size;
+                }
+                switch (seg.action) {
+                case BsmSegment::action::GET_INTEGER:
+                    seg.action = BsmSegment::action::EXTRACT_INTEGER;
+                    break;
+                case BsmSegment::action::GET_BINARY:
+                    seg.action = BsmSegment::action::EXTRACT_BINARY;
+                    break;
+                default:
+                    break;
+                }
+            }
+            segs.push_back(seg);
+            break;
+        case BsmSegment::action::EQ: {
+            if (read_action_pos < 0 ||
+                seg.size + segs.at(read_action_pos).size > 64) {
+                BsmSegment s;
+
+                /* Create a new READ action. */
+                read_action_pos = index;
+                s.action = BsmSegment::action::READ;
+                s.size = seg.size;
+                segs.push_back(s);
+                index++;
+            } else {
+                /* Reuse previous READ action. */
+                segs.at(read_action_pos).size += seg.size;
+            }
+            auto &prev = segs.back();
+            if (prev.action == BsmSegment::action::EQ &&
+                prev.size + seg.size <= 64) {
+                /* Coalesce with the previous EQ instruction. For EQ
+                 * segments `unit` holds the value to compare against, so
+                 * the two values are concatenated. Nothing is pushed, so
+                 * cancel out the increment at the end of the loop. */
+                prev.size += seg.size;
+                prev.unit = prev.unit << seg.size | seg.unit;
+                index--;
+            } else {
+                segs.push_back(seg);
+            }
+            break;
+        }
+        case BsmSegment::action::SKIP:
+            if (read_action_pos >= 0 &&
+                seg.size + segs.at(read_action_pos).size <= 64) {
+                /* Let the READ cover the skipped bits and merely drop
+                 * them from the bits that were read. */
+                segs.at(read_action_pos).size += seg.size;
+                seg.action = BsmSegment::action::DROP;
+            } else {
+                read_action_pos = -1;
+            }
+            segs.push_back(seg);
+            break;
+        default:
+            read_action_pos = -1;
+            segs.push_back(seg);
+            break;
+        }
+        index++;
+    }
+
+    /* Handle a trailing test_heap instruction (for the
+     * i_bs_match_test_heap instruction). */
+    if (heap_need) {
+        BsmSegment seg;
+
+        seg.action = BsmSegment::action::TEST_HEAP;
+        seg.size = heap_need;
+        seg.live = Live;
+        segs.push_back(seg);
+    }
+    return segs;
+}
+
+/*
+ * Translates the flags operand of a segment into a BSF_* bit mask.
+ *
+ *   - nil gives 0;
+ *   - a literal is walked as a list of atoms, where `little` and `native`
+ *     both set BSF_LITTLE and `signed` sets BSF_SIGNED (other elements
+ *     are ignored);
+ *   - a word is returned unchanged.
+ *
+ * Any other kind of operand trips an assertion in debug builds and
+ * yields 0.
+ */
+UWord BeamModuleAssembler::bs_get_flags(const ArgVal &val) {
+    if (val.isNil()) {
+        return 0;
+    }
+
+    if (val.isLiteral()) {
+        UWord result = 0;
+        Eterm rest = beamfile_get_literal(beam, val.as<ArgLiteral>().get());
+
+        for (; is_list(rest); rest = CDR(list_val(rest))) {
+            const Eterm flag = CAR(list_val(rest));
+
+            if (flag == am_little || flag == am_native) {
+                result |= BSF_LITTLE;
+            } else if (flag == am_signed) {
+                result |= BSF_SIGNED;
+            }
+        }
+
+        /* The flag list is expected to be a proper list. */
+        ASSERT(is_nil(rest));
+        return result;
+    }
+
+    if (val.isWord()) {
+        /* Originates from bs_get_integer2 instruction. */
+        return val.as<ArgWord>().get();
+    }
+
+    ASSERT(0); /* Should not happen. */
+    return 0;
+}
+
+/*
+ * Emits a bs_match sequence without an explicit heap reservation by
+ * forwarding to emit_i_bs_match_test_heap() with zero for both the heap
+ * need and the live count.
+ */
+void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail,
+                                          ArgRegister const &Ctx,
+                                          Span<ArgVal> const &List) {
+    const ArgWord no_heap_need(0);
+    const ArgWord no_live(0);
+
+    emit_i_bs_match_test_heap(Fail, Ctx, no_heap_need, no_live, List);
+}
+
+void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail,
+                                                    ArgRegister const &Ctx,
+                                                    ArgWord const &Need,
+                                                    ArgWord const &Live,
+                                                    Span<ArgVal> const &List) { /*
+     * Emits code for a bs_match command list.
+     *
+     * `List` is decoded into BsmSegment records, rewritten by
+     * opt_bsm_segments() (which also receives `Need` and `Live`), and
+     * then one code sequence is emitted per resulting segment. Checks
+     * that fail branch to `Fail`. `Ctx` is the match context whose
+     * position, size, base and orig fields are accessed.
+     */
+    const int orig_offset = offsetof(ErlBinMatchState, mb.orig);
+    const int base_offset = offsetof(ErlBinMatchState, mb.base);
+    const int position_offset = offsetof(ErlBinMatchState, mb.offset);
+    const int size_offset = offsetof(ErlBinMatchState, mb.size);
+
+    std::vector<BsmSegment> segments;
+
+    auto current = List.begin();
+    auto end = List.begin() + List.size();
+
+    while (current < end) {
+        auto cmd = current++->as<ArgImmed>().get(); /* Command atom; its
+                                                     * operands follow and
+                                                     * are consumed by the
+                                                     * matching case. */
+        BsmSegment seg;
+
+        switch (cmd) {
+        case am_ensure_at_least: { /* Operands: size, unit. */
+            seg.action = BsmSegment::action::ENSURE_AT_LEAST;
+            seg.size = current[0].as<ArgWord>().get();
+            seg.unit = current[1].as<ArgWord>().get();
+            current += 2;
+            break;
+        }
+        case am_ensure_exactly: { /* Operand: size. */
+            seg.action = BsmSegment::action::ENSURE_EXACTLY;
+            seg.size = current[0].as<ArgWord>().get();
+            current += 1;
+            break;
+        }
+        case am_binary:
+        case am_integer: { /* Operands: live, flags, size, unit, dst. */
+            auto size = current[2].as<ArgWord>().get();
+            auto unit = current[3].as<ArgWord>().get();
+
+            switch (cmd) {
+            case am_integer:
+                seg.action = BsmSegment::action::GET_INTEGER;
+                break;
+            case am_binary:
+                seg.action = BsmSegment::action::GET_BINARY;
+                break;
+            }
+
+            seg.live = current[0];
+            seg.size = size * unit; /* Total segment size in bits. */
+            seg.unit = unit;
+            seg.flags = bs_get_flags(current[1]);
+            seg.dst = current[4].as<ArgRegister>();
+            current += 5;
+            break;
+        }
+        case am_get_tail: { /* Three operands; the middle one is not read
+                             * here. */
+            seg.action = BsmSegment::action::GET_TAIL;
+            seg.live = current[0].as<ArgWord>();
+            seg.dst = current[2].as<ArgRegister>();
+            current += 3;
+            break;
+        }
+        case am_skip: { /* Operand: number of bits to skip. */
+            seg.action = BsmSegment::action::SKIP;
+            seg.size = current[0].as<ArgWord>().get();
+            seg.flags = 0;
+            current += 1;
+            break;
+        }
+        case am_Eq: { /* Operands: live, size, expected value. */
+            seg.action = BsmSegment::action::EQ;
+            seg.live = current[0];
+            seg.size = current[1].as<ArgWord>().get();
+            seg.unit = current[2].as<ArgWord>().get(); /* For EQ, `unit`
+                                                        * carries the value
+                                                        * to compare with.
+                                                        */
+            current += 3;
+            break;
+        }
+        default:
+            abort();
+            break;
+        }
+        segments.push_back(seg);
+    }
+
+    segments = opt_bsm_segments(segments, Need, Live);
+
+    const arm::Gp bin_base = ARG2;
+    const arm::Gp bin_position = ARG3;
+    const arm::Gp bin_size = ARG4;
+    const arm::Gp bitdata = ARG8;
+    bool position_is_valid = false; /* True while bin_position is known to
+                                     * hold the current match position, so
+                                     * that it need not be reloaded from
+                                     * the match context. */
+
+    for (auto seg : segments) {
+        switch (seg.action) {
+        case BsmSegment::action::ENSURE_AT_LEAST: { /* Fail unless at
+                                                     * least `size` bits
+                                                     * remain and the rest
+                                                     * is a multiple of
+                                                     * `unit`. */
+            comment("ensure_at_least %ld %ld", seg.size, seg.unit);
+            auto ctx_reg = load_source(Ctx, TMP1);
+            auto stride = seg.size;
+            auto unit = seg.unit;
+
+            a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset));
+            a.ldur(bin_size, emit_boxed_val(ctx_reg.reg, size_offset));
+            a.sub(TMP5, bin_size, bin_position); /* TMP5 = remaining bits. */
+            if (stride != 0) {
+                cmp(TMP5, stride);
+                a.b_lo(resolve_beam_label(Fail, disp1MB));
+            }
+
+            if (unit != 1) {
+                if (stride % unit != 0) { /* When stride is a multiple of
+                                           * unit, subtracting it would not
+                                           * change the remainder. */
+                    sub(TMP5, TMP5, stride);
+                }
+
+                if ((unit & (unit - 1)) != 0) { /* Unit is not a power of
+                                                 * two: compute the
+                                                 * remainder by division. */
+                    mov_imm(TMP4, unit);
+
+                    a.udiv(TMP3, TMP5, TMP4);
+                    a.msub(TMP5, TMP3, TMP4, TMP5);
+
+                    a.cbnz(TMP5, resolve_beam_label(Fail, disp1MB));
+                } else {
+                    a.tst(TMP5, imm(unit - 1));
+                    a.b_ne(resolve_beam_label(Fail, disp1MB));
+                }
+            }
+
+            position_is_valid = true;
+            break;
+        }
+        case BsmSegment::action::ENSURE_EXACTLY: { /* Fail unless exactly
+                                                    * `size` bits remain. */
+            comment("ensure_exactly %ld", seg.size);
+            auto ctx_reg = load_source(Ctx, TMP1);
+            auto size = seg.size;
+
+            a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset));
+            a.ldur(TMP3, emit_boxed_val(ctx_reg.reg, size_offset));
+            if (size != 0) {
+                a.sub(TMP1, TMP3, bin_position);
+                cmp(TMP1, size);
+            } else {
+                a.subs(TMP1, TMP3, bin_position); /* The subtraction itself
+                                                   * sets the flags. */
+            }
+            a.b_ne(resolve_beam_label(Fail, disp1MB));
+            position_is_valid = true;
+            break;
+        }
+        case BsmSegment::action::EQ: { /* Compare the next `size` bits of
+                                        * bitdata with the expected value
+                                        * held in seg.unit. */
+            comment("=:= %ld %ld", seg.size, seg.unit);
+            if (seg.size != 0 && seg.size != 64) {
+                a.ror(bitdata, bitdata, imm(64 - seg.size)); /* Rotate the
+                                                              * segment into
+                                                              * the lowest
+                                                              * bits. */
+            }
+            if (seg.size == 64) {
+                cmp(bitdata, seg.unit);
+            } else if (seg.size == 32) {
+                cmp(bitdata.w(), seg.unit);
+            } else if (seg.unit == 0) {
+                a.tst(bitdata, imm((1ull << seg.size) - 1));
+            } else {
+                a.and_(TMP1, bitdata, imm((1ull << seg.size) - 1));
+                cmp(TMP1, seg.unit);
+            }
+            a.b_ne(resolve_beam_label(Fail, disp1MB));
+            break;
+        }
+        case BsmSegment::action::TEST_HEAP: { /* Reserve seg.size heap
+                                               * words; bin_position is
+                                               * treated as stale
+                                               * afterwards. */
+            comment("test_heap %ld", seg.size);
+            emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live);
+            position_is_valid = false;
+            break;
+        }
+        case BsmSegment::action::READ: { /* Read seg.size bits into
+                                          * bitdata and advance the stored
+                                          * position past them. */
+            comment("read %ld", seg.size);
+            if (seg.size == 0) {
+                comment("(nothing to do)");
+            } else {
+                auto ctx = load_source(Ctx, ARG1);
+
+                if (!position_is_valid) {
+                    a.ldur(bin_position,
+                           emit_boxed_val(ctx.reg, position_offset));
+                    position_is_valid = true;
+                }
+                a.ldur(bin_base, emit_boxed_val(ctx.reg, base_offset));
+
+                emit_read_bits(seg.size, bin_base, bin_position, bitdata);
+
+                a.add(bin_position, bin_position, imm(seg.size));
+                a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset));
+            }
+            break;
+        }
+        case BsmSegment::action::EXTRACT_BINARY: { /* Build a binary from
+                                                    * bitdata first, then
+                                                    * rotate the consumed
+                                                    * bits out of the way.
+                                                    */
+            auto bits = seg.size;
+            auto Dst = seg.dst;
+
+            comment("extract binary %ld", bits);
+            emit_extract_binary(bitdata, bits, Dst);
+            if (bits != 0 && bits != 64) {
+                a.ror(bitdata, bitdata, imm(64 - bits));
+            }
+            break;
+        }
+        case BsmSegment::action::EXTRACT_INTEGER: { /* Rotate the segment
+                                                     * into the lowest bits
+                                                     * first, then convert
+                                                     * it to an integer. */
+            auto bits = seg.size;
+            auto flags = seg.flags;
+            auto Dst = seg.dst;
+
+            comment("extract integer %ld", bits);
+            if (bits != 0 && bits != 64) {
+                a.ror(bitdata, bitdata, imm(64 - bits));
+            }
+            emit_extract_integer(bitdata, flags, bits, Dst);
+            break;
+        }
+        case BsmSegment::action::GET_INTEGER: { /* Generic path: call
+                                                 * erts_bs_get_integer_2
+                                                 * with the process, the
+                                                 * size in bits, the flags
+                                                 * and the address of the
+                                                 * `mb` field. */
+            Uint live = seg.live.as<ArgWord>().get();
+            Uint flags = seg.flags;
+            auto bits = seg.size;
+            auto Dst = seg.dst;
+
+            comment("get integer %ld", bits);
+            auto ctx = load_source(Ctx, TMP1);
+
+            a.mov(ARG1, c_p);
+            a.mov(ARG2, bits);
+            a.mov(ARG3, flags);
+            lea(ARG4, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb)));
+
+            if (bits >= SMALL_BITS) { /* Only these sizes use the
+                                       * eHeapOnlyAlloc runtime mode. */
+                emit_enter_runtime<Update::eHeapOnlyAlloc>(live);
+            } else {
+                emit_enter_runtime(live);
+            }
+
+            runtime_call<4>(erts_bs_get_integer_2);
+
+            if (bits >= SMALL_BITS) {
+                emit_leave_runtime<Update::eHeapOnlyAlloc>(live);
+            } else {
+                emit_leave_runtime(live);
+            }
+
+            mov_arg(Dst, ARG1);
+
+            position_is_valid = false;
+            break;
+        }
+        case BsmSegment::action::GET_BINARY: { /* Generic path: call
+                                                * erts_extract_sub_binary
+                                                * with the address of the
+                                                * process heap top, orig,
+                                                * base, the position and
+                                                * the size in bits. */
+            auto Live = seg.live;
+            comment("get binary %ld", seg.size);
+            auto ctx = load_source(Ctx, TMP1);
+
+            lea(ARG1, arm::Mem(c_p, offsetof(Process, htop)));
+            a.ldur(ARG2, emit_boxed_val(ctx.reg, orig_offset));
+            a.ldur(ARG3, emit_boxed_val(ctx.reg, base_offset));
+            a.ldur(ARG4, emit_boxed_val(ctx.reg, position_offset));
+            mov_imm(ARG5, seg.size);
+            a.add(TMP2, ARG4, ARG5); /* The advanced position is stored
+                                      * before the runtime call. */
+            a.stur(TMP2, emit_boxed_val(ctx.reg, position_offset));
+
+            emit_enter_runtime<Update::eHeapOnlyAlloc>(
+                    Live.as<ArgWord>().get());
+
+            runtime_call<5>(erts_extract_sub_binary);
+
+            emit_leave_runtime<Update::eHeapOnlyAlloc>(
+                    Live.as<ArgWord>().get());
+
+            mov_arg(seg.dst, ARG1);
+            position_is_valid = false;
+            break;
+        }
+        case BsmSegment::action::GET_TAIL: { /* Delegate to the shared
+                                              * get_tail fragment; the
+                                              * context goes in and the
+                                              * result comes back in ARG1.
+                                              */
+            comment("get_tail");
+
+            mov_arg(ARG1, Ctx);
+            fragment_call(ga->get_bs_get_tail_shared());
+            mov_arg(seg.dst, ARG1);
+            position_is_valid = false;
+            break;
+        }
+        case BsmSegment::action::SKIP: { /* Advance the stored position by
+                                          * seg.size bits without reading
+                                          * any data. */
+            comment("skip %ld", seg.size);
+            auto ctx = load_source(Ctx, TMP1);
+            if (!position_is_valid) {
+                a.ldur(bin_position, emit_boxed_val(ctx.reg, position_offset));
+                position_is_valid = true;
+            }
+            add(bin_position, bin_position, seg.size);
+            a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset));
+            break;
+        }
+        case BsmSegment::action::DROP: /* Bits that were already read into
+                                        * bitdata but are not wanted: just
+                                        * rotate past them. */
+            auto bits = seg.size;
+            comment("drop %ld", bits);
+            if (bits != 0 && bits != 64) {
+                a.ror(bitdata, bitdata, imm(64 - bits));
+            }
+            break;
+        }
+    }
+}