Diffstat (limited to 'erts/emulator/beam/jit/x86/instr_bs.cpp')
-rw-r--r-- | erts/emulator/beam/jit/x86/instr_bs.cpp | 3066
1 file changed, 2649 insertions(+), 417 deletions(-)
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp index ab6abff6cc..36e95df57c 100644 --- a/erts/emulator/beam/jit/x86/instr_bs.cpp +++ b/erts/emulator/beam/jit/x86/instr_bs.cpp @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2020-2022. All Rights Reserved. + * Copyright Ericsson AB 2020-2023. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ */ #include "beam_asm.hpp" +#include <numeric> extern "C" { @@ -57,12 +58,22 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, a.jmp(fail); return -1; } else { + bool can_fail = true; + mov_arg(RET, Size); - a.mov(ARG3d, RETd); - a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); - a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); - a.jne(fail); + if (always_small(Size)) { + auto [min, max] = getClampedRange(Size); + can_fail = + !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0); + comment("simplified segment size checks because " + "the types are known"); + } else { + a.mov(ARG3d, RETd); + a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); + a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); + a.jne(fail); + } if (max_size) { ASSERT(Support::isInt32((Sint)make_small(max_size))); @@ -70,19 +81,35 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, a.ja(fail); } - if (unit == 1) { + if (unit == 0) { + mov_imm(RET, 0); + } else if (unit == 1) { a.sar(RET, imm(_TAG_IMMED1_SIZE)); - a.js(fail); + if (can_fail) { + a.js(fail); + } + } else if (!can_fail && Support::isPowerOf2(unit)) { + int trailing_bits = Support::ctz<Eterm>(unit); + a.and_(RET, imm(~_TAG_IMMED1_MASK)); + if (trailing_bits < _TAG_IMMED1_SIZE) { + a.sar(RET, imm(_TAG_IMMED1_SIZE - trailing_bits)); + } else if (trailing_bits > _TAG_IMMED1_SIZE) { + a.shl(RET, imm(trailing_bits - _TAG_IMMED1_SIZE)); + } } else { /* Untag the size but don't shift it just yet, we want to fail on * overflow if the final result doesn't fit into a small. */ a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.js(fail); + if (can_fail) { + a.js(fail); + } /* Size = (Size) * (Unit) */ mov_imm(ARG3, unit); a.mul(ARG3); /* CLOBBERS ARG3! 
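
The power-of-two unit path above folds the untagging and the unit multiplication into a single shift. A minimal C++ sketch of the arithmetic, assuming the 4-bit immediate tag used on x86-64 (_TAG_IMMED1_SIZE == 4); the helper name is illustrative, not from the tree:

    #include <cstdint>

    // Illustrative only: a tagged small is (value << 4) | tag, so clearing
    // the tag leaves value << 4. Scaling by unit == 1 << k then needs one
    // shift by |4 - k| in the appropriate direction, as the emitted
    // sar/shl pair above does.
    uint64_t small_times_unit(uint64_t tagged, unsigned k /* unit == 1 << k */) {
        uint64_t shifted_value = tagged & ~UINT64_C(0xF); /* value << 4 */
        return k < 4 ? shifted_value >> (4 - k) : shifted_value << (k - 4);
    }
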
*/ - a.jo(fail); + if (can_fail) { + a.jo(fail); + } a.sar(RET, imm(_TAG_IMMED1_SIZE)); } @@ -103,7 +130,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -111,7 +138,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -143,16 +170,14 @@ void BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); load_x_reg_array(ARG2); load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -204,7 +229,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -212,7 +237,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -245,8 +270,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -254,8 +278,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -576,17 +599,18 @@ void BeamModuleAssembler::emit_i_bs_start_match3(const ArgRegister &Src, a.bind(is_binary); { - /* Src is not guaranteed to be inside the live range, so we need to - * stash it during GC. 
*/ - emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)), Live, ARG2); + emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)), + Live, + Src, + ARG2); - emit_enter_runtime<Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); /* ARG2 was set above */ runtime_call<2>(erts_bs_start_match_3); - emit_leave_runtime<Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); a.lea(ARG2, x86::qword_ptr(RET, TAG_PRIMARY_BOXED)); } @@ -650,278 +674,92 @@ void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx, mov_arg(Dst, ARG1); } -/* ARG3 = flags | (size << 3), - * ARG4 = tagged match context */ -void BeamGlobalAssembler::emit_bs_fixed_integer_shared() { - emit_enter_runtime<Update::eStack | Update::eHeap>(); - - a.mov(ARG1, c_p); - /* Unpack size ... */ - a.mov(ARG2, ARG3); - a.shr(ARG2, imm(3)); - /* ... flags. */ - a.and_(ARG3, imm(BSF_ALIGNED | BSF_LITTLE | BSF_SIGNED)); - a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); - runtime_call<4>(erts_bs_get_integer_2); - - emit_leave_runtime<Update::eStack | Update::eHeap>(); - - a.ret(); -} - -x86::Mem BeamModuleAssembler::emit_bs_get_integer_prologue(Label next, - Label fail, - int flags, - int size) { - Label aligned = a.newLabel(); - - a.mov(ARG2, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset))); - a.lea(ARG3, x86::qword_ptr(ARG2, size)); - a.cmp(ARG3, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.size))); - a.ja(fail); - - a.test(ARG2.r8(), imm(CHAR_BIT - 1)); - a.short_().je(aligned); - - /* Actually unaligned reads are quite rare, so we handle everything in a - * shared fragment. */ - mov_imm(ARG3, flags | (size << 3)); - safe_fragment_call(ga->get_bs_fixed_integer_shared()); - - /* The above call can't fail since we work on small numbers and - * bounds-tested above. */ -#ifdef JIT_HARD_DEBUG - a.jmp(next); -#else - a.short_().jmp(next); -#endif - - a.bind(aligned); - { - /* Read base address and convert offset to bytes. */ - a.mov(ARG1, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.base))); - a.shr(ARG2, imm(3)); - - /* We cannot fail from here on; bump the match context's position. 
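
The fixed-size readers being deleted here each load a big-endian field, using MOVBE when available and mov plus bswap (or xchg al, ah for 16 bits) otherwise. A portable C++ equivalent of what either instruction sequence computes on a little-endian host (helper name illustrative):

    #include <cstdint>
    #include <cstddef>
    #include <cstring>

    // What movbe (or mov + bswap) yields: the next 32 bits of the binary
    // interpreted as a big-endian integer.
    uint32_t load_be32(const unsigned char *base, size_t byte_offset) {
        uint32_t v;
        memcpy(&v, base + byte_offset, sizeof v); /* unaligned-safe */
        return __builtin_bswap32(v);              /* gcc/clang builtin */
    }
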
*/ - a.mov(emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset)), - ARG3); - - return x86::Mem(ARG1, ARG2, 0, 0, size / 8); - } -} - -void BeamModuleAssembler::emit_i_bs_get_integer_8(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 8); - - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer_16(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 16); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(x86::ax, address); - } else { - a.mov(x86::ax, address); - a.xchg(x86::al, x86::ah); - } +void BeamModuleAssembler::emit_bs_get_integer2(const ArgLabel &Fail, + const ArgRegister &Ctx, + const ArgWord &Live, + const ArgSource &Sz, + const ArgWord &Unit, + const ArgWord &Flags, + const ArgRegister &Dst) { + Uint size; + Uint flags = Flags.get(); - if (flags & BSF_SIGNED) { - a.movsx(RET, x86::ax); - } else { - a.movzx(RET, x86::ax); - } + if (flags & BSF_NATIVE) { + flags &= ~BSF_NATIVE; + flags |= BSF_LITTLE; } - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer_32(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 32); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsxd(RET, address); - } else { - /* Implicitly zero-extends to 64 bits */ - a.mov(RETd, address); - } + if (Sz.isSmall() && Sz.as<ArgSmall>().getUnsigned() < 8 * sizeof(Uint) && + (size = Sz.as<ArgSmall>().getUnsigned() * Unit.get()) < + 8 * sizeof(Uint)) { + /* Segment of a fixed size supported by bs_match. */ + const ArgVal match[] = {ArgAtom(am_ensure_at_least), + ArgWord(size), + ArgWord(1), + ArgAtom(am_integer), + Live, + ArgWord(flags), + ArgWord(size), + ArgWord(1), + Dst}; + + const Span<ArgVal> args(match, sizeof(match) / sizeof(match[0])); + emit_i_bs_match(Fail, Ctx, args); } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RETd, address); - } else { - a.mov(RETd, address); - a.bswap(RETd); - } - - if (flags & BSF_SIGNED) { - a.movsxd(RET, RETd); - } - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} + Label fail = resolve_beam_label(Fail); + int unit = Unit.get(); + + /* Clobbers RET + ARG3, returns a negative result if we always + * fail and further work is redundant. */ + if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { + /* This operation can be expensive if a bignum can be + * created because there can be a garbage collection. 
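
The "potentially expensive" test that follows is conservative: extraction can only build a bignum, and thus trigger a garbage collection, when the field may need SMALL_BITS or more bits. A sketch of the predicate, assuming a 64-bit build where SMALL_BITS is 60 (word size minus the 4 tag bits):

    #include <cstdint>

    // Illustrative: true when beam_jit_bs_get_integer may have to allocate
    // a bignum, in which case live registers must be saved and the full
    // heap bookkeeping done before entering the runtime system.
    bool may_build_bignum(int64_t max_size, int64_t unit) {
        const int64_t kSmallBits = 60; /* assumption: 64-bit Erlang VM */
        return max_size >= kSmallBits || max_size * unit >= kSmallBits;
    }
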
*/ + auto max = std::get<1>(getClampedRange(Sz)); + bool potentially_expensive = + max >= SMALL_BITS || (max * Unit.get()) >= SMALL_BITS; + + mov_arg(ARG3, Ctx); + mov_imm(ARG4, flags); + if (potentially_expensive) { + mov_arg(ARG6, Live); + } else { +#ifdef DEBUG + /* Never actually used. */ + mov_imm(ARG6, 1023); +#endif + } -void BeamModuleAssembler::emit_i_bs_get_integer_64(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; + if (potentially_expensive) { + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); + } else { + comment("simplified entering runtime because result is always " + "small"); + emit_enter_runtime(); + } - mov_arg(ARG4, Ctx); + a.mov(ARG1, c_p); + if (potentially_expensive) { + load_x_reg_array(ARG2); + } else { +#ifdef DEBUG + /* Never actually used. */ + mov_imm(ARG2, 0); +#endif + } + runtime_call<6>(beam_jit_bs_get_integer); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgWord(BIG_UINT_HEAP_SIZE), Live, ARG4); + if (potentially_expensive) { + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); + } else { + emit_leave_runtime(); + } - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 64); + emit_test_the_non_value(RET); + a.je(fail); - if (flags & BSF_LITTLE) { - a.mov(RET, address); - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RET, address); - } else { - a.mov(RET, address); - a.bswap(RET); + mov_arg(Dst, RET); } } - - a.mov(ARG1, RET); - a.mov(ARG2, RET); - - /* Speculatively make a small out of the result even though it might not - * be one, and jump to the next instruction if it is. */ - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - if (flags & BSF_SIGNED) { - a.sar(ARG2, imm(SMALL_BITS - 1)); - a.add(ARG2, imm(1)); - a.cmp(ARG2, imm(1)); - a.jbe(next); - } else { - a.shr(ARG2, imm(SMALL_BITS - 1)); - a.jz(next); - } - - emit_enter_runtime(); - - a.mov(ARG2, HTOP); - if (flags & BSF_SIGNED) { - runtime_call<2>(small_to_big); - } else { - runtime_call<2>(uword_to_big); - } - a.add(HTOP, imm(sizeof(Eterm) * BIG_UINT_HEAP_SIZE)); - - emit_leave_runtime(); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer(const ArgRegister &Ctx, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgWord &FlagsAndUnit, - const ArgSource &Sz, - const ArgRegister &Dst) { - Label fail; - int unit; - - fail = resolve_beam_label(Fail); - unit = FlagsAndUnit.get() >> 3; - - /* Clobbers RET + ARG3, returns a negative result if we always fail and - * further work is redundant. */ - if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { - mov_arg(ARG3, Ctx); - mov_arg(ARG4, FlagsAndUnit); - mov_arg(ARG6, Live); - - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); - - a.mov(ARG1, c_p); - load_x_reg_array(ARG2); - runtime_call<6>(beam_jit_bs_get_integer); - - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); - - emit_test_the_non_value(RET); - a.je(fail); - - mov_arg(Dst, RET); - } } void BeamModuleAssembler::emit_bs_test_tail2(const ArgLabel &Fail, @@ -962,9 +800,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. 
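
emit_i_bs_get_binary_all2 below extracts whatever remains of the matched binary, and it can succeed only when the remaining bit count divides evenly by the segment unit. A portable sketch of that test, using the ErlBinMatchBuffer size/offset fields the emitted code reads:

    #include <cstdint>

    // Illustrative: the binary/all extraction is legal only when the bits
    // left over divide evenly by the segment's unit.
    bool tail_fits_unit(uint64_t size_bits, uint64_t offset_bits, uint64_t unit) {
        return (size_bits - offset_bits) % unit == 0;
    }
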
*/ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); a.mov(RET, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.size))); a.sub(RET, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.offset))); @@ -981,19 +817,19 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, a.jne(resolve_beam_label(Fail)); - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.lea(ARG2, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); a.mov(ARG1, c_p); runtime_call<2>(erts_bs_get_binary_all_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); mov_arg(Dst, RET); } void BeamGlobalAssembler::emit_bs_get_tail_shared() { - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG2, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.orig))); a.mov(ARG3, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.base))); @@ -1006,7 +842,7 @@ void BeamGlobalAssembler::emit_bs_get_tail_shared() { a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); runtime_call<5>(erts_extract_sub_binary); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); a.ret(); } @@ -1016,9 +852,7 @@ void BeamModuleAssembler::emit_bs_get_tail(const ArgRegister &Ctx, const ArgWord &Live) { mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); safe_fragment_call(ga->get_bs_get_tail_shared()); @@ -1044,7 +878,6 @@ void BeamModuleAssembler::emit_i_bs_skip_bits2(const ArgRegister &Ctx, Label fail; fail = resolve_beam_label(Fail); - if (emit_bs_get_field_size(Bits, Unit.get(), fail, RET) >= 0) { emit_bs_skip_bits(Fail, Ctx); } @@ -1076,11 +909,12 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to - * stash it during GC. */ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG4); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), + Live, + Ctx, + ARG4); - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); a.mov(ARG2, TMP_MEM1q); @@ -1088,7 +922,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx, a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); runtime_call<4>(erts_bs_get_binary_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); emit_test_the_non_value(RET); a.je(fail); @@ -1111,19 +945,17 @@ void BeamModuleAssembler::emit_i_bs_get_float2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. 
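
The skip instruction handled above never extracts anything; it only advances the match position. A minimal sketch of the semantics the emitted code implements:

    #include <cstdint>

    // Illustrative: skipping bumps the offset, failing if the skip would
    // run past the end of the binary (offset <= size is an invariant).
    bool skip_bits(uint64_t size, uint64_t *offset, uint64_t bits) {
        if (size - *offset < bits) return false;
        *offset += bits;
        return true;
    }
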
*/ - emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, ARG4); + emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, Ctx, ARG4); if (emit_bs_get_field_size(Sz, unit, fail, ARG2, 64) >= 0) { - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); mov_imm(ARG3, Flags.get()); a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); runtime_call<4>(erts_bs_get_float_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); emit_test_the_non_value(RET); a.je(fail); @@ -1181,19 +1013,385 @@ void BeamModuleAssembler::emit_i_bs_put_utf8(const ArgLabel &Fail, } } +/* + * ARG1 = pointer to match state + * ARG2 = position in binary in bits + * ARG3 = base pointer to binary data + * RET = number of bits left in binary + * + * This fragment is called if the binary is unaligned and/or the number + * of remaining bits is less than 32. + * + * See the comment for emit_bs_get_utf8_shared() for details about the + * return value. + */ +void BeamGlobalAssembler::emit_bs_get_utf8_short_shared() { + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + + const x86::Gp ctx = ARG1; + const x86::Gp bin_position = ARG2; + const x86::Gp bin_base = ARG3; + + Label at_least_one = a.newLabel(); + Label two = a.newLabel(); + Label three_or_more = a.newLabel(); + Label four = a.newLabel(); + Label five = a.newLabel(); + Label read_done = a.newLabel(); + Label no_masking = a.newLabel(); + Label ascii = a.newLabel(); + + /* Calculate the number of bytes remaining in the binary and error + * out if less than one. */ + a.shr(RET, imm(3)); + a.test(RET, RET); + a.short_().jne(at_least_one); + + /* ZF is is already set. */ + a.ret(); + + a.bind(at_least_one); + + /* Save number of bytes remaining in binary. */ + a.mov(ARG5, RET); + + /* If the position in the binary is not byte-aligned, we'll need + * to read one more byte. */ + a.test(bin_position, imm(7)); + a.setne(ARG4.r8()); + a.movzx(ARG4d, ARG4.r8()); + a.add(RET, ARG4); + + /* Save original position in bits and set up byte offset for + * reading. */ + a.push(bin_position); + a.shr(bin_position, imm(3)); + + a.cmp(RET, imm(2)); + a.short_().je(two); + a.short_().ja(three_or_more); + + /* Read one byte (always byte-aligned). */ + a.mov(RETb, x86::byte_ptr(bin_base, bin_position)); + a.movzx(RETd, RETb); + a.short_().jmp(read_done); + + /* Read two bytes. */ + a.bind(two); + a.mov(RET.r16(), x86::word_ptr(bin_base, bin_position)); + a.movzx(RETd, RET.r16()); + a.short_().jmp(read_done); + + a.bind(three_or_more); + a.cmp(RET, imm(4)); + a.short_().je(four); + a.short_().ja(five); + + /* Read three bytes. */ + a.mov(RET.r8(), x86::byte_ptr(bin_base, bin_position, 0, 2)); + a.movzx(RETd, RETb); + a.shl(RETd, imm(16)); + a.mov(RET.r16(), x86::word_ptr(bin_base, bin_position)); + a.short_().jmp(read_done); + + /* Read four bytes (always unaligned). */ + a.bind(four); + a.mov(RETd, x86::dword_ptr(bin_base, bin_position)); + a.short_().jmp(read_done); + + /* Read five bytes (always unaligned). */ + a.bind(five); + a.mov(RETd, x86::dword_ptr(bin_base, bin_position)); + a.mov(ARG4.r8(), x86::byte_ptr(bin_base, bin_position, 0, 4)); + a.movzx(ARG4d, ARG4.r8()); + a.shl(ARG4, imm(32)); + a.or_(RET, ARG4); + + /* Handle the bytes read. 
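
The case analysis in this fragment reads between one and five bytes. A sketch of the byte count it ends up with, given the bit position and the bits remaining (my reading of the fragment; the helper is illustrative):

    #include <cstdint>
    #include <cstddef>

    // Illustrative: whole bytes remaining, plus one extra byte when the
    // read does not start on a byte boundary. Five bytes always suffice,
    // since a UTF-8 sequence spans at most four bytes plus alignment slack.
    size_t utf8_read_bytes(uint64_t bit_offset, uint64_t bits_left) {
        size_t n = (size_t)(bits_left >> 3) + ((bit_offset & 7) != 0);
        return n > 5 ? 5 : n;
    }
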
*/ + a.bind(read_done); + a.pop(bin_position); + a.bswap(RET); + + if (x86::rcx == ctx) { + a.push(x86::rcx); + } + a.mov(x86::ecx, bin_position.r32()); + a.and_(x86::cl, imm(7)); + a.shl(RET, x86::cl); + + /* Check whether we will need to clear out trailing + * garbage not part of the binary. */ + a.mov(x86::cl, 64); + a.cmp(ARG5, imm(3)); + a.short_().ja(no_masking); + + /* Calculate a byte mask and zero out trailing garbage. */ + a.shl(ARG5d, imm(3)); + a.sub(x86::cl, ARG5.r8()); + mov_imm(ARG5, -1); + a.shl(ARG5, x86::cl); + a.and_(RET, ARG5); + + a.bind(no_masking); + if (x86::rcx == ctx) { + a.pop(x86::rcx); + } + + /* `test rax, rax` is a shorter instruction but can cause a warning + * in valgrind if there are any uninitialized bits in rax. */ + a.bt(RET, imm(63)); + a.short_().jnc(ascii); + + /* The bs_get_utf8_shared fragment expects the contents in RETd. */ + a.shr(RET, imm(32)); + a.jmp(labels[bs_get_utf8_shared]); + + /* Handle plain old ASCII (code point < 128). */ + a.bind(ascii); + a.add(x86::qword_ptr(ctx, position_offset), imm(8)); + a.shr(RET, imm(56 - _TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); /* Always clears ZF. */ + a.ret(); +} + +/* + * ARG1 = pointer to match state + * ARG2 = position in binary in bits + * RETd = 4 bytes read from the binary in big-endian order + * + * On successful return, the extracted code point is in RET, the + * position in the match state has been updated, and the ZF is clear. + * On failure, the ZF is set. + */ +void BeamGlobalAssembler::emit_bs_get_utf8_shared() { + Label error = a.newLabel(); + + x86::Gp shift_q = ARG4, shift_d = ARG4d, shift_b = ARG4.r8(); + x86::Gp original_value_d = RETd; + + x86::Gp byte_count_q = ARG2, byte_count_d = ARG2d; + x86::Gp extracted_value_d = ARG3d, extracted_value_b = ARG3.r8(); + x86::Gp control_mask_d = ARG5d; + x86::Gp error_mask_d = ARG6d; + + ASSERT(extracted_value_d != shift_d); + ASSERT(control_mask_d != shift_d); + ASSERT(error_mask_d != shift_d); + ASSERT(byte_count_d != shift_d); + + /* UTF-8 has the following layout, where 'x' are data bits: + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110xxxxx, 10xxxxxx + * 3 bytes: 1110xxxx, 10xxxxxx 10xxxxxx + * 4 bytes: 11110xxx, 10xxxxxx 10xxxxxx 10xxxxxx + * + * Note that the number of leading bits is equal to the number of bytes, + * which makes it very easy to create masks for extraction and error + * checking. */ + + /* The PEXT instruction has poor latency on some processors, so we try to + * hide that by extracting early on. Should this be a problem, it's not + * much slower to hand-roll it with shifts or BEXTR. + * + * The mask covers data bits from all variants. This includes the 23rd bit + * to support the 2-byte case, which is set on all well-formed 4-byte + * codepoints, so it must be cleared before range testing .*/ + a.mov(extracted_value_d, imm(0x1F3F3F3F)); + a.pext(extracted_value_d, original_value_d, extracted_value_d); + + /* Preserve current match buffer and bit offset. */ + a.push(ARG1); + a.push(ARG2); + + /* Byte count = leading bit count. */ + a.mov(byte_count_d, original_value_d); + a.not_(byte_count_d); + a.lzcnt(byte_count_d, byte_count_d); + + /* Mask shift = (4 - byte count) * 8 */ + a.mov(shift_d, imm(4)); + a.sub(shift_d, byte_count_d); + a.lea(shift_d, x86::qword_ptr(0, shift_q, 3)); + + /* Shift the original value and masks into place. */ + a.shrx(original_value_d, original_value_d, shift_d); + + /* Matches the '10xxxxxx' components, leaving the header byte alone. 
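
Two BMI building blocks do the heavy lifting in this fragment. A portable sketch of each using the corresponding compiler intrinsics; this mirrors the emitted code rather than replacing it:

    #include <cstdint>
    #include <immintrin.h> /* _lzcnt_u32, _pext_u32 (LZCNT/BMI2) */

    // Sequence length = count of leading one bits in the first byte,
    // computed over the whole big-endian word exactly as the lzcnt above.
    unsigned utf8_seq_bytes(uint32_t be_word) {
        return (unsigned)_lzcnt_u32(~be_word);
    }

    // Payload extraction: gather the data bits of all four candidate bytes
    // in one step. The mask keeps 5+6+6+6 bits, one bit too many for the
    // 2-byte form, which is why bit 22 is cleared again further down.
    uint32_t utf8_payload(uint32_t be_word) {
        return _pext_u32(be_word, 0x1F3F3F3Fu);
    }
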
*/ + a.mov(control_mask_d, imm(0x00C0C0C0)); + a.shrx(control_mask_d, control_mask_d, shift_d); + a.mov(error_mask_d, imm(0x00808080)); + a.shrx(error_mask_d, error_mask_d, shift_d); + + /* Extracted value shift = (4 - byte count) * 6, as the leading '10' on + * every byte has been removed through PEXT. + * + * We calculate the shift here to avoid depending on byte_count_d later on + * when it may have changed. */ + a.mov(shift_d, imm(4)); + a.sub(shift_d, byte_count_d); + a.add(shift_d, shift_d); + a.lea(shift_d, x86::qword_ptr(shift_q, shift_q, 1)); + + /* Assert that the header bits of each '10xxxxxx' component is correct, + * signalling errors by trashing the byte count with a guaranteed-illegal + * value. */ + a.and_(original_value_d, control_mask_d); + a.cmp(original_value_d, error_mask_d); + a.cmovne(byte_count_d, error_mask_d); + + /* Shift the extracted value into place. */ + a.shrx(RETd, extracted_value_d, shift_d); + + /* The extraction mask is a bit too wide, see above for details. */ + a.and_(RETd, imm(~(1 << 22))); + + /* Check for too large code point. */ + a.cmp(RETd, imm(0x10FFFF)); + a.cmova(byte_count_d, error_mask_d); + + /* Check for the illegal range 16#D800 - 16#DFFF. */ + a.mov(shift_d, RETd); + a.and_(shift_d, imm(-0x800)); + a.cmp(shift_d, imm(0xD800)); + a.cmove(byte_count_d, error_mask_d); + + /* Test for overlong UTF-8 sequence. That can be done by testing + * that the bits marked y below are all zero. + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110yyyyx, 10xxxxxx + * 3 bytes: 1110yyyy, 10yxxxxx 10xxxxxx + * 4 bytes: 11110yyy, 10yyxxxx 10xxxxxx 10xxxxxx + * + * 1 byte: xx'xxxxx + * 2 bytes: y'yyyxx'xxxxx + * 3 bytes: y'yyyyx'xxxxx'xxxxx + * 4 bytes: y'yyyyx'xxxxx'xxxxx'xxxxx + * + * The y bits can be isolated by shifting down by the number of bits + * shown in this table: + * + * 2: 7 (byte_count * 4 - 1) + * 3: 11 (byte_count * 4 - 1) + * 4: 16 (byte_count * 4) + */ + + /* Calculate number of bits to shift. */ + a.lea(shift_d, x86::qword_ptr(0, byte_count_q, 2)); + a.cmp(byte_count_d, imm(4)); + a.setne(extracted_value_b); + a.sub(shift_b, extracted_value_b); + a.movzx(shift_q, shift_b); + + /* Now isolate the y bits and compare to zero. */ + a.shrx(extracted_value_d, RETd, shift_d); + a.test(extracted_value_d, extracted_value_d); + a.cmove(byte_count_d, error_mask_d); + + /* Restore current bit offset and match buffer. */ + ASSERT(ARG1 != byte_count_q && ARG3 != byte_count_q); + a.pop(ARG3); + a.pop(ARG1); + + /* Advance our current position. */ + a.lea(ARG3, x86::qword_ptr(ARG3, byte_count_q, 3)); + + /* Byte count must be 2, 3, or 4. */ + a.sub(byte_count_d, imm(2)); + a.cmp(byte_count_d, imm(2)); + a.ja(error); + + a.mov(x86::qword_ptr(ARG1, offsetof(ErlBinMatchBuffer, offset)), ARG3); + + a.shl(RETd, imm(_TAG_IMMED1_SIZE)); + a.or_(RETd, imm(_TAG_IMMED1_SMALL)); /* Always clears ZF. */ + + a.ret(); + + a.bind(error); + { + /* Signal error by setting ZF. 
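
The fragment folds all of its validity checks into the byte-count register. In portable form, the conditions it enforces are these (a sketch; the minimum-value table follows from the overlong analysis in the comments above):

    #include <cstdint>

    // Illustrative: a decoded code point is valid for an n-byte sequence
    // (n in 2..4) iff it is in range, not a UTF-16 surrogate, and not an
    // overlong encoding of a shorter form.
    bool valid_utf8_decode(uint32_t cp, unsigned n) {
        static const uint32_t min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
        if (cp > 0x10FFFF) return false;                     /* beyond Unicode  */
        if ((cp & ~UINT32_C(0x7FF)) == 0xD800) return false; /* surrogate range */
        return cp >= min_cp[n];                              /* reject overlong */
    }
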
*/ + a.xor_(RET, RET); + a.ret(); + } +} + void BeamModuleAssembler::emit_bs_get_utf8(const ArgRegister &Ctx, const ArgLabel &Fail) { - mov_arg(ARG1, Ctx); + const int base_offset = offsetof(ErlBinMatchBuffer, base); + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + const int size_offset = offsetof(ErlBinMatchBuffer, size); + + const x86::Gp ctx = ARG1; + const x86::Gp bin_position = ARG2; + const x86::Gp bin_base = ARG3; + + Label multi_byte = a.newLabel(), fallback = a.newLabel(), + check = a.newLabel(), done = a.newLabel(); + + mov_arg(ctx, Ctx); + a.lea(ctx, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); + + a.mov(bin_position, x86::qword_ptr(ctx, position_offset)); + a.mov(RET, x86::qword_ptr(ctx, size_offset)); + a.mov(bin_base, x86::qword_ptr(ctx, base_offset)); + a.sub(RET, bin_position); + a.cmp(RET, imm(32)); + a.short_().jb(fallback); + + a.test(bin_position, imm(7)); + a.short_().jnz(fallback); + + /* We're byte-aligned and can read at least 32 bits. */ + a.mov(RET, bin_position); + a.shr(RET, 3); + + /* The most significant bits come first, so we'll read the the next four + * bytes as big-endian so we won't have to reorder them later. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, x86::dword_ptr(bin_base, RET)); + } else { + a.mov(RETd, x86::dword_ptr(bin_base, RET)); + a.bswap(RETd); + } + a.test(RETd, RETd); + a.short_().js(multi_byte); - emit_enter_runtime(); + /* Handle plain old ASCII (code point < 128). */ + a.add(x86::qword_ptr(ctx, position_offset), imm(8)); + a.shr(RETd, imm(24 - _TAG_IMMED1_SIZE)); + a.or_(RETd, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(done); - a.lea(ARG1, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); - runtime_call<1>(erts_bs_get_utf8); + a.bind(multi_byte); - emit_leave_runtime(); + if (hasCpuFeature(CpuFeatures::X86::kBMI2)) { + /* This CPU supports the PEXT and SHRX instructions. */ + safe_fragment_call(ga->get_bs_get_utf8_shared()); + a.short_().jmp(check); + } - emit_test_the_non_value(RET); + /* Take care of unaligned binaries and binaries with less than 32 + * bits left. */ + a.bind(fallback); + if (hasCpuFeature(CpuFeatures::X86::kBMI2)) { + /* This CPU supports the PEXT and SHRX instructions. 
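
The ASCII fast path earlier in this function tags its result inline: after the big-endian load the code point sits in bits 31..24, and a small is the value shifted left by the tag size with the small tag in the low bits. A sketch, assuming the 4-bit immediate tag and a small tag of 0xF as on x86-64:

    #include <cstdint>

    // Illustrative: make_small(cp) for the one-byte case, starting from the
    // big-endian 32-bit word whose top byte is the ASCII character.
    uint32_t tag_ascii_small(uint32_t be_word) {
        return (be_word >> (24 - 4)) | 0xF; /* 0xF: assumed _TAG_IMMED1_SMALL */
    }
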
*/ + safe_fragment_call(ga->get_bs_get_utf8_short_shared()); + } else { + emit_enter_runtime(); + + runtime_call<1>(erts_bs_get_utf8); + + emit_leave_runtime(); + + emit_test_the_non_value(RET); + } + + a.bind(check); a.je(resolve_beam_label(Fail)); + + a.bind(done); } void BeamModuleAssembler::emit_i_bs_get_utf8(const ArgRegister &Ctx, @@ -1286,8 +1484,8 @@ void BeamModuleAssembler::emit_validate_unicode(Label next, Label fail, x86::Gp value) { a.mov(ARG3d, value.r32()); - a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); - a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); + a.and_(ARG3d.r8(), imm(_TAG_IMMED1_MASK)); + a.cmp(ARG3d.r8(), imm(_TAG_IMMED1_SMALL)); a.jne(fail); a.cmp(value, imm(make_small(0xD800UL))); @@ -1485,13 +1683,13 @@ void BeamModuleAssembler::emit_i_bs_append(const ArgLabel &Fail, mov_arg(ArgXRegister(Live.get()), Bin); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); load_x_reg_array(ARG2); runtime_call<6>(erts_bs_append); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); emit_test_the_non_value(RET); @@ -1544,18 +1742,18 @@ void BeamModuleAssembler::emit_i_bs_private_append(const ArgLabel &Fail, } void BeamModuleAssembler::emit_bs_init_writable() { - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); a.mov(ARG2, getXRef(0)); runtime_call<2>(erts_bs_init_writable); a.mov(getXRef(0), RET); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); } void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { - emit_enter_runtime<Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eHeapAlloc>(); /* ARG3 is already set by the caller */ a.mov(ARG2, ARG4); @@ -1563,7 +1761,7 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.mov(ARG1, c_p); runtime_call<4>(beam_jit_bs_construct_fail_info); - emit_leave_runtime<Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eHeapAlloc>(); /* We must align the return address to make it a proper tagged CP, in case * we were called with `safe_fragment_call`. This is safe because we will @@ -1587,10 +1785,52 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.jmp(labels[raise_exception_shared]); } +/* + * ARG1 = tagged bignum term + * + * On return, Z is set if ARG1 is not a bignum. Otherwise, Z is clear and + * ARG1 is the 64 least significant bits of the bignum. + */ +void BeamGlobalAssembler::emit_get_sint64_shared() { + Label success = a.newLabel(); + Label fail = a.newLabel(); + + emit_is_boxed(fail, ARG1); + x86::Gp boxed_ptr = emit_ptr_val(ARG4, ARG1); + a.mov(ARG2, emit_boxed_val(boxed_ptr)); + a.mov(ARG3, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + a.and_(ARG2, imm(_TAG_HEADER_MASK)); + a.cmp(ARG2, imm(POS_BIG_SUBTAG)); + a.je(success); + + a.cmp(ARG2, imm(NEG_BIG_SUBTAG)); + a.jne(fail); + + a.neg(ARG3); + + a.bind(success); + { + a.mov(ARG1, ARG3); + /* Clear Z flag. + * + * ARG2 is known to be POS_BIG_SUBTAG or NEG_BIG_SUBTAG at this point. 
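
The contract of this fragment in portable form, assuming the usual bignum layout of a header word followed by 64-bit digits (a sketch only; the subtag constants are passed in rather than guessed):

    #include <cstdint>

    // Illustrative: the 64 least significant bits of a bignum, negated when
    // the subtag marks the number as negative. The fragment reads digit 0
    // unconditionally and fixes the sign afterwards, exactly as here.
    int64_t bignum_low64(uint64_t header, uint64_t digit0,
                         uint64_t neg_subtag, uint64_t header_mask) {
        bool negative = (header & header_mask) == neg_subtag;
        return negative ? -(int64_t)digit0 : (int64_t)digit0;
    }
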
+ */ + ERTS_CT_ASSERT(POS_BIG_SUBTAG != 0 && NEG_BIG_SUBTAG != 0); + a.test(ARG2, ARG2); + a.ret(); + } + + a.bind(fail); + { + a.xor_(ARG2, ARG2); /* Set Z flag */ + a.ret(); + } +} + struct BscSegment { BscSegment() : type(am_false), unit(1), flags(0), src(ArgNil()), size(ArgNil()), - error_info(0), effectiveSize(-1) { + error_info(0), effectiveSize(-1), action(action::DIRECT) { } Eterm type; @@ -1601,8 +1841,482 @@ struct BscSegment { Uint error_info; Sint effectiveSize; + + /* Here are sub actions for storing integer segments. + * + * We use the ACCUMULATE_FIRST and ACCUMULATE actions to shift the + * values of segments with known, small sizes (no more than 64 bits) + * into an accumulator register. + * + * When no more segments can be accumulated, the STORE action is + * used to store the value of the accumulator into the binary. + * + * The DIRECT action is used when it is not possible to use the + * accumulator (for unknown or too large sizes). + */ + enum class action { DIRECT, ACCUMULATE_FIRST, ACCUMULATE, STORE } action; }; +static std::vector<BscSegment> bs_combine_segments( + const std::vector<BscSegment> segments) { + std::vector<BscSegment> segs; + + for (auto seg : segments) { + switch (seg.type) { + case am_integer: { + if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) { + /* Unknown or too large size. Handle using the default + * DIRECT action. */ + segs.push_back(seg); + continue; + } + + if (seg.flags & BSF_LITTLE || segs.size() == 0 || + segs.back().action == BscSegment::action::DIRECT) { + /* There are no previous compatible ACCUMULATE / STORE + * actions. Create the first ones. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + auto prev = segs.back(); + if (prev.flags & BSF_LITTLE) { + /* Little-endian segments cannot be combined with other + * segments. Create new ACCUMULATE_FIRST / STORE actions. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + /* The current segment is compatible with the previous + * segment. Try combining them. */ + if (prev.effectiveSize + seg.effectiveSize <= 64) { + /* The combined values of the segments fits in the + * accumulator. Insert an ACCUMULATE action for the + * current segment before the pre-existing STORE + * action. */ + segs.pop_back(); + prev.effectiveSize += seg.effectiveSize; + seg.action = BscSegment::action::ACCUMULATE; + segs.push_back(seg); + segs.push_back(prev); + } else { + /* The size exceeds 64 bits. Can't combine. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + } + break; + } + default: + segs.push_back(seg); + break; + } + } + return segs; +} + +/* + * In: + * bin_offset = if valid, register to store the lower 32 bits + * of the bit offset into the binary + * bin_ptr = register to store pointer to current byte in + * bit_offset = current bit offset into binary, or -1 if unknown + * size = size of segment to be constructed + * (ignored if size_reg is valid register) + * size_reg = if a valid register, it contains the size of + * the segment to be constructed + * + * Out: + * bin_offset register = the lower 32 bits of the bit offset + * into the binary + * bin_ptr register = pointer to current byte + * + * Preserves all other registers except RET. 
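
Before moving on to update_bin_state, a worked example makes the combining pass in bs_combine_segments above concrete. For three fixed-size big-endian integer segments whose total fits in 64 bits, the rewrite is (trace is illustrative, not code from the tree):

    // <<A:16/big, B:8, C:8>>  --- all sizes known, total 32 bits <= 64
    //
    //   before: DIRECT(A,16) DIRECT(B,8) DIRECT(C,8)
    //   after:  ACCUMULATE_FIRST(A,16) ACCUMULATE(B,8) ACCUMULATE(C,8)
    //           STORE(32)
    //
    // Each ACCUMULATE shifts the accumulator left by the segment size and
    // ors the value in, so it ends up holding (A << 16) | (B << 8) | C,
    // and the STORE action writes it with a single 32-bit store.
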
+ */ +void BeamModuleAssembler::update_bin_state(x86::Gp bin_offset, + x86::Gp current_byte, + Sint bit_offset, + Sint size, + x86::Gp size_reg) { + const int x_reg_offset = offsetof(ErtsSchedulerRegisters, x_reg_array.d); + const int cur_bin_base = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_current_bin_); + const int cur_bin_offset = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_bin_offset_); + + x86::Mem mem_bin_base = + x86::Mem(registers, cur_bin_base - x_reg_offset, sizeof(UWord)); + x86::Mem mem_bin_offset = + x86::Mem(registers, cur_bin_offset - x_reg_offset, sizeof(UWord)); + + if (bit_offset % 8 != 0 || !Support::isInt32(bit_offset + size)) { + /* The bit offset is unknown or not byte-aligned. Alternatively, + * the sum of bit_offset and size does not fit in an immediate. */ + a.mov(current_byte, mem_bin_offset); + a.mov(RET, mem_bin_base); + + if (bin_offset.isValid()) { + a.mov(bin_offset.r32(), current_byte.r32()); + } + if (size_reg.isValid()) { + a.add(mem_bin_offset, size_reg); + } else { + a.add(mem_bin_offset, imm(size)); + } + a.shr(current_byte, imm(3)); + a.add(current_byte, RET); + } else { + ASSERT(size >= 0 || size_reg.isValid()); + ASSERT(bit_offset % 8 == 0); + + comment("optimized updating of binary construction state"); + a.mov(current_byte, mem_bin_base); + if (bit_offset) { + a.add(current_byte, imm(bit_offset >> 3)); + } + if (size_reg.isValid()) { + a.add(mem_bin_offset, size_reg); + } else { + a.mov(mem_bin_offset, imm(bit_offset + size)); + } + } +} + +bool BeamModuleAssembler::need_mask(const ArgVal Val, Sint size) { + if (size == 64) { + return false; + } else { + auto [min, max] = getClampedRange(Val); + return !(0 <= min && max >> size == 0); + } +} + +/* + * The size of the segment is assumed to be in ARG3. + */ +void BeamModuleAssembler::set_zero(Sint effectiveSize) { + update_bin_state(ARG2, ARG1, -1, -1, ARG3); + + mov_imm(RET, 0); + + if (effectiveSize < 0 || effectiveSize > 128) { + /* Size is unknown or greater than 128. Modern CPUs have an + * enhanced "rep stosb" instruction that in most circumstances + * is the fastest way to clear blocks of more than 128 + * bytes. */ + Label done = a.newLabel(); + + if (effectiveSize < 0) { + a.test(ARG3, ARG3); + a.short_().jz(done); + } + + if (ARG1 != x86::rdi) { + a.mov(x86::rdi, ARG1); + } + a.mov(x86::rcx, ARG3); + a.add(x86::rcx, imm(7)); + a.shr(x86::rcx, imm(3)); + a.rep().stosb(); + + a.bind(done); + } else { + /* The size is known and it is at most 128 bits. */ + Uint offset = 0; + + ASSERT(0 <= effectiveSize && effectiveSize <= 128); + + if (effectiveSize == 128) { + a.mov(x86::Mem(ARG1, offset, 8), RET); + offset += 8; + } + + if (effectiveSize >= 64) { + a.mov(x86::Mem(ARG1, offset, 8), RET); + offset += 8; + } + + if ((effectiveSize & 63) >= 32) { + a.mov(x86::Mem(ARG1, offset, 4), RETd); + offset += 4; + } + + if ((effectiveSize & 31) >= 16) { + a.mov(x86::Mem(ARG1, offset, 2), RET.r16()); + offset += 2; + } + + if ((effectiveSize & 15) >= 8) { + a.mov(x86::Mem(ARG1, offset, 1), RET.r8()); + offset += 1; + } + + if ((effectiveSize & 7) > 0) { + a.mov(x86::Mem(ARG1, offset, 1), RET.r8()); + } + } +} + +/* + * In: + * + * ARG3 = valid unicode code point (=> 0x80) to encode + * + * Out: + * + * ARG3d = the code point encoded in UTF-8. + * ARG2 = number of bits of result (16, 24, or 32) + * + * Clobbers RET and the other ARG* registers. 
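
Note that this fragment produces the encoded bytes in memory (store) order, with the first UTF-8 byte in the least significant position. A portable sketch of the two-byte case it computes; as a check, U+00E9 encodes to 0xA9C3, which a little-endian 16-bit store writes as the bytes C3 A9:

    #include <cstdint>

    // Illustrative: two-byte UTF-8 (0x80 <= cp < 0x800), assembled so that
    // a plain little-endian store writes the bytes in the right order.
    uint32_t encode_utf8_2(uint32_t cp) {
        return 0x80C0u | ((cp & 0x3F) << 8) | (cp >> 6);
    }
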
+ */ +void BeamGlobalAssembler::emit_construct_utf8_shared() { + Label more_than_two_bytes = a.newLabel(); + Label four_bytes = a.newLabel(); + const x86::Gp tmp1 = ARG1; + const x86::Gp tmp2 = ARG2; + const x86::Gp value = ARG3; + const x86::Gp num_bits = ARG2; + + a.mov(RETd, value.r32()); + a.and_(RETd, imm(0x3f)); + + a.cmp(value.r32(), imm(0x800)); + a.jae(more_than_two_bytes); + + a.shl(RETd, imm(8)); + + a.shr(value, imm(6)); + + a.or_(value.r32(), RETd); + a.or_(value.r32(), imm(0x80c0)); + + mov_imm(num_bits, 16); + a.ret(); + + /* Test whether the value should be encoded in four bytes. */ + a.bind(more_than_two_bytes); + a.cmp(value.r32(), imm(0x10000)); + a.jae(four_bytes); + + /* Encode Unicode code point in three bytes. */ + a.shl(RETd, imm(16)); + + a.lea(tmp1.r32(), x86::Mem(0ULL, ARG3, 2, 0)); + a.and_(tmp1.r32(), imm(0x3f00)); + + a.shr(value.r32(), imm(12)); + a.or_(value.r32(), tmp1.r32()); + a.or_(value.r32(), RETd); + a.or_(value.r32(), imm(0x8080e0)); + + mov_imm(num_bits, 24); + a.ret(); + + /* Encode Unicode code point in four bytes. */ + a.bind(four_bytes); + a.shl(RETd, imm(24)); + + a.mov(tmp1.r32(), value.r32()); + a.shl(tmp1.r32(), imm(10)); + a.and_(tmp1.r32(), imm(0x3f0000)); + + a.mov(tmp2.r32(), value.r32()); + a.shr(tmp2.r32(), imm(4)); + a.and_(tmp2.r32(), imm(0x3f00)); + + a.shr(value.r32(), imm(18)); + + a.or_(value.r32(), RETd); + a.or_(value.r32(), tmp1.r32()); + a.or_(value.r32(), tmp2.r32()); + a.or_(value.r32(), imm(0xffffffff808080f0)); + + mov_imm(num_bits, 32); + a.ret(); +} + +void BeamModuleAssembler::emit_construct_utf8(const ArgVal &Src, + Sint bit_offset, + bool is_byte_aligned) { + Label prepare_store = a.newLabel(); + Label store = a.newLabel(); + Label next = a.newLabel(); + +#ifdef WIN32 + const x86::Gp bin_ptr = ARG4; + const x86::Gp bin_offset = is_byte_aligned ? x86::Gp() : ARG1; +#else + const x86::Gp bin_ptr = ARG1; + const x86::Gp bin_offset = is_byte_aligned ? x86::Gp() : ARG4; +#endif + ASSERT(!bin_offset.isValid() || bin_offset == x86::rcx); + + /* The following two registers must be the same as + * emit_construct_utf8_shared() expects. */ + const x86::Gp code_point = ARG3; + const x86::Gp size_reg = ARG2; + + comment("construct utf8 segment"); + + mov_arg(code_point, Src); + a.shr(code_point.r32(), imm(_TAG_IMMED1_SIZE)); + mov_imm(size_reg, 8); + a.cmp(code_point, imm(0x80)); + a.jb(prepare_store); + + safe_fragment_call(ga->get_construct_utf8_shared()); + + a.bind(prepare_store); + + update_bin_state(bin_offset, bin_ptr, bit_offset, -1, size_reg); + + if (!is_byte_aligned) { + /* Bit offset is unknown and is not known to be + * byte aligned. Must test alignment. */ + a.and_(bin_offset.r32(), imm(7)); + a.je(store); + + /* We must combine the last partial byte with the UTF-8 + * encoded code point. */ + + a.movzx(RETd, x86::byte_ptr(bin_ptr)); + + a.bswap(code_point); + a.shr(code_point, bin_offset.r8()); + a.bswap(code_point); + + a.shl(RETd, bin_offset.r8()); + a.and_(RETd, imm(~0xff)); + a.shr(RETd, bin_offset.r8()); + + a.or_(code_point, RET); + + a.add(size_reg.r32(), imm(8)); + } + + a.bind(store); + if (bit_offset % (4 * 8) == 0) { + /* This segment is aligned on a 4-byte boundary. This implies + * that a 4-byte write will be inside the allocated binary. */ + a.mov(x86::dword_ptr(bin_ptr), code_point.r32()); + } else { + Label do_store_1 = a.newLabel(); + Label do_store_2 = a.newLabel(); + + /* Unsuitable or unknown alignment. We must be careful not + * to write beyound the allocated end of the binary. 
*/ + a.cmp(size_reg.r8(), imm(8)); + a.short_().jne(do_store_1); + + a.mov(x86::byte_ptr(bin_ptr), code_point.r8()); + a.short_().jmp(next); + + a.bind(do_store_1); + a.cmp(size_reg.r8(), imm(24)); + a.ja(do_store_2); + + a.mov(x86::word_ptr(bin_ptr), code_point.r16()); + a.cmp(size_reg.r8(), imm(16)); + a.short_().je(next); + + a.shr(code_point.r32(), imm(16)); + a.mov(x86::byte_ptr(bin_ptr, 2), code_point.r8()); + a.short_().jmp(next); + + a.bind(do_store_2); + a.mov(x86::dword_ptr(bin_ptr), code_point.r32()); + + if (!is_byte_aligned) { + a.cmp(size_reg.r8(), imm(32)); + a.je(next); + + a.shr(code_point, imm(32)); + a.mov(x86::byte_ptr(bin_ptr, 4), code_point.r8()); + } + } + + a.bind(next); +} +/* + * In: + * ARG1 = pointer to current byte + * ARG3 = bit offset + * ARG4 = number of bits to write + * ARG5 = data to write + */ +void BeamGlobalAssembler::emit_store_unaligned() { + Label loop = a.newLabel(); + Label done = a.newLabel(); + const x86::Gp bin_ptr = ARG1; + const x86::Gp left_bit_offset = ARG3; + const x86::Gp right_bit_offset = ARG2; + const x86::Gp num_bits = ARG4; + const x86::Gp bitdata = ARG5; + + a.movzx(RETd, x86::byte_ptr(bin_ptr)); + + a.xchg(left_bit_offset, x86::rcx); + + a.mov(right_bit_offset, bitdata); + a.and_(right_bit_offset, imm(0xff)); + a.shr(right_bit_offset, x86::cl); + + a.shl(RETd, x86::cl); + a.and_(RETd, imm(~0xff)); + a.shr(RETd, x86::cl); + + a.xchg(left_bit_offset, x86::rcx); + + a.or_(RETd, ARG2d); + a.mov(byte_ptr(ARG1), RETb); + a.add(ARG1, imm(1)); + + mov_imm(right_bit_offset, 8); + a.sub(right_bit_offset, left_bit_offset); + + a.xchg(right_bit_offset, x86::rcx); + a.bswap(bitdata); + a.shl(bitdata, x86::cl); + a.xchg(right_bit_offset, x86::rcx); + + a.sub(ARG4, right_bit_offset); + a.jle(done); + + a.bind(loop); + a.rol(bitdata, imm(8)); + a.mov(byte_ptr(ARG1), bitdata.r8()); + a.add(ARG1, imm(1)); + a.sub(num_bits, imm(8)); + a.jg(loop); + + a.bind(done); + a.ret(); +} + +bool BeamModuleAssembler::bs_maybe_enter_runtime(bool entered) { + if (!entered) { + comment("enter runtime"); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); + } + return true; +} + +void BeamModuleAssembler::bs_maybe_leave_runtime(bool entered) { + if (entered) { + comment("leave runtime"); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); + } +} + void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, const ArgWord &Alloc, const ArgWord &Live0, @@ -1611,10 +2325,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, Uint num_bits = 0; std::size_t n = args.size(); std::vector<BscSegment> segments; - Label error = a.newLabel(); - Label past_error = a.newLabel(); + Label error; /* Intentionally uninitialized */ ArgWord Live = Live0; x86::Gp sizeReg; + Sint allocated_size = -1; + bool need_error_handler = false; + bool runtime_entered = false; /* * Collect information about each segment and calculate sizes of @@ -1660,12 +2376,45 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, seg.error_info = beam_jit_set_bsc_segment_op(bsc_segment, bsc_op); /* + * Test whether we can omit the code for the error handler. 
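
Back in the emit_store_unaligned fragment above, the first byte is a read-modify-write: the bits already written to the binary are kept and the top of the new data is spliced in below them. A portable sketch, where off is the bit offset within the byte (1..7):

    #include <cstdint>

    // Illustrative: preserve the `off` most significant bits already in the
    // binary and fill the rest from the first byte of the new data.
    unsigned char merge_first_byte(unsigned char in_bin,
                                   unsigned char first_data_byte,
                                   unsigned off) {
        unsigned char keep = (unsigned char)(0xFFu << (8 - off));
        return (unsigned char)((in_bin & keep) | (first_data_byte >> off));
    }
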
+ */ + switch (seg.type) { + case am_append: + if (!(exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_binary: + if (!(seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && + exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_integer: + if (!exact_type<BeamTypeId::Integer>(seg.src)) { + need_error_handler = true; + } + break; + case am_private_append: + case am_string: + break; + default: + need_error_handler = true; + break; + } + + /* * As soon as we have entered runtime mode, Y registers can no * longer be accessed in the usual way. Therefore, if the source - * and/or size are in Y register, copy them to X registers. + * and/or size are in Y registers, copy them to X registers. Be + * careful to preserve any associated type information. */ if (seg.src.isYRegister()) { - ArgVal reg = ArgXRegister(Live.get()); + auto reg = + seg.src.as<ArgYRegister>().copy<ArgXRegister>(Live.get()); + ASSERT(reg.typeIndex() == seg.src.as<ArgYRegister>().typeIndex()); mov_arg(reg, seg.src); Live = Live + 1; @@ -1673,7 +2422,9 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } if (seg.size.isYRegister()) { - ArgVal reg = ArgXRegister(Live.get()); + auto reg = + seg.size.as<ArgYRegister>().copy<ArgXRegister>(Live.get()); + ASSERT(reg.typeIndex() == seg.size.as<ArgYRegister>().typeIndex()); mov_arg(reg, seg.size); Live = Live + 1; @@ -1694,16 +2445,64 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.effectiveSize < 0 && seg.type != am_append && seg.type != am_private_append) { sizeReg = FCALLS; + need_error_handler = true; } segments.insert(segments.end(), seg); } - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + /* + * Test whether a heap binary of fixed size will result from the + * construction. If so, allocate and construct the binary now + * before entering the runtime mode. + */ + if (!sizeReg.isValid() && num_bits % 8 == 0 && + num_bits / 8 <= ERL_ONHEAP_BIN_LIMIT && segments[0].type != am_append && + segments[0].type != am_private_append) { + const int x_reg_offset = + offsetof(ErtsSchedulerRegisters, x_reg_array.d); + const int cur_bin_base = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_current_bin_); + const int cur_bin_offset = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_bin_offset_); + x86::Mem mem_bin_base = + x86::qword_ptr(registers, cur_bin_base - x_reg_offset); + x86::Mem mem_bin_offset = + x86::qword_ptr(registers, cur_bin_offset - x_reg_offset); + Uint num_bytes = num_bits / 8; + + comment("allocate heap binary"); + allocated_size = (num_bytes + 7) & (-8); + + /* Ensure that there is enough room on the heap. */ + Uint need = heap_bin_size(num_bytes) + Alloc.get(); + emit_gc_test(ArgWord(0), ArgWord(need), Live); + + /* Create the heap binary. */ + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + a.mov(TMP_MEM1q, RET); + a.mov(x86::qword_ptr(HTOP, 0), imm(header_heap_bin(num_bytes))); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), imm(num_bytes)); + + /* Initialize the erl_bin_state struct. */ + a.add(HTOP, imm(sizeof(Eterm[2]))); + a.mov(mem_bin_base, HTOP); + a.mov(mem_bin_offset, imm(0)); + + /* Update HTOP. 
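
The inline allocation above lays out a heap binary by hand. A sketch of the shape it builds, assuming the conventional two-word header on a 64-bit system (struct is illustrative, mirroring ErlHeapBin):

    #include <cstdint>

    // Illustrative mirror of the words written above: header, byte size,
    // then the data area rounded up to whole 8-byte words (allocated_size),
    // after which HTOP is advanced past the object.
    struct HeapBinSketch {
        uint64_t thing_word;   /* header_heap_bin(num_bytes) */
        uint64_t size;         /* number of data bytes */
        unsigned char data[8]; /* really (num_bytes + 7) & -8 bytes */
    };
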
*/ + a.add(HTOP, imm(allocated_size)); + } + + if (!need_error_handler) { + comment("(cannot fail)"); + } else { + Label past_error = a.newLabel(); + + runtime_entered = bs_maybe_enter_runtime(false); + a.short_().jmp(past_error); - a.short_().jmp(past_error); - a.bind(error); - { /* * ARG1 = optional bad size value; valid if BSC_VALUE_ARG1 is set in * ARG4 @@ -1713,17 +2512,18 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, * * ARG4 = packed error information */ + error = a.newLabel(); + a.bind(error); + bs_maybe_leave_runtime(runtime_entered); comment("handle error"); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); if (Fail.get() != 0) { a.jmp(resolve_beam_label(Fail)); } else { safe_fragment_call(ga->get_bs_create_bin_error_shared()); } - } - a.bind(past_error); + a.bind(past_error); + } /* We count the total number of bits in an unsigned integer. To * avoid having to check for overflow when adding to the counter, @@ -1748,12 +2548,50 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && seg.type == am_binary) { comment("size of an entire binary"); + runtime_entered = bs_maybe_enter_runtime(runtime_entered); mov_arg(ARG1, seg.src); - runtime_call<1>(beam_jit_bs_bit_size); - if (exact_type(seg.src, BEAM_TYPE_BITSTRING)) { - comment("skipped check for success since the source " - "is always a bit string"); + + if (exact_type<BeamTypeId::Bitstring>(seg.src)) { + auto unit = getSizeUnit(seg.src); + bool is_bitstring = unit == 0 || std::gcd(unit, 8) != 8; + x86::Gp boxed_ptr = emit_ptr_val(ARG1, ARG1); + + if (is_bitstring) { + comment("inlined size code because the value is always " + "a bitstring"); + } else { + comment("inlined size code because the value is always " + "a binary"); + } + + a.mov(ARG2, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + + if (is_bitstring) { + a.mov(RETd, emit_boxed_val(boxed_ptr, 0, sizeof(Uint32))); + } + + a.lea(sizeReg, x86::Mem(sizeReg, ARG2, 3, 0, 1)); + + if (is_bitstring) { + Label not_sub_bin = a.newLabel(); + const auto diff_mask = + _TAG_HEADER_SUB_BIN - _TAG_HEADER_REFC_BIN; + ERTS_CT_ASSERT((_TAG_HEADER_SUB_BIN & diff_mask) != 0 && + (_TAG_HEADER_REFC_BIN & diff_mask) == 0 && + (_TAG_HEADER_HEAP_BIN & diff_mask) == 0); + a.test(RETb, imm(diff_mask)); + a.short_().jz(not_sub_bin); + + a.movzx(RETd, + emit_boxed_val(boxed_ptr, + offsetof(ErlSubBin, bitsize), + 1)); + a.add(sizeReg, RET); + + a.bind(not_sub_bin); + } } else { + runtime_call<1>(beam_jit_bs_bit_size); if (Fail.get() == 0) { mov_arg(ARG1, seg.src); mov_imm(ARG4, @@ -1764,17 +2602,15 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } a.test(RET, RET); a.js(error); + a.add(sizeReg, RET); } - a.add(sizeReg, RET); } else if (seg.unit != 0) { bool can_fail = true; comment("size binary/integer/float/string"); - if (always_small(seg.size)) { - auto min = std::get<0>(getIntRange(seg.size)); - if (min >= 0) { - can_fail = false; - } + if (std::get<0>(getClampedRange(seg.size)) >= 0) { + /* Can't fail if size is always positive. 
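
The inlined size code above rests on a simple identity: a bitstring's size in bits is eight times its byte size plus, for a sub binary only, the trailing bitsize field. In portable form (a sketch; field meanings as used via offsetof above):

    #include <cstdint>

    // Illustrative: bit size of a bitstring given its byte size and, for
    // sub binaries, the extra 0..7 trailing bits (zero for refc/heap bins).
    uint64_t bitstring_bit_size(uint64_t byte_size, unsigned trailing_bits) {
        return byte_size * 8 + trailing_bits;
    }
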
*/ + can_fail = false; } if (can_fail && Fail.get() == 0) { @@ -1789,10 +2625,9 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (always_small(seg.size)) { comment("skipped test for small size since it is always small"); - } else if (always_one_of(seg.size, - BEAM_TYPE_FLOAT | BEAM_TYPE_INTEGER)) { + } else if (always_one_of<BeamTypeId::Number>(seg.size)) { comment("simplified test for small size since it is a number"); - a.test(ARG1d, imm(TAG_PRIMARY_LIST)); + a.test(ARG1.r8(), imm(TAG_PRIMARY_LIST)); a.je(error); } else { a.mov(RETd, ARG1d); @@ -1827,23 +2662,59 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, comment("size utf8"); mov_arg(ARG1, seg.src); + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG1)); + } + + if (always_small(seg.src)) { + comment("skipped test for small value since it is always " + "small"); + } else if (always_one_of<BeamTypeId::Integer, + BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified test for small operand since other " + "types are boxed"); + emit_is_not_boxed(error, ARG1); + } else { + a.mov(RETd, ARG1d); + a.and_(RETb, imm(_TAG_IMMED1_MASK)); + a.cmp(RETb, imm(_TAG_IMMED1_SMALL)); + a.jne(error); + } + mov_imm(RET, 0); - a.mov(RETb, imm(1 * 8)); + a.mov(RETb, imm(1)); a.cmp(ARG1, imm(make_small(0x80UL))); - a.short_().jl(next); + a.short_().jb(next); - a.mov(RETb, imm(2 * 8)); + a.mov(RETb, imm(2)); a.cmp(ARG1, imm(make_small(0x800UL))); - a.short_().jl(next); + a.short_().jb(next); - a.mov(RETb, imm(3 * 8)); - a.cmp(ARG1, imm(make_small(0x10000UL))); - a.short_().jl(next); + /* Ensure that the value is not in the invalid range + * 0xD800 through 0xDFFF. */ + a.mov(ARG2, ARG1); + a.sar(ARG2, imm(11 + _TAG_IMMED1_SIZE)); + a.cmp(ARG2, imm(0x1b)); + a.je(error); - a.mov(RETb, imm(4 * 8)); + a.cmp(ARG1, imm(make_small(0x10000UL))); + a.setae(RETb); + a.add(RETb, imm(3)); + + auto [min, max] = getClampedRange(seg.src); + if (0 <= min && max < 0x110000) { + comment("skipped range check for unicode code point"); + } else { + a.cmp(ARG1, imm(make_small(0x110000))); + a.jae(error); + } a.bind(next); - a.add(sizeReg, RET); + a.lea(sizeReg, x86::Mem(sizeReg, RET, 3, 0, 1)); break; } case am_utf16: { @@ -1891,9 +2762,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } } + segments = bs_combine_segments(segments); + /* Allocate the binary. */ if (segments[0].type == am_append) { BscSegment seg = segments[0]; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("append to binary"); mov_arg(ARG3, Live); if (sizeReg.isValid()) { @@ -1907,18 +2781,28 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(ARG1, c_p); load_x_reg_array(ARG2); runtime_call<6>(erts_bs_append_checked); - if (Fail.get() == 0) { - mov_arg(ARG1, ArgXRegister(Live.get())); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_FVALUE, - BSC_VALUE_ARG1)); + + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) { + /* There is no way the call can fail with a system_limit + * exception on a 64-bit architecture. 
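
The utf8 size pass above computes the encoded length straight from the tagged small, rejecting surrogates on the way (cp >> 11 == 0x1B covers exactly 0xD800..0xDFFF). A portable equivalent of the length computation (sketch):

    #include <cstdint>

    // Illustrative: UTF-8 encoded length in bytes of a valid code point;
    // the emitted code additionally rejects surrogates and cp >= 0x110000
    // unless the type range makes that impossible.
    unsigned utf8_encoded_bytes(uint32_t cp) {
        if (cp < 0x80) return 1;
        if (cp < 0x800) return 2;
        return cp < 0x10000 ? 3 : 4;
    }
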
*/ + comment("skipped test for success because units are compatible"); + } else { + if (Fail.get() == 0) { + mov_arg(ARG1, ArgXRegister(Live.get())); + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_FVALUE, + BSC_VALUE_ARG1)); + } + emit_test_the_non_value(RET); + a.je(error); } - emit_test_the_non_value(RET); - a.je(error); + a.mov(TMP_MEM1q, RET); } else if (segments[0].type == am_private_append) { BscSegment seg = segments[0]; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("private append to binary"); ASSERT(Alloc.get() == 0); mov_arg(ARG2, seg.src); @@ -1931,38 +2815,53 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(ARG1, c_p); runtime_call<4>(erts_bs_private_append_checked); /* There is no way the call can fail on a 64-bit architecture. */ + a.mov(TMP_MEM1q, RET); + } else if (allocated_size >= 0) { + /* The binary has already been allocated. */ } else { comment("allocate binary"); + runtime_entered = bs_maybe_enter_runtime(runtime_entered); mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); load_erl_bits_state(ARG3); load_x_reg_array(ARG2); a.mov(ARG1, c_p); if (sizeReg.isValid()) { - comment("(size in bits)"); a.mov(ARG4, sizeReg); runtime_call<6>(beam_jit_bs_init_bits); - } else if (num_bits % 8 == 0) { - comment("(size in bytes)"); - mov_imm(ARG4, num_bits / 8); - runtime_call<6>(beam_jit_bs_init); } else { + allocated_size = (num_bits + 7) / 8; + if (allocated_size <= ERL_ONHEAP_BIN_LIMIT) { + allocated_size = (allocated_size + 7) & (-8); + } mov_imm(ARG4, num_bits); runtime_call<6>(beam_jit_bs_init_bits); } + a.mov(TMP_MEM1q, RET); } - a.mov(TMP_MEM1q, RET); + + /* Keep track of the bit offset from the being of the binary. + * Set to -1 if offset is not known (when a segment of unknown + * size has been seen). */ + Sint bit_offset = 0; + + /* Keep track of whether the current segment is byte-aligned. (A + * segment can be known to be byte-aligned even if the bit offset + * is unknown.) */ + bool is_byte_aligned = true; /* Build each segment of the binary. */ for (auto seg : segments) { switch (seg.type) { case am_append: case am_private_append: + bit_offset = -1; break; case am_binary: { Uint error_info; bool can_fail = true; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("construct a binary segment"); if (seg.effectiveSize >= 0) { /* The segment has a literal size. */ @@ -1986,8 +2885,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, BSC_REASON_BADARG, BSC_INFO_UNIT, BSC_VALUE_FVALUE); - if (seg.unit == 1) { - comment("skipped test for success because unit =:= 1"); + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) { + comment("skipped test for success because units are " + "compatible"); can_fail = false; } } else { @@ -2021,6 +2922,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, break; } case am_float: + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("construct float segment"); if (seg.effectiveSize >= 0) { mov_imm(ARG3, seg.effectiveSize); @@ -2049,42 +2951,292 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.jne(error); break; case am_integer: - comment("construct integer segment"); - if (seg.effectiveSize >= 0) { - mov_imm(ARG3, seg.effectiveSize); - } else { - mov_arg(ARG3, seg.size); - a.sar(ARG3, imm(_TAG_IMMED1_SIZE)); - if (seg.unit != 1) { - mov_imm(RET, seg.unit); - a.mul(ARG3); /* CLOBBERS RDX = ARG3! 
*/ - a.mov(ARG3, RET); + switch (seg.action) { + case BscSegment::action::ACCUMULATE_FIRST: + case BscSegment::action::ACCUMULATE: { + /* Shift an integer of known size (no more than 64 bits) + * into a word-size accumulator. */ + Label accumulate = a.newLabel(); + Label value_is_small = a.newLabel(); + x86::Gp tmp = ARG4; + x86::Gp bin_data = ARG5; + + comment("accumulate value for integer segment"); + if (seg.action == BscSegment::action::ACCUMULATE_FIRST) { + mov_imm(bin_data, 0); + } else if (seg.effectiveSize < 64) { + a.shl(bin_data, imm(seg.effectiveSize)); + } + mov_arg(ARG1, seg.src); + + if (!always_small(seg.src)) { + if (always_one_of<BeamTypeId::Integer, + BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified small test since all other types " + "are boxed"); + emit_is_boxed(value_is_small, seg.src, ARG1); + } else { + a.mov(ARG2d, ARG1d); + a.and_(ARG2d, imm(_TAG_IMMED1_MASK)); + a.cmp(ARG2d, imm(_TAG_IMMED1_SMALL)); + a.short_().je(value_is_small); + } + + /* The value is boxed. If it is a bignum, extract the + * least significant 64 bits. */ + safe_fragment_call(ga->get_get_sint64_shared()); + if (exact_type<BeamTypeId::Integer>(seg.src)) { + a.short_().jmp(accumulate); + } else { + a.short_().jne(accumulate); + + /* Not a bignum. Signal error. */ + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info( + seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG1)); + } + a.jmp(error); + } } + + a.bind(value_is_small); + a.sar(ARG1, imm(_TAG_IMMED1_SIZE)); + + /* Mask (if needed) and accumulate. */ + a.bind(accumulate); + if (seg.effectiveSize == 64) { + a.mov(bin_data, ARG1); + } else if (!need_mask(seg.src, seg.effectiveSize)) { + comment("skipped masking because the value always fits"); + a.or_(bin_data, ARG1); + } else if (seg.effectiveSize == 32) { + a.mov(ARG1d, ARG1d); + a.or_(bin_data, ARG1); + } else if (seg.effectiveSize < 32) { + a.and_(ARG1, (1ULL << seg.effectiveSize) - 1); + a.or_(bin_data, ARG1); + } else { + mov_imm(tmp, (1ULL << seg.effectiveSize) - 1); + a.and_(ARG1, tmp); + a.or_(bin_data, ARG1); + } + break; } - mov_arg(ARG2, seg.src); - mov_imm(ARG4, seg.flags); - load_erl_bits_state(ARG1); - runtime_call<4>(erts_new_bs_put_integer); - if (exact_type(seg.src, BEAM_TYPE_INTEGER)) { - comment("skipped test for success because construction can't " - "fail"); - } else { - if (Fail.get() == 0) { - mov_arg(ARG1, seg.src); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_TYPE, - BSC_VALUE_ARG1)); + case BscSegment::action::STORE: { + /* The accumulator is now full or the next segment is + * not possible to accumulate, so it's time to store + * the accumulator to the current position in the + * binary. */ + Label store = a.newLabel(); + Label done = a.newLabel(); + x86::Gp bin_ptr = ARG1; + x86::Gp bin_offset = ARG3; + x86::Gp tmp = ARG4; + x86::Gp bin_data = ARG5; + + comment("construct integer segment from accumulator"); + + /* First we'll need to ensure that the value in the + * accumulator is in little endian format. 
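+ * For example, a 16-bit big-endian segment holding 0x1234
+ * is byte-swapped to 0x3412 (bswap followed by a right
+ * shift in the byte-sized case below), so the two-byte
+ * store writes 0x12 followed by 0x34 in memory order.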
*/ + if (seg.effectiveSize % 8 != 0) { + Uint complete_bytes = 8 * (seg.effectiveSize / 8); + Uint num_partial = seg.effectiveSize % 8; + if ((seg.flags & BSF_LITTLE) == 0) { + a.shl(bin_data, imm(64 - seg.effectiveSize)); + a.bswap(bin_data); + } else { + Sint mask = (1ll << complete_bytes) - 1; + a.mov(RET, bin_data); + a.shr(RET, imm(complete_bytes)); + a.and_(RETd, imm((1ull << num_partial) - 1)); + a.shl(RET, imm(complete_bytes + 8 - num_partial)); + if (Support::isInt32(mask)) { + a.and_(bin_data, imm(mask)); + } else { + mov_imm(tmp, mask); + a.and_(bin_data, tmp); + } + a.or_(bin_data, RET); + } + } else if ((seg.flags & BSF_LITTLE) == 0) { + switch (seg.effectiveSize) { + case 8: + break; + case 32: + a.bswap(bin_data.r32()); + break; + case 64: + a.bswap(bin_data); + break; + default: + a.bswap(bin_data); + a.shr(bin_data, imm(64 - seg.effectiveSize)); + break; + } } - a.test(RETd, RETd); - a.je(error); + + update_bin_state(bin_offset, + bin_ptr, + bit_offset, + seg.effectiveSize, + x86::Gp()); + + if (!is_byte_aligned) { + if (bit_offset < 0) { + /* Bit offset is unknown. Must test alignment. */ + a.and_(bin_offset, imm(7)); + a.short_().je(store); + } else if (bit_offset >= 0) { + /* Alignment is known to be unaligned. */ + mov_imm(bin_offset, bit_offset & 7); + } + + /* Bit offset is tested or known to be unaligned. */ + mov_imm(ARG4, seg.effectiveSize); + safe_fragment_call(ga->get_store_unaligned()); + + if (bit_offset < 0) { + /* The bit offset is unknown, which implies + * that there exists store code that we will + * need to branch past. */ + a.short_().jmp(done); + } + } + + a.bind(store); + + if (bit_offset <= 0 || is_byte_aligned) { + /* Bit offset is tested or known to be + * byte-aligned. Emit inline code to store the + * value of the accumulator into the binary. */ + int num_bytes = (seg.effectiveSize + 7) / 8; + + /* If more than one instruction is required for + * doing the store, test whether it would be safe + * to do a single 32 or 64 bit store. */ + switch (num_bytes) { + case 3: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 32) { + comment("simplified complicated store"); + num_bytes = 4; + } + break; + case 5: + case 6: + case 7: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 64) { + comment("simplified complicated store"); + num_bytes = 8; + } + break; + } + + do { + switch (num_bytes) { + case 1: + a.mov(x86::Mem(bin_ptr, 0, 1), bin_data.r8()); + break; + case 2: + a.mov(x86::Mem(bin_ptr, 0, 2), bin_data.r16()); + break; + case 3: + a.mov(x86::Mem(bin_ptr, 0, 2), bin_data.r16()); + a.shr(bin_data, imm(16)); + a.mov(x86::Mem(bin_ptr, 2, 1), bin_data.r8()); + break; + case 4: + a.mov(x86::Mem(bin_ptr, 0, 4), bin_data.r32()); + break; + case 5: + case 6: + case 7: + a.mov(x86::Mem(bin_ptr, 0, 4), bin_data.r32()); + a.add(bin_ptr, imm(4)); + a.shr(bin_data, imm(32)); + break; + case 8: + a.mov(x86::Mem(bin_ptr, 0, 8), bin_data); + num_bytes = 0; + break; + default: + ASSERT(0); + } + num_bytes -= 4; + } while (num_bytes > 0); + } + + a.bind(done); + break; + } + case BscSegment::action::DIRECT: + /* This segment either has a size exceeding the maximum + * accumulator size of 64 bits or has a variable size. + * + * First load the effective size (size * unit) into ARG3. 
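+ * For example, a variable size expressed in units of 16
+ * bits is strength-reduced below to a left shift by 4
+ * (the number of trailing zero bits in 16), while a
+ * non-power-of-two unit such as 24 falls back to a mul.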
+ */
+ comment("construct integer segment");
+ if (seg.effectiveSize >= 0) {
+ mov_imm(ARG3, seg.effectiveSize);
+ } else {
+ mov_arg(ARG3, seg.size);
+ a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+ if (Support::isPowerOf2(seg.unit)) {
+ Uint trailing_bits = Support::ctz<Eterm>(seg.unit);
+ if (trailing_bits) {
+ a.shl(ARG3, imm(trailing_bits));
+ }
+ } else {
+ mov_imm(RET, seg.unit);
+ a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
+ a.mov(ARG3, RET);
+ }
+ }
+
+ if (is_byte_aligned && seg.src.isSmall() &&
+ seg.src.as<ArgSmall>().getSigned() == 0) {
+ /* Optimize the special case of setting a known
+ * byte-aligned segment to zero. */
+ comment("optimized setting segment to 0");
+ set_zero(seg.effectiveSize);
+ } else {
+ /* Call the helper function to fetch and store the
+ * integer into the binary. */
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
+ mov_arg(ARG2, seg.src);
+ mov_imm(ARG4, seg.flags);
+ load_erl_bits_state(ARG1);
+ runtime_call<4>(erts_new_bs_put_integer);
+ if (exact_type<BeamTypeId::Integer>(seg.src)) {
+ comment("skipped test for success because construction "
+ "can't fail");
+ } else {
+ if (Fail.get() == 0) {
+ mov_arg(ARG1, seg.src);
+ mov_imm(ARG4,
+ beam_jit_update_bsc_reason_info(
+ seg.error_info,
+ BSC_REASON_BADARG,
+ BSC_INFO_TYPE,
+ BSC_VALUE_ARG1));
+ }
+ a.test(RETd, RETd);
+ a.je(error);
+ }
+ }
+ break;
}
break;
case am_string: {
ArgBytePtr string_ptr(
ArgVal(ArgVal::BytePtr, seg.src.as<ArgWord>().get()));
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
comment("insert string");
ASSERT(seg.effectiveSize >= 0);
mov_imm(ARG3, seg.effectiveSize / 8);
@@ -2092,22 +3244,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
load_erl_bits_state(ARG1);
runtime_call<3>(erts_new_bs_put_string);
} break;
- case am_utf8:
- mov_arg(ARG2, seg.src);
- load_erl_bits_state(ARG1);
- runtime_call<2>(erts_bs_put_utf8);
- if (Fail.get() == 0) {
- mov_arg(ARG1, seg.src);
- mov_imm(ARG4,
- beam_jit_update_bsc_reason_info(seg.error_info,
- BSC_REASON_BADARG,
- BSC_INFO_TYPE,
- BSC_VALUE_ARG1));
- }
- a.test(RETd, RETd);
- a.je(error);
+ case am_utf8: {
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
+ emit_construct_utf8(seg.src, bit_offset, is_byte_aligned);
break;
+ }
case am_utf16:
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
mov_arg(ARG2, seg.src);
a.mov(ARG3, seg.flags);
load_erl_bits_state(ARG1);
@@ -2124,6 +3267,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
a.je(error);
break;
case am_utf32:
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
mov_arg(ARG2, seg.src);
mov_imm(ARG3, 4 * 8);
a.mov(ARG4, seg.flags);
@@ -2144,10 +3288,1098 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
ASSERT(0);
break;
}
+
+ /* Try to keep track of the bit offset. */
+ if (bit_offset >= 0 && (seg.action == BscSegment::action::DIRECT ||
+ seg.action == BscSegment::action::STORE)) {
+ if (seg.effectiveSize >= 0) {
+ bit_offset += seg.effectiveSize;
+ } else {
+ bit_offset = -1;
+ }
+ }
+
+ /* Try to keep track of whether the next segment is
+ * byte-aligned. 
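+ * For example, a single literal 4-bit segment leaves the
+ * offset in mid-byte, so the next segment is unaligned,
+ * while two adjacent 4-bit segments add up to a whole
+ * byte and preserve byte alignment.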
*/ + if (seg.type == am_append || seg.type == am_private_append) { + if (!exact_type<BeamTypeId::Bitstring>(seg.src) || + std::gcd(getSizeUnit(seg.src), 8) != 8) { + is_byte_aligned = false; + } + } else if (bit_offset % 8 == 0) { + is_byte_aligned = true; + } else if (seg.effectiveSize >= 0) { + if (seg.effectiveSize % 8 != 0) { + is_byte_aligned = false; + } + } else if (std::gcd(seg.unit, 8) != 8) { + is_byte_aligned = false; + } } + bs_maybe_leave_runtime(runtime_entered); comment("done"); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); a.mov(RET, TMP_MEM1q); mov_arg(Dst, RET); } + +/* + * Here follows the bs_match instruction and friends. + */ + +struct BsmSegment { + BsmSegment() + : action(action::TEST_HEAP), live(ArgNil()), size(0), unit(1), + flags(0), dst(ArgXRegister(0)){}; + + enum class action { + TEST_HEAP, + ENSURE_AT_LEAST, + ENSURE_EXACTLY, + READ, + EXTRACT_BINARY, + EXTRACT_INTEGER, + READ_INTEGER, + GET_INTEGER, + GET_BINARY, + SKIP, + DROP, + GET_TAIL, + EQ + } action; + ArgVal live; + Uint size; + Uint unit; + Uint flags; + ArgRegister dst; +}; + +void BeamModuleAssembler::emit_read_bits(Uint bits, + const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp bitdata) { + Label read_done = a.newLabel(); + auto num_partial = bits % 8; + auto num_bytes_to_read = (bits + 7) / 8; + + ASSERT(bin_offset == x86::rcx); + + a.mov(RET, bin_offset); + a.shr(RET, imm(3)); + if (num_bytes_to_read != 1) { + a.add(bin_base, RET); + } + a.and_(bin_offset.r32(), imm(7)); + + /* + * Special-case handling of reading 8 or 9 bytes. + */ + if (num_bytes_to_read == 8) { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + a.bswap(bitdata); + } + + a.shl(bitdata, bin_offset.r8()); + + a.test(x86::cl, imm(7)); + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, this + * segment always needs an additional byte. */ + a.jz(read_done); + } else if (num_partial > 1) { + /* Non-byte-sized segment. Test whether we will need an + * additional byte. */ + a.cmp(bin_offset.r32(), imm(8 - num_partial)); + a.jle(read_done); + } + + if (num_partial != 1) { + /* Read an extra byte. */ + a.movzx(RETd, x86::byte_ptr(bin_base, 8)); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + a.or_(bitdata, RET); + } + + a.bind(read_done); + + return; + } + + /* + * Handle reading of up to 7 bytes. + */ + Label handle_partial = a.newLabel(); + Label swap = a.newLabel(); + Label shift = a.newLabel(); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, this + * segment always needs an additional byte. */ + a.jnz(handle_partial); + } else if (num_partial > 1) { + /* Non-byte-sized segment. Test whether we will need an + * additional byte. */ + a.cmp(bin_offset.r32(), imm(8 - num_partial)); + a.jg(handle_partial); + } + + /* We don't need an extra byte. 
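+ * For example, a 32-bit read at a byte-aligned offset
+ * loads exactly four bytes, while the same read at bit
+ * offset 3 straddles five bytes and takes the
+ * handle_partial path instead.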
*/ + if (num_bytes_to_read == 1) { + a.movzx(bitdata.r32(), x86::byte_ptr(bin_base, RET)); + if (num_partial == 0) { + a.bswap(bitdata); + a.short_().jmp(read_done); + } else if (num_partial > 1) { + a.short_().jmp(swap); + } + } else if (num_bytes_to_read <= 4) { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + } else { + a.mov(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + a.bswap(bitdata.r32()); + } + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + a.short_().jmp(shift); + } else { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + a.bswap(bitdata); + } + ASSERT(num_bytes_to_read < 8); + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + a.short_().jmp(shift); + } + + /* We'll need an extra byte and we will need to shift. */ + a.bind(handle_partial); + if (num_partial != 1) { + if (num_bytes_to_read == 1) { + a.mov(bitdata.r16(), x86::word_ptr(bin_base, RET)); + } else { + ASSERT(num_bytes_to_read < 8); + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 7)); + a.shr(bitdata, imm(64 - 8 * (num_bytes_to_read + 1))); + } + } + + a.bind(swap); + a.bswap(bitdata); + + /* Shift the read data into the most significant bits of the + * word. */ + a.bind(shift); + a.shl(bitdata, bin_offset.r8()); + + a.bind(read_done); +} + +/* + * Read an integer and store as a term. This function only handles + * integers of certain common sizes. This is a special optimization + * when only one integer is to be extracted from a binary. + * + * Input: bin_base, bin_offset + * + * Clobbers: bin_base, bin_offset, tmp, RET + */ +void BeamModuleAssembler::emit_read_integer(const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + Label handle_unaligned = a.newLabel(); + Label store = a.newLabel(); + x86::Mem address; + + a.mov(tmp, bin_offset); + a.shr(tmp, imm(3)); + a.and_(bin_offset.r32(), imm(7)); + + switch (bits) { + case 8: + address = x86::Mem(bin_base, tmp, 0, 0, 1); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.mov(x86::al, x86::ah); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RETb); + } else { + a.movsx(RET, RETb); + } + break; + case 16: + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if ((flags & BSF_LITTLE) != 0) { + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + } else { + /* Big-endian segment. 
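+ * The two bytes are loaded with a byte swap (movbe, or
+ * mov followed by exchanging al and ah), so the memory
+ * bytes 0x12,0x34 yield the value 0x1234.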
*/ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsx(RET, RET.r16()); + } else { + a.movzx(RET, RET.r16()); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -1, 4); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RET.r16()); + } else { + a.movsx(RET, RET.r16()); + } + break; + case 32: + address = x86::Mem(bin_base, tmp, 0, 0, 4); + if ((flags & BSF_LITTLE) != 0) { + /* Little-endian segment. */ + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, address); + } else { + a.movsxd(RET, address); + } + } else { + /* Big-endian segment. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsxd(RET, RETd); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -3, 8); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET, address); + } else { + a.mov(RET, address); + a.bswap(RET); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RET, bin_offset.r8()); + a.shr(RET, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, RETd); + } else { + a.movsxd(RET, RETd); + } + break; + default: + ASSERT(0); + break; + } + + a.bind(store); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, RET); +} + +void BeamModuleAssembler::emit_extract_integer(const x86::Gp bitdata, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + if (bits == 0) { + /* Necessary for correctness when matching a zero-size + * signed segment. + */ + mov_arg(Dst, make_small(0)); + return; + } + + Label big = a.newLabel(); + Label done = a.newLabel(); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + if (bits <= 8) { + /* Endian does not matter for values that fit in a byte. */ + flags &= ~BSF_LITTLE; + } + + if ((flags & BSF_LITTLE) == 0) { + /* Big-endian segment. */ + a.mov(RET, bitdata); + } else if ((flags & BSF_LITTLE) != 0) { + /* Reverse endianness for this little-endian segment. */ + if (num_partial == 0) { + a.mov(RET, bitdata); + a.bswap(RET); + if (bits < 64) { + a.shl(RET, imm(64 - num_complete)); + } + } else { + Uint shifted_mask = ((1 << num_partial) - 1) << (8 - num_partial); + a.mov(tmp, bitdata); + a.shr(tmp, imm(64 - num_complete)); + a.bswap(tmp); + a.shr(tmp, imm(num_partial)); + + a.mov(RET, bitdata); + a.rol(RET, imm(num_complete + 8)); + a.and_(RETd, imm(shifted_mask)); + a.ror(RET, imm(8)); + a.or_(RET, tmp); + } + } + + /* Now the extracted data is in RET. */ + if (bits >= SMALL_BITS) { + /* Handle segments whose values might not fit in a small + * integer. */ + Label small = a.newLabel(); + comment("test whether this integer is a small"); + if (bits < 64) { + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits)); + } else { + /* Signed segment. 
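+ * The arithmetic shift replicates the sign bit, so, for
+ * example, a 63-bit segment of all one-bits comes out as
+ * -1, which the small test below keeps as an immediate
+ * small instead of building a bignum.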
*/ + a.sar(RET, imm(64 - bits)); + } + } + a.mov(tmp, RET); + a.shr(tmp, imm(SMALL_BITS - 1)); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.jnz(big); + } else { + /* Signed segment. */ + a.jz(small); + a.cmp(tmp.r32(), imm(_TAG_IMMED1_MASK << 1 | 1)); + a.jnz(big); + } + + comment("store extracted integer as a small"); + a.bind(small); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(done); + } else { + /* This segment always fits in a small. */ + comment("store extracted integer as a small"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } else { + /* Signed segment. */ + a.sar(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == (1 << _TAG_IMMED1_SIZE) - 1); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } + + a.bind(big); + if (bits >= SMALL_BITS) { + comment("store extracted integer as a bignum"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } else { + Label negative = a.newLabel(); + Label sign_done = a.newLabel(); + + /* Signed segment. */ + a.test(RET, RET); + a.short_().jl(negative); + + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.short_().jmp(sign_done); + + a.bind(negative); + a.mov(x86::qword_ptr(HTOP), make_neg_bignum_header(1)); + a.neg(RET); + + a.bind(sign_done); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + a.add(HTOP, imm(sizeof(Eterm[2]))); + } + + a.bind(done); + mov_arg(Dst, RET); +} + +/* + * Clobbers: RET + */ +void BeamModuleAssembler::emit_extract_binary(const x86::Gp bitdata, + Uint bits, + const ArgRegister &Dst) { + Uint num_bytes = bits / 8; + + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + mov_arg(Dst, RET); + a.mov(x86::qword_ptr(HTOP), header_heap_bin(num_bytes)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), imm(num_bytes)); + a.mov(RET, bitdata); + a.bswap(RET); + if (num_bytes == 0) { + a.add(HTOP, imm(sizeof(Eterm[2]))); + } else { + a.mov(x86::qword_ptr(HTOP, 2 * sizeof(Eterm)), RET); + a.add(HTOP, imm(sizeof(Eterm[3]))); + } +} + +static std::vector<BsmSegment> opt_bsm_segments( + const std::vector<BsmSegment> segments, + const ArgWord &Need, + const ArgWord &Live) { + std::vector<BsmSegment> segs; + + Uint heap_need = Need.get(); + + /* + * First calculate the total number of heap words needed for + * bignums and binaries. 
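+ * For example, a 64-bit integer segment may need to be
+ * boxed as a bignum (BIG_NEED_FOR_BITS(64) words: a
+ * header plus one digit), and a 128-bit binary segment
+ * needs heap_bin_size(16) words for its heap binary.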
+ */ + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + if (seg.size >= SMALL_BITS) { + heap_need += BIG_NEED_FOR_BITS(seg.size); + } + break; + case BsmSegment::action::GET_BINARY: + heap_need += heap_bin_size((seg.size + 7) / 8); + break; + case BsmSegment::action::GET_TAIL: + heap_need += EXTRACT_SUB_BIN_HEAP_NEED; + break; + default: + break; + } + } + + int read_action_pos = -1; + int seg_index = 0; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + if (heap_need != 0 && seg.live.isWord()) { + BsmSegment s = seg; + + read_action_pos = -1; + s.action = BsmSegment::action::TEST_HEAP; + s.size = heap_need; + segs.push_back(s); + heap_need = 0; + seg_index++; + } + + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + case BsmSegment::action::GET_BINARY: { + bool is_common_size; + switch (seg.size) { + case 8: + case 16: + case 32: + is_common_size = true; + break; + default: + is_common_size = false; + break; + } + + if (seg.size > 64) { + read_action_pos = -1; + } else if (seg.action == BsmSegment::action::GET_BINARY && + seg.size % 8 != 0) { + read_action_pos = -1; + } else if ((seg.flags & BSF_LITTLE) != 0 && is_common_size) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else if (read_action_pos < 0 && + seg.action == BsmSegment::action::GET_INTEGER && + is_common_size && i + 1 == count) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else { + if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = seg_index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + seg_index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + seg.action = BsmSegment::action::EXTRACT_INTEGER; + break; + case BsmSegment::action::GET_BINARY: + seg.action = BsmSegment::action::EXTRACT_BINARY; + break; + default: + break; + } + } + segs.push_back(seg); + break; + } + case BsmSegment::action::EQ: { + if (read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = seg_index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + seg_index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + auto &prev = segs.back(); + if (prev.action == BsmSegment::action::EQ && + prev.size + seg.size <= 64) { + /* Coalesce with the previous EQ instruction. */ + prev.size += seg.size; + prev.unit = prev.unit << seg.size | seg.unit; + seg_index--; + } else { + segs.push_back(seg); + } + break; + } + case BsmSegment::action::SKIP: + if (read_action_pos >= 0 && + seg.size + segs.at(read_action_pos).size <= 64) { + segs.at(read_action_pos).size += seg.size; + seg.action = BsmSegment::action::DROP; + } else { + read_action_pos = -1; + } + segs.push_back(seg); + break; + default: + read_action_pos = -1; + segs.push_back(seg); + break; + } + seg_index++; + } + + /* Handle a trailing test_heap instruction (for the + * i_bs_match_test_heap instruction). 
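+ * This happens when the heap words requested through Need
+ * are not covered by any segment carrying live register
+ * information, for example a match that only skips bits.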
*/ + if (heap_need) { + BsmSegment seg; + + seg.action = BsmSegment::action::TEST_HEAP; + seg.size = heap_need; + seg.live = Live; + segs.push_back(seg); + } + return segs; +} + +UWord BeamModuleAssembler::bs_get_flags(const ArgVal &val) { + if (val.isNil()) { + return 0; + } else if (val.isLiteral()) { + Eterm term = beamfile_get_literal(beam, val.as<ArgLiteral>().get()); + UWord flags = 0; + + while (is_list(term)) { + Eterm *consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + case am_native: + flags |= BSF_LITTLE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + return flags; + } else if (val.isWord()) { + /* Originates from bs_get_integer2 instruction. */ + return val.as<ArgWord>().get(); + } else { + ASSERT(0); /* Should not happen. */ + return 0; + } +} + +void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail, + ArgRegister const &Ctx, + Span<ArgVal> const &List) { + emit_i_bs_match_test_heap(Fail, Ctx, ArgWord(0), ArgWord(0), List); +} + +void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, + ArgRegister const &Ctx, + ArgWord const &Need, + ArgWord const &Live, + Span<ArgVal> const &List) { + const int orig_offset = offsetof(ErlBinMatchState, mb.orig); + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + + std::vector<BsmSegment> segments; + + auto current = List.begin(); + auto end = List.begin() + List.size(); + + while (current < end) { + auto cmd = current++->as<ArgImmed>().get(); + BsmSegment seg; + + switch (cmd) { + case am_ensure_at_least: { + seg.action = BsmSegment::action::ENSURE_AT_LEAST; + seg.size = current[0].as<ArgWord>().get(); + seg.unit = current[1].as<ArgWord>().get(); + current += 2; + break; + } + case am_ensure_exactly: { + seg.action = BsmSegment::action::ENSURE_EXACTLY; + seg.size = current[0].as<ArgWord>().get(); + current += 1; + break; + } + case am_binary: + case am_integer: { + auto size = current[2].as<ArgWord>().get(); + auto unit = current[3].as<ArgWord>().get(); + + switch (cmd) { + case am_integer: + seg.action = BsmSegment::action::GET_INTEGER; + break; + case am_binary: + seg.action = BsmSegment::action::GET_BINARY; + break; + } + + seg.live = current[0]; + seg.size = size * unit; + seg.unit = unit; + seg.flags = bs_get_flags(current[1]); + seg.dst = current[4].as<ArgRegister>(); + current += 5; + break; + } + case am_get_tail: { + seg.action = BsmSegment::action::GET_TAIL; + seg.live = current[0].as<ArgWord>(); + seg.dst = current[2].as<ArgRegister>(); + current += 3; + break; + } + case am_skip: { + seg.action = BsmSegment::action::SKIP; + seg.size = current[0].as<ArgWord>().get(); + seg.flags = 0; + current += 1; + break; + } + case am_Eq: { + seg.action = BsmSegment::action::EQ; + seg.live = current[0]; + seg.size = current[1].as<ArgWord>().get(); + seg.unit = current[2].as<ArgWord>().get(); + current += 3; + break; + } + default: + abort(); + break; + } + segments.push_back(seg); + } + + segments = opt_bsm_segments(segments, Need, Live); + + /* Constraints: + * + * bin_position must be RCX because only CL can be used for + * a variable shift without using the SHLX instruction from BMI2. 
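+ * ARG1 is RCX in the Windows calling convention, while
+ * ARG4 is RCX in the System V convention, which is why
+ * the register assignments below differ.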
+ */ +#ifdef WIN32 + const x86::Gp bin_position = ARG1; + const x86::Gp bitdata = ARG2; + const x86::Gp bin_base = ARG3; + const x86::Gp ctx = ARG4; +#else + const x86::Gp bin_position = ARG4; + const x86::Gp bitdata = ARG3; + const x86::Gp bin_base = ARG1; + const x86::Gp ctx = ARG2; +#endif + ASSERT(bin_position == x86::rcx); + const x86::Gp tmp = ARG5; + + bool is_ctx_valid = false; + bool is_position_valid = false; + bool next_instr_clobbers = false; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + + /* Find out whether the next sub instruction clobbers + * registers or is the last. */ + next_instr_clobbers = + i == count - 1 || + (i < count - 1 && + segments[i + 1].action == BsmSegment::action::TEST_HEAP); + + switch (seg.action) { + case BsmSegment::action::ENSURE_AT_LEAST: { + auto size = seg.size; + auto unit = seg.unit; + comment("ensure_at_least %ld %ld", size, seg.unit); + mov_arg(ctx, Ctx); + if (unit == 1) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.lea(RET, qword_ptr(bin_position, size)); + a.cmp(RET, emit_boxed_val(ctx, size_offset)); + a.ja(resolve_beam_label(Fail)); + } else if (size == 0 && next_instr_clobbers) { + a.mov(RET, emit_boxed_val(ctx, size_offset)); + a.sub(RET, emit_boxed_val(ctx, position_offset)); + is_ctx_valid = is_position_valid = false; + } else { + a.mov(RET, emit_boxed_val(ctx, size_offset)); + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + cmp(RET, size, tmp); + a.jl(resolve_beam_label(Fail)); + } + + is_ctx_valid = is_position_valid = true; + + if (unit != 1) { + if (size % unit != 0) { + sub(RET, size, tmp); + } + + if ((unit & (unit - 1))) { + /* Clobbers ARG3 */ + a.cqo(); + mov_imm(tmp, unit); + a.div(tmp); + a.test(x86::rdx, x86::rdx); + is_ctx_valid = is_position_valid = false; + } else { + a.test(RETb, imm(unit - 1)); + } + a.jnz(resolve_beam_label(Fail)); + } + break; + } + case BsmSegment::action::ENSURE_EXACTLY: { + auto size = seg.size; + comment("ensure_exactly %ld", size); + + mov_arg(ctx, Ctx); + a.mov(RET, emit_boxed_val(ctx, size_offset)); + if (next_instr_clobbers) { + a.sub(RET, emit_boxed_val(ctx, position_offset)); + is_ctx_valid = is_position_valid = false; + } else { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + is_ctx_valid = is_position_valid = true; + } + if (size != 0) { + cmp(RET, size, tmp); + } + a.jne(resolve_beam_label(Fail)); + break; + } + case BsmSegment::action::EQ: { + comment("=:= %ld %ld", seg.size, seg.unit); + auto bits = seg.size; + x86::Gp extract_reg; + + if (next_instr_clobbers) { + extract_reg = bitdata; + } else { + extract_reg = RET; + a.mov(extract_reg, bitdata); + } + if (bits != 0 && bits != 64) { + a.shr(extract_reg, imm(64 - bits)); + } + + if (seg.size <= 32) { + cmp(extract_reg.r32(), seg.unit, tmp); + } else { + cmp(extract_reg, seg.unit, tmp); + } + + a.jne(resolve_beam_label(Fail)); + + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + + /* bin_position is clobbered. 
*/ + is_position_valid = false; + break; + } + case BsmSegment::action::TEST_HEAP: { + comment("test_heap %ld", seg.size); + emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::READ: { + comment("read %ld", seg.size); + if (seg.size == 0) { + comment("(nothing to do)"); + } else { + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + + emit_read_bits(seg.size, bin_base, bin_position, bitdata); + } + + is_position_valid = false; + break; + } + case BsmSegment::action::EXTRACT_BINARY: { + auto bits = seg.size; + auto Dst = seg.dst; + + comment("extract binary %ld", bits); + emit_extract_binary(bitdata, bits, Dst); + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + break; + } + case BsmSegment::action::EXTRACT_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("extract integer %ld", bits); + if (next_instr_clobbers && flags == 0 && bits < SMALL_BITS) { + a.shr(bitdata, imm(64 - bits - _TAG_IMMED1_SIZE)); + a.or_(bitdata, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, bitdata); + } else { + emit_extract_integer(bitdata, tmp, flags, bits, Dst); + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + } + + /* bin_position is clobbered. */ + is_position_valid = false; + break; + } + case BsmSegment::action::READ_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("read integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + emit_read_integer(bin_base, bin_position, tmp, flags, bits, Dst); + + is_position_valid = false; + break; + } + case BsmSegment::action::GET_INTEGER: { + Uint flags = seg.flags; + auto bits = seg.size; + auto Dst = seg.dst; + + comment("get integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + } + + a.lea(ARG4, emit_boxed_val(ctx, offsetof(ErlBinMatchState, mb))); + + if (bits >= SMALL_BITS) { + emit_enter_runtime<Update::eReductions | + Update::eHeapOnlyAlloc>(); + } else { + emit_enter_runtime(); + } + + a.mov(ARG1, c_p); + a.mov(ARG2, bits); + a.mov(ARG3, flags); + /* ARG4 set above */ + runtime_call<4>(erts_bs_get_integer_2); + + if (bits >= SMALL_BITS) { + emit_leave_runtime<Update::eReductions | + Update::eHeapOnlyAlloc>(); + } else { + emit_leave_runtime(); + } + + mov_arg(Dst, RET); + + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::GET_BINARY: { + comment("get binary %ld", seg.size); + if (is_ctx_valid) { + a.mov(RET, ctx); + } else { + mov_arg(RET, Ctx); + } + emit_enter_runtime<Update::eHeapOnlyAlloc>(); + a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); + a.mov(ARG2, emit_boxed_val(RET, orig_offset)); + a.mov(ARG3, emit_boxed_val(RET, base_offset)); + a.mov(ARG4, emit_boxed_val(RET, position_offset)); + mov_imm(ARG5, seg.size); + a.add(emit_boxed_val(RET, position_offset), ARG5); + + runtime_call<5>(erts_extract_sub_binary); + + 
emit_leave_runtime<Update::eHeapOnlyAlloc>();
+ mov_arg(seg.dst, RET);
+
+ is_ctx_valid = is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::GET_TAIL: {
+ comment("get_tail");
+ if (is_ctx_valid) {
+ a.mov(ARG1, ctx);
+ } else {
+ mov_arg(ARG1, Ctx);
+ }
+ safe_fragment_call(ga->get_bs_get_tail_shared());
+ mov_arg(seg.dst, RET);
+ is_ctx_valid = is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::SKIP: {
+ comment("skip %ld", seg.size);
+ if (!is_ctx_valid) {
+ mov_arg(ctx, Ctx);
+ is_ctx_valid = true;
+ }
+ /* The compiler limits the size of any segment in a bs_match
+ * instruction to 24 bits. */
+ ASSERT((seg.size >> 24) == 0);
+ a.add(emit_boxed_val(ctx, position_offset), imm(seg.size));
+ is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::DROP:
+ auto bits = seg.size;
+ comment("drop %ld", bits);
+ if (bits != 0 && bits != 64) {
+ a.shl(bitdata, imm(bits));
+ }
+ break;
+ }
+ }
+}