Diffstat (limited to 'erts/emulator/beam/jit/x86/beam_asm.hpp')
-rw-r--r-- | erts/emulator/beam/jit/x86/beam_asm.hpp | 720
1 file changed, 403 insertions, 317 deletions
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 66891730ef..c7f085ee62 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -52,18 +52,20 @@ extern "C"
 using namespace asmjit;
 
-class BeamAssembler : public ErrorHandler {
-protected:
-    /* Holds code and relocation information. */
-    CodeHolder code;
-
-    /* TODO: Want to change this to x86::Builder in order to be able to patch
-     * the correct I into the code after code generation */
-    x86::Assembler a;
+struct BeamAssembler : public BeamAssemblerCommon {
+    BeamAssembler() : BeamAssemblerCommon(a) {
+        Error err = code.attach(&a);
+        ERTS_ASSERT(!err && "Failed to attach codeHolder");
+    }
 
-    FileLogger logger;
+    BeamAssembler(const std::string &log) : BeamAssembler() {
+        if (erts_jit_asm_dump) {
+            setLogger(log + ".asm");
+        }
+    }
 
-    Section *rodata = nullptr;
+protected:
+    x86::Assembler a;
 
     /* * * * * * * * * */
@@ -170,27 +172,6 @@ protected:
 
     enum Distance { dShort, dLong };
 
-public:
-    static bool hasCpuFeature(uint32_t featureId);
-
-    BeamAssembler();
-    BeamAssembler(const std::string &log);
-
-    ~BeamAssembler();
-
-    void *getBaseAddress();
-    size_t getOffset();
-
-protected:
-    void _codegen(JitAllocator *allocator,
-                  const void **executable_ptr,
-                  void **writable_ptr);
-
-    void *getCode(Label label);
-    byte *getCode(char *labelName);
-
-    void handleError(Error err, const char *message, BaseEmitter *origin);
-
     constexpr x86::Mem getRuntimeStackRef() const {
         int base = offsetof(ErtsSchedulerRegisters, aux_regs.d.runtime_stack);
@@ -577,11 +558,27 @@ protected:
 #endif
     }
 
+    /* Prefer `eHeapAlloc` over `eStack | eHeap` when calling
+     * functions in the runtime system that allocate heap
+     * memory (`HAlloc`, heap factories, etc).
+     *
+     * Prefer `eHeapOnlyAlloc` over `eHeapAlloc` for functions
+     * that assume there's already a certain amount of free
+     * space on the heap, such as those using `HeapOnlyAlloc`
+     * or similar. It's slightly cheaper in release builds,
+     * and in debug builds it updates `eStack` to ensure that
+     * we can make heap size assertions. */
     enum Update : int {
         eStack = (1 << 0),
         eHeap = (1 << 1),
         eReductions = (1 << 2),
-        eCodeIndex = (1 << 3)
+        eCodeIndex = (1 << 3),
+        eHeapAlloc = Update::eHeap | Update::eStack,
+#ifndef DEBUG
+        eHeapOnlyAlloc = Update::eHeap,
+#else
+        eHeapOnlyAlloc = Update::eHeapAlloc
+#endif
     };
 
     void emit_enter_frame() {
@@ -624,9 +621,9 @@ protected:
         if (ERTS_LIKELY(erts_frame_layout == ERTS_FRAME_LAYOUT_RA)) {
             if ((Spec & (Update::eHeap | Update::eStack)) ==
                 (Update::eHeap | Update::eStack)) {
-                /* To update both heap and stack we use sse instructions like
-                 * gcc -O3 does. Basically it is this function run through
-                 * gcc -O3:
+                /* To update both heap and stack we use SSE/AVX
+                 * instructions like gcc -O3 does. Basically it is
+                 * this function run through gcc -O3:
                  *
                  * struct a { long a; long b; long c; };
                  * void test(long a, long b, long c, struct a *s) {
@@ -636,11 +633,18 @@ protected:
                  * }
                  */
                 ERTS_CT_ASSERT((offsetof(Process, stop) -
                                 offsetof(Process, htop)) == sizeof(Eterm *));
-                a.movq(x86::xmm0, HTOP);
-                a.movq(x86::xmm1, E);
-                a.punpcklqdq(x86::xmm0, x86::xmm1);
-                a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
-                         x86::xmm0);
+                if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+                    a.vmovq(x86::xmm1, HTOP);
+                    a.vpinsrq(x86::xmm0, x86::xmm1, E, 1);
+                    a.vmovdqu(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
+                              x86::xmm0);
+                } else {
+                    a.movq(x86::xmm0, HTOP);
+                    a.movq(x86::xmm1, E);
+                    a.punpcklqdq(x86::xmm0, x86::xmm1);
+                    a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
+                             x86::xmm0);
+                }
             } else if (Spec & Update::eHeap) {
                 a.mov(x86::qword_ptr(c_p, offsetof(Process, htop)), HTOP);
             } else if (Spec & Update::eStack) {
@@ -659,11 +663,18 @@ protected:
             if (Spec & Update::eStack) {
                 ERTS_CT_ASSERT((offsetof(Process, frame_pointer) -
                                 offsetof(Process, stop)) == sizeof(Eterm *));
-                a.movq(x86::xmm0, E);
-                a.movq(x86::xmm1, frame_pointer);
-                a.punpcklqdq(x86::xmm0, x86::xmm1);
-                a.movups(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
-                         x86::xmm0);
+                if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+                    a.vmovq(x86::xmm1, E);
+                    a.vpinsrq(x86::xmm0, x86::xmm1, frame_pointer, 1);
+                    a.vmovdqu(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
+                              x86::xmm0);
+                } else {
+                    a.movq(x86::xmm0, E);
+                    a.movq(x86::xmm1, frame_pointer);
+                    a.punpcklqdq(x86::xmm0, x86::xmm1);
+                    a.movups(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
+                             x86::xmm0);
+                }
             } else {
                 /* We can skip updating the frame pointer whenever the process
                  * doesn't have to inspect the stack. We still need to update
@@ -693,6 +704,14 @@ protected:
         a.sub(x86::rsp, imm(15));
         a.and_(x86::rsp, imm(-16));
 #endif
+        /* If the emulator has not been compiled with AVX support (which stops
+         * it from using legacy SSE instructions), we'll need to clear the upper
+         * bits of all AVX registers to avoid AVX/SSE transition penalties. */
+#if !defined(__AVX__)
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vzeroupper();
+        }
+#endif
     }
 
     template<int Spec = 0>
@@ -747,7 +766,7 @@ protected:
 #endif
     }
 
-    void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+    void emit_test_boxed(x86::Gp Src) {
         /* Use the shortest possible instruction depending on the source
         * register. */
        if (Src == x86::rax || Src == x86::rdi || Src == x86::rsi ||
@@ -756,6 +775,10 @@ protected:
         } else {
             a.test(Src.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
         }
+    }
+
+    void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+        emit_test_boxed(Src);
 
         if (dist == dShort) {
             a.short_().jne(Fail);
         } else {
@@ -763,6 +786,15 @@ protected:
         }
     }
 
+    void emit_is_not_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+        emit_test_boxed(Src);
+        if (dist == dShort) {
+            a.short_().je(Fail);
+        } else {
+            a.je(Fail);
+        }
+    }
+
     x86::Gp emit_ptr_val(x86::Gp Dst, x86::Gp Src) {
 #if !defined(TAG_LITERAL_PTR)
         return Src;
@@ -864,114 +896,233 @@ protected:
         mov_imm(to, 0);
     }
 
-public:
-    void embed_rodata(const char *labelName, const char *buff, size_t size);
-    void embed_bss(const char *labelName, size_t size);
-    void embed_zeros(size_t size);
-
-    void setLogger(std::string log);
-    void setLogger(FILE *log);
+    template<typename Dst, typename Src>
+    void vmovups(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vmovups(dst, src);
+        } else {
+            a.movups(dst, src);
+        }
+    }
 
-    void comment(const char *format) {
-        if (logger.file()) {
-            a.commentf("# %s", format);
+    template<typename Dst, typename Src>
+    void vmovsd(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vmovsd(dst, src);
+        } else {
+            a.movsd(dst, src);
         }
     }
 
-    template<typename... Ts>
-    void comment(const char *format, Ts... args) {
-        if (logger.file()) {
-            char buff[1024];
-            erts_snprintf(buff, sizeof(buff), format, args...);
-            a.commentf("# %s", buff);
+    template<typename Dst, typename Src>
+    void vucomisd(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vucomisd(dst, src);
+        } else {
+            a.ucomisd(dst, src);
         }
     }
 
-    struct AsmRange {
-        ErtsCodePtr start;
-        ErtsCodePtr stop;
-        const std::string name;
+    /* Copies `count` words from `from` to `to`.
+     *
+     * Clobbers `spill` and the first vector register (xmm0, ymm0 etc). */
+    void emit_copy_words(x86::Mem from,
+                         x86::Mem to,
+                         Sint32 count,
+                         x86::Gp spill) {
+        ASSERT(!from.hasIndex() && !to.hasIndex());
+        ASSERT(count >= 0 && count < (ERTS_SINT32_MAX / (Sint32)sizeof(UWord)));
+        ASSERT(from.offset() < ERTS_SINT32_MAX - count * (Sint32)sizeof(UWord));
+        ASSERT(to.offset() < ERTS_SINT32_MAX - count * (Sint32)sizeof(UWord));
+
+        /* We're going to mix sizes pretty wildly below, so it's easiest to
+         * turn off size validation. */
+        from.setSize(0);
+        to.setSize(0);
+
+        using vectors = std::initializer_list<std::tuple<x86::Vec,
+                                                         Sint32,
+                                                         x86::Inst::Id,
+                                                         CpuFeatures::X86::Id>>;
+        for (const auto &spec : vectors{{x86::zmm0,
+                                         8,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX512_VL},
+                                        {x86::zmm0,
+                                         8,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX512_F},
+                                        {x86::ymm0,
+                                         4,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX},
+                                        {x86::xmm0,
+                                         2,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX},
+                                        {x86::xmm0,
+                                         2,
+                                         x86::Inst::kIdMovups,
+                                         CpuFeatures::X86::kSSE}}) {
+            const auto &[vector_reg, vector_size, vector_inst, feature] = spec;
+
+            if (!hasCpuFeature(feature)) {
+                continue;
+            }
 
-        struct LineData {
-            ErtsCodePtr start;
-            const std::string file;
-            unsigned line;
-        };
+            /* Copy the words inline if we can, otherwise use a loop with the
+             * largest vector size we're capable of. */
+            if (count <= vector_size * 4) {
+                while (count >= vector_size) {
+                    a.emit(vector_inst, vector_reg, from);
+                    a.emit(vector_inst, to, vector_reg);
+
+                    from.addOffset(sizeof(UWord) * vector_size);
+                    to.addOffset(sizeof(UWord) * vector_size);
+                    count -= vector_size;
+                }
+            } else {
+                Sint32 loop_iterations, loop_size;
+                Label copy_next = a.newLabel();
 
-        const std::vector<LineData> lines;
-    };
-};
+                loop_iterations = count / vector_size;
+                loop_size = loop_iterations * vector_size * sizeof(UWord);
 
-#include "beam_asm_global.hpp"
+                from.addOffset(loop_size);
+                to.addOffset(loop_size);
+                from.setIndex(spill);
+                to.setIndex(spill);
 
-class BeamModuleAssembler : public BeamAssembler {
-    typedef unsigned BeamLabel;
+                mov_imm(spill, -loop_size);
+                a.bind(copy_next);
+                {
+                    a.emit(vector_inst, vector_reg, from);
+                    a.emit(vector_inst, to, vector_reg);
 
-    /* Map of label number to asmjit Label */
-    typedef std::unordered_map<BeamLabel, const Label> LabelMap;
-    LabelMap rawLabels;
+                    a.add(spill, imm(vector_size * sizeof(UWord)));
+                    a.short_().jne(copy_next);
+                }
 
-    struct patch {
-        Label where;
-        int64_t ptr_offs;
-        int64_t val_offs;
-    };
+                from.resetIndex();
+                to.resetIndex();
 
-    struct patch_catch {
-        struct patch patch;
-        Label handler;
-    };
-    std::vector<struct patch_catch> catches;
+                count %= vector_size;
+            }
+        }
 
-    /* Map of import entry to patch labels and mfa */
-    struct patch_import {
-        std::vector<struct patch> patches;
-        ErtsCodeMFA mfa;
-    };
-    typedef std::unordered_map<unsigned, struct patch_import> ImportMap;
-    ImportMap imports;
+        if (count == 1) {
+            a.mov(spill, from);
+            a.mov(to, spill);
 
-    /* Map of fun entry to trampoline labels and patches */
-    struct patch_lambda {
-        std::vector<struct patch> patches;
-        Label trampoline;
-    };
-    typedef std::unordered_map<unsigned, struct patch_lambda> LambdaMap;
-    LambdaMap lambdas;
+            count -= 1;
+        }
 
-    /* Map of literals to patch labels */
-    struct patch_literal {
-        std::vector<struct patch> patches;
-    };
-    typedef std::unordered_map<unsigned, struct patch_literal> LiteralMap;
-    LiteralMap literals;
+        ASSERT(count == 0);
+        (void)count;
+    }
+};
+
+#include "beam_asm_global.hpp"
+
+class BeamModuleAssembler : public BeamAssembler,
+                            public BeamModuleAssemblerCommon {
+    BeamGlobalAssembler *ga;
 
-    /* All string patches */
-    std::vector<struct patch> strings;
+    /* Save the last PC for an error. */
+    size_t last_error_offset = 0;
 
-    /* All functions that have been seen so far */
-    std::vector<BeamLabel> functions;
+    /* Skip unnecessary moves in mov_arg() and cmp_arg(). */
+    size_t last_movarg_offset = 0;
+    x86::Gp last_movarg_from1, last_movarg_from2;
+    x86::Mem last_movarg_to1, last_movarg_to2;
 
-    /* The BEAM file we've been loaded from, if any. */
-    const BeamFile *beam;
+    /* Private helper. */
+    void preserve__cache(x86::Gp dst) {
+        last_movarg_offset = a.offset();
+        invalidate_cache(dst);
+    }
 
-    BeamGlobalAssembler *ga;
+    bool is_cache_valid() {
+        return a.offset() == last_movarg_offset;
+    }
 
-    Label code_header;
+    void preserve_cache(x86::Gp dst, bool cache_valid) {
+        if (cache_valid) {
+            preserve__cache(dst);
+        }
+    }
 
-    /* Used by emit to populate the labelToMFA map */
-    Label current_label;
+    /* Store CPU register into memory and update the cache. */
+    void store_cache(x86::Gp src, x86::Mem dst) {
+        if (is_cache_valid() && dst != last_movarg_to1) {
+            /* Something is already cached in the first slot. Use the
+             * second slot. */
+            a.mov(dst, src);
 
-    /* The module's on_load function, if any. */
-    Label on_load;
+            last_movarg_offset = a.offset();
+            last_movarg_to2 = dst;
+            last_movarg_from2 = src;
+        } else {
+            /* Nothing cached yet, or the first slot has the same
+             * memory address as we will store into. Use the first
+             * slot and invalidate the second slot. */
+            a.mov(dst, src);
 
-    /* The end of the last function. */
-    Label code_end;
+            last_movarg_offset = a.offset();
+            last_movarg_to1 = dst;
+            last_movarg_from1 = src;
 
-    Eterm mod;
+            last_movarg_to2 = x86::Mem();
+        }
+    }
 
-    /* Save the last PC for an error. */
-    size_t last_error_offset = 0;
+    void invalidate_cache(x86::Gp dst) {
+        if (dst == last_movarg_from1) {
+            last_movarg_to1 = x86::Mem();
+            last_movarg_from1 = x86::Gp();
+        }
+        if (dst == last_movarg_from2) {
+            last_movarg_to2 = x86::Mem();
+            last_movarg_from2 = x86::Gp();
+        }
+    }
+
+    x86::Gp cached_reg(x86::Mem mem) {
+        if (is_cache_valid()) {
+            if (mem == last_movarg_to1) {
+                return last_movarg_from1;
+            }
+            if (mem == last_movarg_to2) {
+                return last_movarg_from2;
+            }
+        }
+        return x86::Gp();
+    }
+
+    void load_cached(x86::Gp dst, x86::Mem mem) {
+        if (a.offset() == last_movarg_offset) {
+            x86::Gp reg = cached_reg(mem);
+
+            if (reg.isValid()) {
+                /* This memory location is cached. */
+                if (reg != dst) {
+                    comment("simplified fetching of BEAM register");
+                    a.mov(dst, reg);
+                    preserve__cache(dst);
+                } else {
+                    comment("skipped fetching of BEAM register");
+                    invalidate_cache(dst);
+                }
+            } else {
+                /* Not cached. Load and preserve the cache. */
+                a.mov(dst, mem);
+                preserve__cache(dst);
+            }
+        } else {
+            /* The cache is invalid. */
+            a.mov(dst, mem);
+        }
+    }
 
     /* Maps code pointers to thunks that jump to them, letting us treat global
      * fragments as if they were local. */
@@ -1027,180 +1178,19 @@ public:
     const ErtsCodeInfo *getOnLoad(void);
 
     unsigned patchCatches(char *rw_base);
-    void patchLambda(char *rw_base, unsigned index, BeamInstr I);
+    void patchLambda(char *rw_base, unsigned index, const ErlFunEntry *fe);
     void patchLiteral(char *rw_base, unsigned index, Eterm lit);
-    void patchImport(char *rw_base, unsigned index, BeamInstr I);
+    void patchImport(char *rw_base, unsigned index, const Export *import);
     void patchStrings(char *rw_base, const byte *string);
 
 protected:
-    int getTypeUnion(const ArgSource &arg) const {
-        auto typeIndex =
-                arg.isRegister() ? arg.as<ArgRegister>().typeIndex() : 0;
-
-        ASSERT(typeIndex < beam->types.count);
-        return beam->types.entries[typeIndex].type_union;
-    }
-
-    auto getIntRange(const ArgSource &arg) const {
-        if (arg.isSmall()) {
-            Sint value = arg.as<ArgSmall>().getSigned();
-            return std::make_pair(value, value);
-        } else {
-            auto typeIndex =
-                    arg.isRegister() ? arg.as<ArgRegister>().typeIndex() : 0;
-
-            ASSERT(typeIndex < beam->types.count);
-            const auto &entry = beam->types.entries[typeIndex];
-            ASSERT(entry.type_union & BEAM_TYPE_INTEGER);
-            return std::make_pair(entry.min, entry.max);
-        }
-    }
-
-    bool always_small(const ArgSource &arg) const {
-        if (arg.isSmall()) {
-            return true;
-        }
-
-        int type_union = getTypeUnion(arg);
-        if (type_union == BEAM_TYPE_INTEGER) {
-            auto [min, max] = getIntRange(arg);
-            return min <= max;
-        } else {
-            return false;
-        }
-    }
-
-    bool always_immediate(const ArgSource &arg) const {
-        if (arg.isImmed() || always_small(arg)) {
-            return true;
-        }
-
-        int type_union = getTypeUnion(arg);
-        return (type_union & BEAM_TYPE_MASK_ALWAYS_IMMEDIATE) == type_union;
-    }
-
-    bool always_same_types(const ArgSource &lhs, const ArgSource &rhs) const {
-        int lhs_types = getTypeUnion(lhs);
-        int rhs_types = getTypeUnion(rhs);
-
-        /* We can only be certain that the types are the same when there's
-         * one possible type. For example, if one is a number and the other
-         * is an integer, they could differ if the former is a float. */
-        if ((lhs_types & (lhs_types - 1)) == 0) {
-            return lhs_types == rhs_types;
-        }
-
-        return false;
-    }
-
-    bool always_one_of(const ArgSource &arg, int types) const {
-        if (arg.isImmed()) {
-            if (arg.isSmall()) {
-                return !!(types & BEAM_TYPE_INTEGER);
-            } else if (arg.isAtom()) {
-                return !!(types & BEAM_TYPE_ATOM);
-            } else if (arg.isNil()) {
-                return !!(types & BEAM_TYPE_NIL);
-            }
-
-            return false;
-        } else {
-            int type_union = getTypeUnion(arg);
-            return type_union == (type_union & types);
-        }
-    }
-
-    int masked_types(const ArgSource &arg, int mask) const {
-        if (arg.isImmed()) {
-            if (arg.isSmall()) {
-                return mask & BEAM_TYPE_INTEGER;
-            } else if (arg.isAtom()) {
-                return mask & BEAM_TYPE_ATOM;
-            } else if (arg.isNil()) {
-                return mask & BEAM_TYPE_NIL;
-            }
-
-            return BEAM_TYPE_NONE;
-        } else {
-            return getTypeUnion(arg) & mask;
-        }
-    }
-
-    bool exact_type(const ArgSource &arg, int type_id) const {
-        return always_one_of(arg, type_id);
-    }
-
-    bool is_sum_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            Sint min, max;
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            min = min1 + min2;
-            max = max1 + max2;
-            return IS_SSMALL(min) && IS_SSMALL(max);
-        }
-    }
-
-    bool is_difference_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            Sint min, max;
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            min = min1 - max2;
-            max = max1 - min2;
-            return IS_SSMALL(min) && IS_SSMALL(max);
-        }
-    }
-
-    bool is_product_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            auto mag1 = std::max(std::abs(min1), std::abs(max1));
-            auto mag2 = std::max(std::abs(min2), std::abs(max2));
-
-            /*
-             * mag1 * mag2 <= MAX_SMALL
-             * mag1 <= MAX_SMALL / mag2   (when mag2 != 0)
-             */
-            ERTS_CT_ASSERT(MAX_SMALL < -MIN_SMALL);
-            return mag2 == 0 || mag1 <= MAX_SMALL / mag2;
-        }
-    }
-
-    bool is_bsl_small(const ArgSource &LHS, const ArgSource &RHS) {
-        /*
-         * In the code compiled by scripts/diffable, there never
-         * seems to be any range information for the RHS. Therefore,
-         * don't bother unless RHS is an immediate small.
-         */
-        if (!(always_small(LHS) && RHS.isSmall())) {
-            return false;
-        } else {
-            auto [min1, max1] = getIntRange(LHS);
-            auto rhs_val = RHS.as<ArgSmall>().getSigned();
-
-            if (min1 < 0 || max1 == 0 || rhs_val < 0) {
-                return false;
-            }
-
-            return rhs_val < Support::clz(max1) - _TAG_IMMED1_SIZE;
-        }
-    }
-
     /* Helpers */
     void emit_gc_test(const ArgWord &Stack,
                       const ArgWord &Heap,
                       const ArgWord &Live);
     void emit_gc_test_preserve(const ArgWord &Need,
                                const ArgWord &Live,
-                               x86::Gp term);
+                               const ArgSource &Preserve,
+                               x86::Gp preserve_reg);
 
     x86::Mem emit_variable_apply(bool includeI);
     x86::Mem emit_fixed_apply(const ArgWord &arity, bool includeI);
@@ -1209,11 +1199,6 @@
                        bool skip_fun_test = false,
                        bool skip_arity_test = false);
 
-    x86::Gp emit_is_binary(const ArgLabel &Fail,
-                           const ArgSource &Src,
-                           Label next,
-                           Label subbin);
-
     void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
         BeamAssembler::emit_is_boxed(Fail, Src, dist);
     }
@@ -1222,7 +1207,7 @@
                        const ArgVal &Arg,
                        x86::Gp Src,
                        Distance dist = dLong) {
-        if (always_one_of(Arg, BEAM_TYPE_MASK_ALWAYS_BOXED)) {
+        if (always_one_of<BeamTypeId::AlwaysBoxed>(Arg)) {
             comment("skipped box test since argument is always boxed");
             return;
         }
@@ -1246,10 +1231,12 @@
 
     void emit_error(int code);
 
-    x86::Mem emit_bs_get_integer_prologue(Label next,
-                                          Label fail,
-                                          int flags,
-                                          int size);
+    void emit_bs_get_integer(const ArgRegister &Ctx,
+                             const ArgLabel &Fail,
+                             const ArgWord &Live,
+                             const ArgWord Flags,
+                             int bits,
+                             const ArgRegister &Dst);
 
     int emit_bs_get_field_size(const ArgSource &Size,
                                int unit,
@@ -1261,6 +1248,40 @@
     void emit_bs_get_utf16(const ArgRegister &Ctx,
                            const ArgLabel &Fail,
                            const ArgWord &Flags);
+    void update_bin_state(x86::Gp bin_offset,
+                          x86::Gp current_byte,
+                          Sint bit_offset,
+                          Sint size,
+                          x86::Gp size_reg);
+    bool need_mask(const ArgVal Val, Sint size);
+    void set_zero(Sint effectiveSize);
+    bool bs_maybe_enter_runtime(bool entered);
+    void bs_maybe_leave_runtime(bool entered);
+    void emit_construct_utf8_shared();
+    void emit_construct_utf8(const ArgVal &Src,
+                             Sint bit_offset,
+                             bool is_byte_aligned);
+
+    void emit_read_bits(Uint bits,
+                        const x86::Gp bin_base,
+                        const x86::Gp bin_offset,
+                        const x86::Gp bitdata);
+    void emit_extract_integer(const x86::Gp bitdata,
+                              const x86::Gp tmp,
+                              Uint flags,
+                              Uint bits,
+                              const ArgRegister &Dst);
+    void emit_extract_binary(const x86::Gp bitdata,
+                             Uint bits,
+                             const ArgRegister &Dst);
+    void emit_read_integer(const x86::Gp bin_base,
+                           const x86::Gp bin_position,
+                           const x86::Gp tmp,
+                           Uint flags,
+                           Uint bits,
+                           const ArgRegister &Dst);
+
+    UWord bs_get_flags(const ArgVal &val);
 
     void emit_raise_exception();
     void emit_raise_exception(const ErtsCodeMFA *exp);
@@ -1274,7 +1295,8 @@
                           const ArgVal &Fail,
                           const Span<ArgVal> &args);
 
-    void emit_float_instr(uint32_t instId,
+    void emit_float_instr(uint32_t instIdSSE,
+                          uint32_t instIdAVX,
                           const ArgFRegister &LHS,
                           const ArgFRegister &RHS,
                           const ArgFRegister &Dst);
@@ -1294,6 +1316,16 @@
                             Eterm fail_value,
                             Eterm succ_value);
 
+    void emit_cond_to_bool(uint32_t instId, const ArgRegister &Dst);
+    void emit_bif_is_ge_lt(uint32_t instId,
+                           const ArgSource &LHS,
+                           const ArgSource &RHS,
+                           const ArgRegister &Dst);
+    void emit_bif_min_max(uint32_t instId,
+                          const ArgSource &LHS,
+                          const ArgSource &RHS,
+                          const ArgRegister &Dst);
+
     void emit_proc_lc_unrequire(void);
     void emit_proc_lc_require(void);
@@ -1338,7 +1370,7 @@
 
     void make_move_patch(x86::Gp to,
                          std::vector<struct patch> &patches,
-                         int64_t offset = 0) {
+                         size_t offset = 0) {
         const int MOV_IMM64_PAYLOAD_OFFSET = 2;
         Label lbl = a.newLabel();
 
@@ -1374,13 +1406,33 @@
     }
 
     void cmp_arg(x86::Mem mem, const ArgVal &val, const x86::Gp &spill) {
-        /* Note that the cast to Sint is necessary to handle negative numbers
-         * such as NIL. */
-        if (val.isImmed() && Support::isInt32((Sint)val.as<ArgImmed>().get())) {
-            a.cmp(mem, imm(val.as<ArgImmed>().get()));
+        x86::Gp reg = cached_reg(mem);
+
+        if (reg.isValid()) {
+            /* Note that the cast to Sint is necessary to handle
+             * negative numbers such as NIL. */
+            if (val.isImmed() &&
+                Support::isInt32((Sint)val.as<ArgImmed>().get())) {
+                comment("simplified compare of BEAM register");
+                a.cmp(reg, imm(val.as<ArgImmed>().get()));
+            } else if (reg != spill) {
+                comment("simplified compare of BEAM register");
+                mov_arg(spill, val);
+                a.cmp(reg, spill);
+            } else {
+                mov_arg(spill, val);
+                a.cmp(mem, spill);
+            }
         } else {
-            mov_arg(spill, val);
-            a.cmp(mem, spill);
+            /* Note that the cast to Sint is necessary to handle
+             * negative numbers such as NIL. */
+            if (val.isImmed() &&
+                Support::isInt32((Sint)val.as<ArgImmed>().get())) {
+                a.cmp(mem, imm(val.as<ArgImmed>().get()));
+            } else {
+                mov_arg(spill, val);
+                a.cmp(mem, spill);
+            }
         }
     }
@@ -1393,8 +1445,31 @@
         }
     }
 
+    void cmp(x86::Gp gp, int64_t val, const x86::Gp &spill) {
+        if (Support::isInt32(val)) {
+            a.cmp(gp, imm(val));
+        } else if (gp.isGpd()) {
+            mov_imm(spill, val);
+            a.cmp(gp, spill.r32());
+        } else {
+            mov_imm(spill, val);
+            a.cmp(gp, spill);
+        }
+    }
+
+    void sub(x86::Gp gp, int64_t val, const x86::Gp &spill) {
+        if (Support::isInt32(val)) {
+            a.sub(gp, imm(val));
+        } else {
+            mov_imm(spill, val);
+            a.sub(gp, spill);
+        }
+    }
+
     /* Note: May clear flags. */
     void mov_arg(x86::Gp to, const ArgVal &from, const x86::Gp &spill) {
+        bool valid_cache = is_cache_valid();
+
         if (from.isBytePtr()) {
             make_move_patch(to, strings, from.as<ArgBytePtr>().get());
         } else if (from.isExport()) {
@@ -1406,13 +1481,15 @@
         } else if (from.isLiteral()) {
             make_move_patch(to, literals[from.as<ArgLiteral>().get()].patches);
         } else if (from.isRegister()) {
-            a.mov(to, getArgRef(from.as<ArgRegister>()));
+            auto mem = getArgRef(from.as<ArgRegister>());
+            load_cached(to, mem);
         } else if (from.isWord()) {
             mov_imm(to, from.as<ArgWord>().get());
         } else {
             ASSERT(!"mov_arg with incompatible type");
         }
 
+        preserve_cache(to, valid_cache);
 #ifdef DEBUG
         /* Explicitly clear flags to catch bugs quicker, it may be very rare
          * for a certain instruction to load values that would otherwise cause
@@ -1431,6 +1508,15 @@
                 a.mov(spill, imm(val));
                 a.mov(to, spill);
             }
+        } else if (from.isWord()) {
+            auto val = from.as<ArgWord>().get();
+
+            if (Support::isInt32((Sint)val)) {
+                a.mov(to, imm(val));
+            } else {
+                a.mov(spill, imm(val));
+                a.mov(to, spill);
+            }
         } else {
             mov_arg(spill, from);
             a.mov(to, spill);
@@ -1440,7 +1526,8 @@
 
     void mov_arg(const ArgVal &to, x86::Gp from, const x86::Gp &spill) {
         (void)spill;
 
-        a.mov(getArgRef(to), from);
+        auto mem = getArgRef(to);
+        store_cache(from, mem);
     }
 
     void mov_arg(const ArgVal &to, x86::Mem from, const x86::Gp &spill) {
@@ -1467,10 +1554,9 @@
     }
 };
 
-void beamasm_metadata_update(
-        std::string module_name,
-        ErtsCodePtr base_address,
-        size_t code_size,
-        const std::vector<BeamAssembler::AsmRange> &ranges);
+void beamasm_metadata_update(std::string module_name,
+                             ErtsCodePtr base_address,
+                             size_t code_size,
+                             const std::vector<AsmRange> &ranges);
 void beamasm_metadata_early_init();
 void beamasm_metadata_late_init();
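
Note on the new Update flags: eHeapAlloc is simply a named alias for eHeap | eStack, and eHeapOnlyAlloc drops the stack sync in release builds. A hypothetical call site (not part of this diff) would combine them with the existing emit_enter_runtime/emit_leave_runtime templates roughly like so:

    /* Illustrative only: sync HTOP and E before calling a runtime
     * function that allocates heap memory, then reload them after. */
    emit_enter_runtime<Update::eHeapAlloc>();
    /* ... runtime call that HAllocs ... */
    emit_leave_runtime<Update::eHeapAlloc>();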
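For readers who want the paired htop/stop store outside of assembler form, here is a minimal standalone C++ sketch (not code from this commit; proc_like is a made-up stand-in for Process). Two adjacent 64-bit fields are written with one unaligned 128-bit store, which is what the emitted punpcklqdq + movups pair, or vpinsrq + vmovdqu under AVX, accomplishes:

    #include <emmintrin.h> /* SSE2 intrinsics */

    #include <cstdint>
    #include <cstdio>

    /* Stand-in for Process: all that matters is that the two fields are
     * adjacent, mirroring the ERTS_CT_ASSERT on their offsets above. */
    struct proc_like {
        uint64_t htop;
        uint64_t stop;
    };

    /* One unaligned 128-bit store updates both fields at once. */
    static void update_both(proc_like *p, uint64_t htop, uint64_t stop) {
        __m128i pair = _mm_set_epi64x((int64_t)stop, (int64_t)htop);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&p->htop), pair);
    }

    int main() {
        proc_like p{};
        update_both(&p, 0x1000, 0x2000);
        std::printf("htop=%llx stop=%llx\n",
                    (unsigned long long)p.htop,
                    (unsigned long long)p.stop);
        return 0;
    }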
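The new emit_copy_words picks the widest vector the CPU supports, copies inline up to a four-vector cutoff, and otherwise emits a loop whose index runs from -loop_size up to zero, so the add that advances the index also sets the flags tested by jne. A scalar C++ model of that control flow, under hypothetical names (copy_words, step), might look like:

    #include <cstdint>
    #include <cstring>

    /* Scalar model of emit_copy_words' loop strategy; illustrative, not
     * the JIT's code. Indexing from -loop_size up to zero lets the index
     * update double as the loop condition, as the emitted add/jne does. */
    static void copy_words(const uint64_t *from, uint64_t *to, int32_t count) {
        const int32_t step = 2; /* words per "vector": xmm0 holds two UWords */
        int32_t iters = count / step;
        int64_t loop_size = (int64_t)iters * step * (int64_t)sizeof(uint64_t);

        /* Bias both pointers past the copied region, then walk a negative
         * index up to zero, mirroring addOffset(loop_size) + setIndex(). */
        const char *from_end = (const char *)from + loop_size;
        char *to_end = (char *)to + loop_size;

        for (int64_t i = -loop_size; i != 0; i += step * sizeof(uint64_t)) {
            std::memcpy(to_end + i, from_end + i, step * sizeof(uint64_t));
        }

        count %= step;
        if (count == 1) { /* trailing odd word, like the a.mov() tail above */
            to[iters * step] = from[iters * step];
        }
    }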
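The mov_arg()/cmp_arg() cache added to BeamModuleAssembler is easiest to reason about as a two-slot map from BEAM-register memory slots to CPU registers that is only trusted while a.offset() still equals last_movarg_offset. A toy C++ model of that invariant (illustrative only; it omits the invalidate_cache() handling for overwritten registers) could be:

    #include <string>

    /* Toy model, not the JIT's code: std::string stands in for x86::Gp and
     * x86::Mem, and `emitted` plays the role of a.offset(). */
    struct StoreCache {
        unsigned emitted = 0;    /* models a.offset() */
        unsigned valid_at = ~0u; /* models last_movarg_offset */
        std::string to1, from1;  /* first slot: memory <- register */
        std::string to2, from2;  /* second slot */

        bool valid() const { return emitted == valid_at; }

        /* Any untracked instruction invalidates the cache simply by
         * moving the offset past valid_at. */
        void emit_other() { ++emitted; }

        /* Models store_cache(): record which register was stored where. */
        void store(const std::string &src, const std::string &dst) {
            bool use_second = valid() && dst != to1;
            ++emitted; /* the `mov dst, src` itself */
            if (use_second) {
                to2 = dst; from2 = src;
            } else {
                to1 = dst; from1 = src;
                to2.clear(); from2.clear();
            }
            valid_at = emitted;
        }

        /* Models cached_reg(): the register mirroring `mem`, if any. */
        std::string lookup(const std::string &mem) const {
            if (valid()) {
                if (mem == to1) return from1;
                if (mem == to2) return from2;
            }
            return {};
        }
    };

In this model, store("rax", "X0") followed by lookup("X0") yields "rax", while an intervening emit_other() makes the lookup miss; that offset comparison is what keeps the real cache sound across arbitrary emitted code.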