Diffstat (limited to 'erts/emulator/beam/jit/x86/beam_asm.hpp')
-rw-r--r-- | erts/emulator/beam/jit/x86/beam_asm.hpp | 720
1 file changed, 403 insertions, 317 deletions
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 66891730ef..c7f085ee62 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -52,18 +52,20 @@ extern "C"
 using namespace asmjit;
 
-class BeamAssembler : public ErrorHandler {
-protected:
-    /* Holds code and relocation information. */
-    CodeHolder code;
-
-    /* TODO: Want to change this to x86::Builder in order to be able to patch
-     * the correct I into the code after code generation */
-    x86::Assembler a;
+struct BeamAssembler : public BeamAssemblerCommon {
+    BeamAssembler() : BeamAssemblerCommon(a) {
+        Error err = code.attach(&a);
+        ERTS_ASSERT(!err && "Failed to attach codeHolder");
+    }
 
-    FileLogger logger;
+    BeamAssembler(const std::string &log) : BeamAssembler() {
+        if (erts_jit_asm_dump) {
+            setLogger(log + ".asm");
+        }
+    }
 
-    Section *rodata = nullptr;
+protected:
+    x86::Assembler a;
 
     /* * * * * * * * * */
@@ -170,27 +172,6 @@ protected:
 
     enum Distance { dShort, dLong };
 
-public:
-    static bool hasCpuFeature(uint32_t featureId);
-
-    BeamAssembler();
-    BeamAssembler(const std::string &log);
-
-    ~BeamAssembler();
-
-    void *getBaseAddress();
-    size_t getOffset();
-
-protected:
-    void _codegen(JitAllocator *allocator,
-                  const void **executable_ptr,
-                  void **writable_ptr);
-
-    void *getCode(Label label);
-    byte *getCode(char *labelName);
-
-    void handleError(Error err, const char *message, BaseEmitter *origin);
-
     constexpr x86::Mem getRuntimeStackRef() const {
         int base = offsetof(ErtsSchedulerRegisters, aux_regs.d.runtime_stack);
@@ -577,11 +558,27 @@ protected:
 #endif
     }
 
+    /* Prefer `eHeapAlloc` over `eStack | eHeap` when calling
+     * functions in the runtime system that allocate heap
+     * memory (`HAlloc`, heap factories, etc).
+     *
+     * Prefer `eHeapOnlyAlloc` over `eHeapAlloc` for functions
+     * that assume there's already a certain amount of free
+     * space on the heap, such as those using `HeapOnlyAlloc`
+     * or similar. It's slightly cheaper in release builds,
+     * and in debug builds it updates `eStack` to ensure that
+     * we can make heap size assertions. */
     enum Update : int {
         eStack = (1 << 0),
         eHeap = (1 << 1),
         eReductions = (1 << 2),
-        eCodeIndex = (1 << 3)
+        eCodeIndex = (1 << 3),
+        eHeapAlloc = Update::eHeap | Update::eStack,
+#ifndef DEBUG
+        eHeapOnlyAlloc = Update::eHeap,
+#else
+        eHeapOnlyAlloc = Update::eHeapAlloc
+#endif
     };
 
     void emit_enter_frame() {
@@ -624,9 +621,9 @@ protected:
         if (ERTS_LIKELY(erts_frame_layout == ERTS_FRAME_LAYOUT_RA)) {
             if ((Spec & (Update::eHeap | Update::eStack)) ==
                 (Update::eHeap | Update::eStack)) {
-                /* To update both heap and stack we use sse instructions like
-                 * gcc -O3 does. Basically it is this function run through
-                 * gcc -O3:
+                /* To update both heap and stack we use SSE/AVX
+                 * instructions like gcc -O3 does. Basically it is
+                 * this function run through gcc -O3:
                  *
                  * struct a { long a; long b; long c; };
                  * void test(long a, long b, long c, struct a *s) {
@@ -636,11 +633,18 @@ protected:
                  * }
                  */
                 ERTS_CT_ASSERT((offsetof(Process, stop) -
                                 offsetof(Process, htop)) == sizeof(Eterm *));
-                a.movq(x86::xmm0, HTOP);
-                a.movq(x86::xmm1, E);
-                a.punpcklqdq(x86::xmm0, x86::xmm1);
-                a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
-                         x86::xmm0);
+                if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+                    a.vmovq(x86::xmm1, HTOP);
+                    a.vpinsrq(x86::xmm0, x86::xmm1, E, 1);
+                    a.vmovdqu(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
+                              x86::xmm0);
+                } else {
+                    a.movq(x86::xmm0, HTOP);
+                    a.movq(x86::xmm1, E);
+                    a.punpcklqdq(x86::xmm0, x86::xmm1);
+                    a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)),
+                             x86::xmm0);
+                }
             } else if (Spec & Update::eHeap) {
                 a.mov(x86::qword_ptr(c_p, offsetof(Process, htop)), HTOP);
             } else if (Spec & Update::eStack) {
@@ -659,11 +663,18 @@ protected:
             if (Spec & Update::eStack) {
                 ERTS_CT_ASSERT((offsetof(Process, frame_pointer) -
                                 offsetof(Process, stop)) == sizeof(Eterm *));
-                a.movq(x86::xmm0, E);
-                a.movq(x86::xmm1, frame_pointer);
-                a.punpcklqdq(x86::xmm0, x86::xmm1);
-                a.movups(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
-                         x86::xmm0);
+                if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+                    a.vmovq(x86::xmm1, E);
+                    a.vpinsrq(x86::xmm0, x86::xmm1, frame_pointer, 1);
+                    a.vmovdqu(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
+                              x86::xmm0);
+                } else {
+                    a.movq(x86::xmm0, E);
+                    a.movq(x86::xmm1, frame_pointer);
+                    a.punpcklqdq(x86::xmm0, x86::xmm1);
+                    a.movups(x86::xmmword_ptr(c_p, offsetof(Process, stop)),
+                             x86::xmm0);
+                }
             } else {
                 /* We can skip updating the frame pointer whenever the process
                  * doesn't have to inspect the stack. We still need to update
@@ -693,6 +704,14 @@ protected:
         a.sub(x86::rsp, imm(15));
         a.and_(x86::rsp, imm(-16));
 #endif
+        /* If the emulator has not been compiled with AVX support (which stops
+         * it from using legacy SSE instructions), we'll need to clear the upper
+         * bits of all AVX registers to avoid AVX/SSE transition penalties. */
+#if !defined(__AVX__)
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vzeroupper();
+        }
+#endif
     }
 
     template<int Spec = 0>
@@ -747,7 +766,7 @@ protected:
 #endif
     }
 
-    void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+    void emit_test_boxed(x86::Gp Src) {
         /* Use the shortest possible instruction depending on the source
         * register. */
        if (Src == x86::rax || Src == x86::rdi || Src == x86::rsi ||
@@ -756,6 +775,10 @@ protected:
         } else {
             a.test(Src.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
         }
+    }
+
+    void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+        emit_test_boxed(Src);
 
         if (dist == dShort) {
             a.short_().jne(Fail);
         } else {
@@ -763,6 +786,15 @@ protected:
         }
     }
 
+    void emit_is_not_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
+        emit_test_boxed(Src);
+        if (dist == dShort) {
+            a.short_().je(Fail);
+        } else {
+            a.je(Fail);
+        }
+    }
+
     x86::Gp emit_ptr_val(x86::Gp Dst, x86::Gp Src) {
 #if !defined(TAG_LITERAL_PTR)
         return Src;
@@ -864,114 +896,233 @@ protected:
         mov_imm(to, 0);
     }
 
-public:
-    void embed_rodata(const char *labelName, const char *buff, size_t size);
-    void embed_bss(const char *labelName, size_t size);
-    void embed_zeros(size_t size);
-
-    void setLogger(std::string log);
-    void setLogger(FILE *log);
+    template<typename Dst, typename Src>
+    void vmovups(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vmovups(dst, src);
+        } else {
+            a.movups(dst, src);
+        }
+    }
 
-    void comment(const char *format) {
-        if (logger.file()) {
-            a.commentf("# %s", format);
+    template<typename Dst, typename Src>
+    void vmovsd(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vmovsd(dst, src);
+        } else {
+            a.movsd(dst, src);
         }
     }
 
-    template<typename... Ts>
-    void comment(const char *format, Ts... args) {
-        if (logger.file()) {
-            char buff[1024];
-            erts_snprintf(buff, sizeof(buff), format, args...);
-            a.commentf("# %s", buff);
+    template<typename Dst, typename Src>
+    void vucomisd(Dst dst, Src src) {
+        if (hasCpuFeature(CpuFeatures::X86::kAVX)) {
+            a.vucomisd(dst, src);
+        } else {
+            a.ucomisd(dst, src);
         }
     }
 
-    struct AsmRange {
-        ErtsCodePtr start;
-        ErtsCodePtr stop;
-        const std::string name;
+    /* Copies `count` words from `from` to `to`.
+     *
+     * Clobbers `spill` and the first vector register (xmm0, ymm0 etc). */
+    void emit_copy_words(x86::Mem from,
+                         x86::Mem to,
+                         Sint32 count,
+                         x86::Gp spill) {
+        ASSERT(!from.hasIndex() && !to.hasIndex());
+        ASSERT(count >= 0 && count < (ERTS_SINT32_MAX / (Sint32)sizeof(UWord)));
+        ASSERT(from.offset() < ERTS_SINT32_MAX - count * (Sint32)sizeof(UWord));
+        ASSERT(to.offset() < ERTS_SINT32_MAX - count * (Sint32)sizeof(UWord));
+
+        /* We're going to mix sizes pretty wildly below, so it's easiest to
+         * turn off size validation. */
+        from.setSize(0);
+        to.setSize(0);
+
+        using vectors = std::initializer_list<std::tuple<x86::Vec,
+                                                         Sint32,
+                                                         x86::Inst::Id,
+                                                         CpuFeatures::X86::Id>>;
+        for (const auto &spec : vectors{{x86::zmm0,
+                                         8,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX512_VL},
+                                        {x86::zmm0,
+                                         8,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX512_F},
+                                        {x86::ymm0,
+                                         4,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX},
+                                        {x86::xmm0,
+                                         2,
+                                         x86::Inst::kIdVmovups,
+                                         CpuFeatures::X86::kAVX},
+                                        {x86::xmm0,
+                                         2,
+                                         x86::Inst::kIdMovups,
+                                         CpuFeatures::X86::kSSE}}) {
+            const auto &[vector_reg, vector_size, vector_inst, feature] = spec;
+
+            if (!hasCpuFeature(feature)) {
+                continue;
+            }
 
-        struct LineData {
-            ErtsCodePtr start;
-            const std::string file;
-            unsigned line;
-        };
+            /* Copy the words inline if we can, otherwise use a loop with the
+             * largest vector size we're capable of. */
+            if (count <= vector_size * 4) {
+                while (count >= vector_size) {
+                    a.emit(vector_inst, vector_reg, from);
+                    a.emit(vector_inst, to, vector_reg);
+
+                    from.addOffset(sizeof(UWord) * vector_size);
+                    to.addOffset(sizeof(UWord) * vector_size);
+                    count -= vector_size;
+                }
+            } else {
+                Sint32 loop_iterations, loop_size;
+                Label copy_next = a.newLabel();
 
-        const std::vector<LineData> lines;
-    };
-};
+                loop_iterations = count / vector_size;
+                loop_size = loop_iterations * vector_size * sizeof(UWord);
 
-#include "beam_asm_global.hpp"
+                from.addOffset(loop_size);
+                to.addOffset(loop_size);
+                from.setIndex(spill);
+                to.setIndex(spill);
 
-class BeamModuleAssembler : public BeamAssembler {
-    typedef unsigned BeamLabel;
+                mov_imm(spill, -loop_size);
+                a.bind(copy_next);
+                {
+                    a.emit(vector_inst, vector_reg, from);
+                    a.emit(vector_inst, to, vector_reg);
 
-    /* Map of label number to asmjit Label */
-    typedef std::unordered_map<BeamLabel, const Label> LabelMap;
-    LabelMap rawLabels;
+                    a.add(spill, imm(vector_size * sizeof(UWord)));
+                    a.short_().jne(copy_next);
+                }
 
-    struct patch {
-        Label where;
-        int64_t ptr_offs;
-        int64_t val_offs;
-    };
+                from.resetIndex();
+                to.resetIndex();
 
-    struct patch_catch {
-        struct patch patch;
-        Label handler;
-    };
-    std::vector<struct patch_catch> catches;
+                count %= vector_size;
+            }
+        }
 
-    /* Map of import entry to patch labels and mfa */
-    struct patch_import {
-        std::vector<struct patch> patches;
-        ErtsCodeMFA mfa;
-    };
-    typedef std::unordered_map<unsigned, struct patch_import> ImportMap;
-    ImportMap imports;
+        if (count == 1) {
+            a.mov(spill, from);
+            a.mov(to, spill);
 
-    /* Map of fun entry to trampoline labels and patches */
-    struct patch_lambda {
-        std::vector<struct patch> patches;
-        Label trampoline;
-    };
-    typedef std::unordered_map<unsigned, struct patch_lambda> LambdaMap;
-    LambdaMap lambdas;
+            count -= 1;
+        }
 
-    /* Map of literals to patch labels */
-    struct patch_literal {
-        std::vector<struct patch> patches;
-    };
-    typedef std::unordered_map<unsigned, struct patch_literal> LiteralMap;
-    LiteralMap literals;
+        ASSERT(count == 0);
+        (void)count;
+    }
+};
+
+#include "beam_asm_global.hpp"
+
+class BeamModuleAssembler : public BeamAssembler,
+                            public BeamModuleAssemblerCommon {
+    BeamGlobalAssembler *ga;
 
-    /* All string patches */
-    std::vector<struct patch> strings;
+    /* Save the last PC for an error. */
+    size_t last_error_offset = 0;
 
-    /* All functions that have been seen so far */
-    std::vector<BeamLabel> functions;
+    /* Skip unnecessary moves in mov_arg() and cmp_arg(). */
+    size_t last_movarg_offset = 0;
+    x86::Gp last_movarg_from1, last_movarg_from2;
+    x86::Mem last_movarg_to1, last_movarg_to2;
 
-    /* The BEAM file we've been loaded from, if any. */
-    const BeamFile *beam;
+    /* Private helper. */
+    void preserve__cache(x86::Gp dst) {
+        last_movarg_offset = a.offset();
+        invalidate_cache(dst);
+    }
 
-    BeamGlobalAssembler *ga;
+    bool is_cache_valid() {
+        return a.offset() == last_movarg_offset;
+    }
 
-    Label code_header;
+    void preserve_cache(x86::Gp dst, bool cache_valid) {
+        if (cache_valid) {
+            preserve__cache(dst);
+        }
+    }
 
-    /* Used by emit to populate the labelToMFA map */
-    Label current_label;
+    /* Store CPU register into memory and update the cache. */
+    void store_cache(x86::Gp src, x86::Mem dst) {
+        if (is_cache_valid() && dst != last_movarg_to1) {
+            /* Something is already cached in the first slot. Use the
+             * second slot. */
+            a.mov(dst, src);
 
-    /* The module's on_load function, if any. */
-    Label on_load;
+            last_movarg_offset = a.offset();
+            last_movarg_to2 = dst;
+            last_movarg_from2 = src;
+        } else {
+            /* Nothing cached yet, or the first slot has the same
+             * memory address as we will store into. Use the first
+             * slot and invalidate the second slot. */
+            a.mov(dst, src);
 
-    /* The end of the last function. */
-    Label code_end;
+            last_movarg_offset = a.offset();
+            last_movarg_to1 = dst;
+            last_movarg_from1 = src;
 
-    Eterm mod;
+            last_movarg_to2 = x86::Mem();
+        }
+    }
 
-    /* Save the last PC for an error. */
-    size_t last_error_offset = 0;
+    void invalidate_cache(x86::Gp dst) {
+        if (dst == last_movarg_from1) {
+            last_movarg_to1 = x86::Mem();
+            last_movarg_from1 = x86::Gp();
+        }
+        if (dst == last_movarg_from2) {
+            last_movarg_to2 = x86::Mem();
+            last_movarg_from2 = x86::Gp();
+        }
+    }
+
+    x86::Gp cached_reg(x86::Mem mem) {
+        if (is_cache_valid()) {
+            if (mem == last_movarg_to1) {
+                return last_movarg_from1;
+            }
+            if (mem == last_movarg_to2) {
+                return last_movarg_from2;
+            }
+        }
+        return x86::Gp();
+    }
+
+    void load_cached(x86::Gp dst, x86::Mem mem) {
+        if (a.offset() == last_movarg_offset) {
+            x86::Gp reg = cached_reg(mem);
+
+            if (reg.isValid()) {
+                /* This memory location is cached. */
+                if (reg != dst) {
+                    comment("simplified fetching of BEAM register");
+                    a.mov(dst, reg);
+                    preserve__cache(dst);
+                } else {
+                    comment("skipped fetching of BEAM register");
+                    invalidate_cache(dst);
+                }
+            } else {
+                /* Not cached. Load and preserve the cache. */
+                a.mov(dst, mem);
+                preserve__cache(dst);
+            }
+        } else {
+            /* The cache is invalid. */
+            a.mov(dst, mem);
+        }
+    }
 
     /* Maps code pointers to thunks that jump to them, letting us treat global
      * fragments as if they were local. */
@@ -1027,180 +1178,19 @@ public:
     const ErtsCodeInfo *getOnLoad(void);
 
     unsigned patchCatches(char *rw_base);
-    void patchLambda(char *rw_base, unsigned index, BeamInstr I);
+    void patchLambda(char *rw_base, unsigned index, const ErlFunEntry *fe);
     void patchLiteral(char *rw_base, unsigned index, Eterm lit);
-    void patchImport(char *rw_base, unsigned index, BeamInstr I);
+    void patchImport(char *rw_base, unsigned index, const Export *import);
     void patchStrings(char *rw_base, const byte *string);
 
 protected:
-    int getTypeUnion(const ArgSource &arg) const {
-        auto typeIndex =
-                arg.isRegister() ? arg.as<ArgRegister>().typeIndex() : 0;
-
-        ASSERT(typeIndex < beam->types.count);
-        return beam->types.entries[typeIndex].type_union;
-    }
-
-    auto getIntRange(const ArgSource &arg) const {
-        if (arg.isSmall()) {
-            Sint value = arg.as<ArgSmall>().getSigned();
-            return std::make_pair(value, value);
-        } else {
-            auto typeIndex =
-                    arg.isRegister() ? arg.as<ArgRegister>().typeIndex() : 0;
-
-            ASSERT(typeIndex < beam->types.count);
-            const auto &entry = beam->types.entries[typeIndex];
-            ASSERT(entry.type_union & BEAM_TYPE_INTEGER);
-            return std::make_pair(entry.min, entry.max);
-        }
-    }
-
-    bool always_small(const ArgSource &arg) const {
-        if (arg.isSmall()) {
-            return true;
-        }
-
-        int type_union = getTypeUnion(arg);
-        if (type_union == BEAM_TYPE_INTEGER) {
-            auto [min, max] = getIntRange(arg);
-            return min <= max;
-        } else {
-            return false;
-        }
-    }
-
-    bool always_immediate(const ArgSource &arg) const {
-        if (arg.isImmed() || always_small(arg)) {
-            return true;
-        }
-
-        int type_union = getTypeUnion(arg);
-        return (type_union & BEAM_TYPE_MASK_ALWAYS_IMMEDIATE) == type_union;
-    }
-
-    bool always_same_types(const ArgSource &lhs, const ArgSource &rhs) const {
-        int lhs_types = getTypeUnion(lhs);
-        int rhs_types = getTypeUnion(rhs);
-
-        /* We can only be certain that the types are the same when there's
-         * one possible type. For example, if one is a number and the other
-         * is an integer, they could differ if the former is a float. */
-        if ((lhs_types & (lhs_types - 1)) == 0) {
-            return lhs_types == rhs_types;
-        }
-
-        return false;
-    }
-
-    bool always_one_of(const ArgSource &arg, int types) const {
-        if (arg.isImmed()) {
-            if (arg.isSmall()) {
-                return !!(types & BEAM_TYPE_INTEGER);
-            } else if (arg.isAtom()) {
-                return !!(types & BEAM_TYPE_ATOM);
-            } else if (arg.isNil()) {
-                return !!(types & BEAM_TYPE_NIL);
-            }
-
-            return false;
-        } else {
-            int type_union = getTypeUnion(arg);
-            return type_union == (type_union & types);
-        }
-    }
-
-    int masked_types(const ArgSource &arg, int mask) const {
-        if (arg.isImmed()) {
-            if (arg.isSmall()) {
-                return mask & BEAM_TYPE_INTEGER;
-            } else if (arg.isAtom()) {
-                return mask & BEAM_TYPE_ATOM;
-            } else if (arg.isNil()) {
-                return mask & BEAM_TYPE_NIL;
-            }
-
-            return BEAM_TYPE_NONE;
-        } else {
-            return getTypeUnion(arg) & mask;
-        }
-    }
-
-    bool exact_type(const ArgSource &arg, int type_id) const {
-        return always_one_of(arg, type_id);
-    }
-
-    bool is_sum_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            Sint min, max;
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            min = min1 + min2;
-            max = max1 + max2;
-            return IS_SSMALL(min) && IS_SSMALL(max);
-        }
-    }
-
-    bool is_difference_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            Sint min, max;
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            min = min1 - max2;
-            max = max1 - min2;
-            return IS_SSMALL(min) && IS_SSMALL(max);
-        }
-    }
-
-    bool is_product_small(const ArgSource &LHS, const ArgSource &RHS) {
-        if (!(always_small(LHS) && always_small(RHS))) {
-            return false;
-        } else {
-            auto [min1, max1] = getIntRange(LHS);
-            auto [min2, max2] = getIntRange(RHS);
-            auto mag1 = std::max(std::abs(min1), std::abs(max1));
-            auto mag2 = std::max(std::abs(min2), std::abs(max2));
-
-            /*
-             * mag1 * mag2 <= MAX_SMALL
-             * mag1 <= MAX_SMALL / mag2   (when mag2 != 0)
-             */
-            ERTS_CT_ASSERT(MAX_SMALL < -MIN_SMALL);
-            return mag2 == 0 || mag1 <= MAX_SMALL / mag2;
-        }
-    }
-
-    bool is_bsl_small(const ArgSource &LHS, const ArgSource &RHS) {
-        /*
-         * In the code compiled by scripts/diffable, there never
-         * seems to be any range information for the RHS. Therefore,
-         * don't bother unless RHS is an immediate small.
-         */
-        if (!(always_small(LHS) && RHS.isSmall())) {
-            return false;
-        } else {
-            auto [min1, max1] = getIntRange(LHS);
-            auto rhs_val = RHS.as<ArgSmall>().getSigned();
-
-            if (min1 < 0 || max1 == 0 || rhs_val < 0) {
-                return false;
-            }
-
-            return rhs_val < Support::clz(max1) - _TAG_IMMED1_SIZE;
-        }
-    }
-
     /* Helpers */
     void emit_gc_test(const ArgWord &Stack,
                       const ArgWord &Heap,
                       const ArgWord &Live);
     void emit_gc_test_preserve(const ArgWord &Need,
                                const ArgWord &Live,
-                               x86::Gp term);
+                               const ArgSource &Preserve,
+                               x86::Gp preserve_reg);
 
     x86::Mem emit_variable_apply(bool includeI);
     x86::Mem emit_fixed_apply(const ArgWord &arity, bool includeI);
@@ -1209,11 +1199,6 @@
                        bool skip_fun_test = false,
                        bool skip_arity_test = false);
 
-    x86::Gp emit_is_binary(const ArgLabel &Fail,
-                           const ArgSource &Src,
-                           Label next,
-                           Label subbin);
-
     void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
         BeamAssembler::emit_is_boxed(Fail, Src, dist);
     }
@@ -1222,7 +1207,7 @@
                        const ArgVal &Arg,
                        x86::Gp Src,
                        Distance dist = dLong) {
-        if (always_one_of(Arg, BEAM_TYPE_MASK_ALWAYS_BOXED)) {
+        if (always_one_of<BeamTypeId::AlwaysBoxed>(Arg)) {
             comment("skipped box test since argument is always boxed");
             return;
         }
@@ -1246,10 +1231,12 @@
 
     void emit_error(int code);
 
-    x86::Mem emit_bs_get_integer_prologue(Label next,
-                                          Label fail,
-                                          int flags,
-                                          int size);
+    void emit_bs_get_integer(const ArgRegister &Ctx,
+                             const ArgLabel &Fail,
+                             const ArgWord &Live,
+                             const ArgWord Flags,
+                             int bits,
+                             const ArgRegister &Dst);
 
     int emit_bs_get_field_size(const ArgSource &Size,
                                int unit,
@@ -1261,6 +1248,40 @@
     void emit_bs_get_utf16(const ArgRegister &Ctx,
                            const ArgLabel &Fail,
                            const ArgWord &Flags);
+    void update_bin_state(x86::Gp bin_offset,
+                          x86::Gp current_byte,
+                          Sint bit_offset,
+                          Sint size,
+                          x86::Gp size_reg);
+    bool need_mask(const ArgVal Val, Sint size);
+    void set_zero(Sint effectiveSize);
+    bool bs_maybe_enter_runtime(bool entered);
+    void bs_maybe_leave_runtime(bool entered);
+    void emit_construct_utf8_shared();
+    void emit_construct_utf8(const ArgVal &Src,
+                             Sint bit_offset,
+                             bool is_byte_aligned);
+
+    void emit_read_bits(Uint bits,
+                        const x86::Gp bin_base,
+                        const x86::Gp bin_offset,
+                        const x86::Gp bitdata);
+    void emit_extract_integer(const x86::Gp bitdata,
+                              const x86::Gp tmp,
+                              Uint flags,
+                              Uint bits,
+                              const ArgRegister &Dst);
+    void emit_extract_binary(const x86::Gp bitdata,
+                             Uint bits,
+                             const ArgRegister &Dst);
+    void emit_read_integer(const x86::Gp bin_base,
+                           const x86::Gp bin_position,
+                           const x86::Gp tmp,
+                           Uint flags,
+                           Uint bits,
+                           const ArgRegister &Dst);
+
+    UWord bs_get_flags(const ArgVal &val);
 
     void emit_raise_exception();
     void emit_raise_exception(const ErtsCodeMFA *exp);
@@ -1274,7 +1295,8 @@
                           const ArgVal &Fail,
                           const Span<ArgVal> &args);
 
-    void emit_float_instr(uint32_t instId,
+    void emit_float_instr(uint32_t instIdSSE,
+                          uint32_t instIdAVX,
                           const ArgFRegister &LHS,
                           const ArgFRegister &RHS,
                           const ArgFRegister &Dst);
@@ -1294,6 +1316,16 @@
                             Eterm fail_value,
                             Eterm succ_value);
 
+    void emit_cond_to_bool(uint32_t instId, const ArgRegister &Dst);
+    void emit_bif_is_ge_lt(uint32_t instId,
+                           const ArgSource &LHS,
+                           const ArgSource &RHS,
+                           const ArgRegister &Dst);
+    void emit_bif_min_max(uint32_t instId,
+                          const ArgSource &LHS,
+                          const ArgSource &RHS,
+                          const ArgRegister &Dst);
+
     void emit_proc_lc_unrequire(void);
     void emit_proc_lc_require(void);
@@ -1338,7 +1370,7 @@
 
     void make_move_patch(x86::Gp to,
                          std::vector<struct patch> &patches,
-                         int64_t offset = 0) {
+                         size_t offset = 0) {
         const int MOV_IMM64_PAYLOAD_OFFSET = 2;
         Label lbl = a.newLabel();
 
@@ -1374,13 +1406,33 @@
     }
 
     void cmp_arg(x86::Mem mem, const ArgVal &val, const x86::Gp &spill) {
-        /* Note that the cast to Sint is necessary to handle negative numbers
-         * such as NIL. */
-        if (val.isImmed() && Support::isInt32((Sint)val.as<ArgImmed>().get())) {
-            a.cmp(mem, imm(val.as<ArgImmed>().get()));
+        x86::Gp reg = cached_reg(mem);
+
+        if (reg.isValid()) {
+            /* Note that the cast to Sint is necessary to handle
+             * negative numbers such as NIL. */
+            if (val.isImmed() &&
+                Support::isInt32((Sint)val.as<ArgImmed>().get())) {
+                comment("simplified compare of BEAM register");
+                a.cmp(reg, imm(val.as<ArgImmed>().get()));
+            } else if (reg != spill) {
+                comment("simplified compare of BEAM register");
+                mov_arg(spill, val);
+                a.cmp(reg, spill);
+            } else {
+                mov_arg(spill, val);
+                a.cmp(mem, spill);
+            }
         } else {
-            mov_arg(spill, val);
-            a.cmp(mem, spill);
+            /* Note that the cast to Sint is necessary to handle
+             * negative numbers such as NIL. */
+            if (val.isImmed() &&
+                Support::isInt32((Sint)val.as<ArgImmed>().get())) {
+                a.cmp(mem, imm(val.as<ArgImmed>().get()));
+            } else {
+                mov_arg(spill, val);
+                a.cmp(mem, spill);
+            }
         }
     }
@@ -1393,8 +1445,31 @@
         }
     }
 
+    void cmp(x86::Gp gp, int64_t val, const x86::Gp &spill) {
+        if (Support::isInt32(val)) {
+            a.cmp(gp, imm(val));
+        } else if (gp.isGpd()) {
+            mov_imm(spill, val);
+            a.cmp(gp, spill.r32());
+        } else {
+            mov_imm(spill, val);
+            a.cmp(gp, spill);
+        }
+    }
+
+    void sub(x86::Gp gp, int64_t val, const x86::Gp &spill) {
+        if (Support::isInt32(val)) {
+            a.sub(gp, imm(val));
+        } else {
+            mov_imm(spill, val);
+            a.sub(gp, spill);
+        }
+    }
+
     /* Note: May clear flags. */
     void mov_arg(x86::Gp to, const ArgVal &from, const x86::Gp &spill) {
+        bool valid_cache = is_cache_valid();
+
         if (from.isBytePtr()) {
             make_move_patch(to, strings, from.as<ArgBytePtr>().get());
         } else if (from.isExport()) {
@@ -1406,13 +1481,15 @@
         } else if (from.isLiteral()) {
             make_move_patch(to, literals[from.as<ArgLiteral>().get()].patches);
         } else if (from.isRegister()) {
-            a.mov(to, getArgRef(from.as<ArgRegister>()));
+            auto mem = getArgRef(from.as<ArgRegister>());
+            load_cached(to, mem);
         } else if (from.isWord()) {
             mov_imm(to, from.as<ArgWord>().get());
         } else {
             ASSERT(!"mov_arg with incompatible type");
         }
 
+        preserve_cache(to, valid_cache);
 #ifdef DEBUG
         /* Explicitly clear flags to catch bugs quicker, it may be very rare
          * for a certain instruction to load values that would otherwise cause
@@ -1431,6 +1508,15 @@
                 a.mov(spill, imm(val));
                 a.mov(to, spill);
             }
+        } else if (from.isWord()) {
+            auto val = from.as<ArgWord>().get();
+
+            if (Support::isInt32((Sint)val)) {
+                a.mov(to, imm(val));
+            } else {
+                a.mov(spill, imm(val));
+                a.mov(to, spill);
+            }
         } else {
             mov_arg(spill, from);
             a.mov(to, spill);
@@ -1440,7 +1526,8 @@
 
     void mov_arg(const ArgVal &to, x86::Gp from, const x86::Gp &spill) {
         (void)spill;
 
-        a.mov(getArgRef(to), from);
+        auto mem = getArgRef(to);
+        store_cache(from, mem);
     }
 
     void mov_arg(const ArgVal &to, x86::Mem from, const x86::Gp &spill) {
@@ -1467,10 +1554,9 @@
     }
 };
 
-void beamasm_metadata_update(
-        std::string module_name,
-        ErtsCodePtr base_address,
-        size_t code_size,
-        const std::vector<BeamAssembler::AsmRange> &ranges);
+void beamasm_metadata_update(std::string module_name,
+                             ErtsCodePtr base_address,
+                             size_t code_size,
+                             const std::vector<AsmRange> &ranges);
 void beamasm_metadata_early_init();
 void beamasm_metadata_late_init();
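
Note on the new Update flags: eHeapAlloc is simply a named alias for eHeap | eStack, and eHeapOnlyAlloc drops the stack sync in release builds. A hypothetical call site (not part of this diff) would combine them with the existing emit_enter_runtime/emit_leave_runtime templates roughly like so:

    /* Illustrative only: sync HTOP and E before calling a runtime
     * function that allocates heap memory, then reload them after. */
    emit_enter_runtime<Update::eHeapAlloc>();
    /* ... runtime call that HAllocs ... */
    emit_leave_runtime<Update::eHeapAlloc>();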
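For readers who want the paired htop/stop store outside of assembler form, here is a minimal standalone C++ sketch (not code from this commit; proc_like is a made-up stand-in for Process). Two adjacent 64-bit fields are written with one unaligned 128-bit store, which is what the emitted punpcklqdq + movups pair, or vpinsrq + vmovdqu under AVX, accomplishes:

    #include <emmintrin.h> /* SSE2 intrinsics */

    #include <cstdint>
    #include <cstdio>

    /* Stand-in for Process: all that matters is that the two fields are
     * adjacent, mirroring the ERTS_CT_ASSERT on their offsets above. */
    struct proc_like {
        uint64_t htop;
        uint64_t stop;
    };

    /* One unaligned 128-bit store updates both fields at once. */
    static void update_both(proc_like *p, uint64_t htop, uint64_t stop) {
        __m128i pair = _mm_set_epi64x((int64_t)stop, (int64_t)htop);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(&p->htop), pair);
    }

    int main() {
        proc_like p{};
        update_both(&p, 0x1000, 0x2000);
        std::printf("htop=%llx stop=%llx\n",
                    (unsigned long long)p.htop,
                    (unsigned long long)p.stop);
        return 0;
    }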
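The new emit_copy_words picks the widest vector the CPU supports, copies inline up to a four-vector cutoff, and otherwise emits a loop whose index runs from -loop_size up to zero, so the add that advances the index also sets the flags tested by jne. A scalar C++ model of that control flow, under hypothetical names (copy_words, step), might look like:

    #include <cstdint>
    #include <cstring>

    /* Scalar model of emit_copy_words' loop strategy; illustrative, not
     * the JIT's code. Indexing from -loop_size up to zero lets the index
     * update double as the loop condition, as the emitted add/jne does. */
    static void copy_words(const uint64_t *from, uint64_t *to, int32_t count) {
        const int32_t step = 2; /* words per "vector": xmm0 holds two UWords */
        int32_t iters = count / step;
        int64_t loop_size = (int64_t)iters * step * (int64_t)sizeof(uint64_t);

        /* Bias both pointers past the copied region, then walk a negative
         * index up to zero, mirroring addOffset(loop_size) + setIndex(). */
        const char *from_end = (const char *)from + loop_size;
        char *to_end = (char *)to + loop_size;

        for (int64_t i = -loop_size; i != 0; i += step * sizeof(uint64_t)) {
            std::memcpy(to_end + i, from_end + i, step * sizeof(uint64_t));
        }

        count %= step;
        if (count == 1) { /* trailing odd word, like the a.mov() tail above */
            to[iters * step] = from[iters * step];
        }
    }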
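The mov_arg()/cmp_arg() cache added to BeamModuleAssembler is easiest to reason about as a two-slot map from BEAM-register memory slots to CPU registers that is only trusted while a.offset() still equals last_movarg_offset. A toy C++ model of that invariant (illustrative only; it omits the invalidate_cache() handling for overwritten registers) could be:

    #include <string>

    /* Toy model, not the JIT's code: std::string stands in for x86::Gp and
     * x86::Mem, and `emitted` plays the role of a.offset(). */
    struct StoreCache {
        unsigned emitted = 0;    /* models a.offset() */
        unsigned valid_at = ~0u; /* models last_movarg_offset */
        std::string to1, from1;  /* first slot: memory <- register */
        std::string to2, from2;  /* second slot */

        bool valid() const { return emitted == valid_at; }

        /* Any untracked instruction invalidates the cache simply by
         * moving the offset past valid_at. */
        void emit_other() { ++emitted; }

        /* Models store_cache(): record which register was stored where. */
        void store(const std::string &src, const std::string &dst) {
            bool use_second = valid() && dst != to1;
            ++emitted; /* the `mov dst, src` itself */
            if (use_second) {
                to2 = dst; from2 = src;
            } else {
                to1 = dst; from1 = src;
                to2.clear(); from2.clear();
            }
            valid_at = emitted;
        }

        /* Models cached_reg(): the register mirroring `mem`, if any. */
        std::string lookup(const std::string &mem) const {
            if (valid()) {
                if (mem == to1) return from1;
                if (mem == to2) return from2;
            }
            return {};
        }
    };

In this model, store("rax", "X0") followed by lookup("X0") yields "rax", while an intervening emit_other() makes the lookup miss; that offset comparison is what keeps the real cache sound across arbitrary emitted code.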