Diffstat (limited to 'erts/emulator/beam/jit/x86/instr_bs.cpp')
-rw-r--r-- | erts/emulator/beam/jit/x86/instr_bs.cpp | 3066
1 file changed, 2649 insertions(+), 417 deletions(-)
diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp index ab6abff6cc..36e95df57c 100644 --- a/erts/emulator/beam/jit/x86/instr_bs.cpp +++ b/erts/emulator/beam/jit/x86/instr_bs.cpp @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2020-2022. All Rights Reserved. + * Copyright Ericsson AB 2020-2023. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ */ #include "beam_asm.hpp" +#include <numeric> extern "C" { @@ -57,12 +58,22 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, a.jmp(fail); return -1; } else { + bool can_fail = true; + mov_arg(RET, Size); - a.mov(ARG3d, RETd); - a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); - a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); - a.jne(fail); + if (always_small(Size)) { + auto [min, max] = getClampedRange(Size); + can_fail = + !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0); + comment("simplified segment size checks because " + "the types are known"); + } else { + a.mov(ARG3d, RETd); + a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); + a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); + a.jne(fail); + } if (max_size) { ASSERT(Support::isInt32((Sint)make_small(max_size))); @@ -70,19 +81,35 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, a.ja(fail); } - if (unit == 1) { + if (unit == 0) { + mov_imm(RET, 0); + } else if (unit == 1) { a.sar(RET, imm(_TAG_IMMED1_SIZE)); - a.js(fail); + if (can_fail) { + a.js(fail); + } + } else if (!can_fail && Support::isPowerOf2(unit)) { + int trailing_bits = Support::ctz<Eterm>(unit); + a.and_(RET, imm(~_TAG_IMMED1_MASK)); + if (trailing_bits < _TAG_IMMED1_SIZE) { + a.sar(RET, imm(_TAG_IMMED1_SIZE - trailing_bits)); + } else if (trailing_bits > _TAG_IMMED1_SIZE) { + a.shl(RET, imm(trailing_bits - _TAG_IMMED1_SIZE)); + } } else { /* Untag the size but don't shift it just yet, we want to fail on * overflow if the final result doesn't fit into a small. */ a.and_(RET, imm(~_TAG_IMMED1_MASK)); - a.js(fail); + if (can_fail) { + a.js(fail); + } /* Size = (Size) * (Unit) */ mov_imm(ARG3, unit); a.mul(ARG3); /* CLOBBERS ARG3! 
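
The power-of-two unit path above folds the untagging and the unit multiplication into a single shift. A minimal C++ sketch of the arithmetic, assuming the 4-bit immediate tag used on x86-64 (_TAG_IMMED1_SIZE == 4); the helper name is illustrative, not from the tree:

    #include <cstdint>

    // Illustrative only: a tagged small is (value << 4) | tag, so clearing
    // the tag leaves value << 4. Scaling by unit == 1 << k then needs one
    // shift by |4 - k| in the appropriate direction, as the emitted
    // sar/shl pair above does.
    uint64_t small_times_unit(uint64_t tagged, unsigned k /* unit == 1 << k */) {
        uint64_t shifted_value = tagged & ~UINT64_C(0xF); /* value << 4 */
        return k < 4 ? shifted_value >> (4 - k) : shifted_value << (k - 4);
    }
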
*/ - a.jo(fail); + if (can_fail) { + a.jo(fail); + } a.sar(RET, imm(_TAG_IMMED1_SIZE)); } @@ -103,7 +130,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -111,7 +138,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -143,16 +170,14 @@ void BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); load_x_reg_array(ARG2); load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -204,7 +229,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -212,7 +237,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -245,8 +270,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); /* Must be last since mov_arg() may clobber ARG1 */ a.mov(ARG1, c_p); @@ -254,8 +278,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); mov_arg(Dst, RET); } @@ -576,17 +599,18 @@ void BeamModuleAssembler::emit_i_bs_start_match3(const ArgRegister &Src, a.bind(is_binary); { - /* Src is not guaranteed to be inside the live range, so we need to - * stash it during GC. 
*/ - emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)), Live, ARG2); + emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)), + Live, + Src, + ARG2); - emit_enter_runtime<Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); /* ARG2 was set above */ runtime_call<2>(erts_bs_start_match_3); - emit_leave_runtime<Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); a.lea(ARG2, x86::qword_ptr(RET, TAG_PRIMARY_BOXED)); } @@ -650,278 +674,92 @@ void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx, mov_arg(Dst, ARG1); } -/* ARG3 = flags | (size << 3), - * ARG4 = tagged match context */ -void BeamGlobalAssembler::emit_bs_fixed_integer_shared() { - emit_enter_runtime<Update::eStack | Update::eHeap>(); - - a.mov(ARG1, c_p); - /* Unpack size ... */ - a.mov(ARG2, ARG3); - a.shr(ARG2, imm(3)); - /* ... flags. */ - a.and_(ARG3, imm(BSF_ALIGNED | BSF_LITTLE | BSF_SIGNED)); - a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); - runtime_call<4>(erts_bs_get_integer_2); - - emit_leave_runtime<Update::eStack | Update::eHeap>(); - - a.ret(); -} - -x86::Mem BeamModuleAssembler::emit_bs_get_integer_prologue(Label next, - Label fail, - int flags, - int size) { - Label aligned = a.newLabel(); - - a.mov(ARG2, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset))); - a.lea(ARG3, x86::qword_ptr(ARG2, size)); - a.cmp(ARG3, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.size))); - a.ja(fail); - - a.test(ARG2.r8(), imm(CHAR_BIT - 1)); - a.short_().je(aligned); - - /* Actually unaligned reads are quite rare, so we handle everything in a - * shared fragment. */ - mov_imm(ARG3, flags | (size << 3)); - safe_fragment_call(ga->get_bs_fixed_integer_shared()); - - /* The above call can't fail since we work on small numbers and - * bounds-tested above. */ -#ifdef JIT_HARD_DEBUG - a.jmp(next); -#else - a.short_().jmp(next); -#endif - - a.bind(aligned); - { - /* Read base address and convert offset to bytes. */ - a.mov(ARG1, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.base))); - a.shr(ARG2, imm(3)); - - /* We cannot fail from here on; bump the match context's position. 
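
The fixed-size readers being deleted here each load a big-endian field, using MOVBE when available and mov plus bswap (or xchg al, ah for 16 bits) otherwise. A portable C++ equivalent of what either instruction sequence computes on a little-endian host (helper name illustrative):

    #include <cstdint>
    #include <cstddef>
    #include <cstring>

    // What movbe (or mov + bswap) yields: the next 32 bits of the binary
    // interpreted as a big-endian integer.
    uint32_t load_be32(const unsigned char *base, size_t byte_offset) {
        uint32_t v;
        memcpy(&v, base + byte_offset, sizeof v); /* unaligned-safe */
        return __builtin_bswap32(v);              /* gcc/clang builtin */
    }
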
*/ - a.mov(emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset)), - ARG3); - - return x86::Mem(ARG1, ARG2, 0, 0, size / 8); - } -} - -void BeamModuleAssembler::emit_i_bs_get_integer_8(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 8); - - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer_16(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 16); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(x86::ax, address); - } else { - a.mov(x86::ax, address); - a.xchg(x86::al, x86::ah); - } +void BeamModuleAssembler::emit_bs_get_integer2(const ArgLabel &Fail, + const ArgRegister &Ctx, + const ArgWord &Live, + const ArgSource &Sz, + const ArgWord &Unit, + const ArgWord &Flags, + const ArgRegister &Dst) { + Uint size; + Uint flags = Flags.get(); - if (flags & BSF_SIGNED) { - a.movsx(RET, x86::ax); - } else { - a.movzx(RET, x86::ax); - } + if (flags & BSF_NATIVE) { + flags &= ~BSF_NATIVE; + flags |= BSF_LITTLE; } - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer_32(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 32); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsxd(RET, address); - } else { - /* Implicitly zero-extends to 64 bits */ - a.mov(RETd, address); - } + if (Sz.isSmall() && Sz.as<ArgSmall>().getUnsigned() < 8 * sizeof(Uint) && + (size = Sz.as<ArgSmall>().getUnsigned() * Unit.get()) < + 8 * sizeof(Uint)) { + /* Segment of a fixed size supported by bs_match. */ + const ArgVal match[] = {ArgAtom(am_ensure_at_least), + ArgWord(size), + ArgWord(1), + ArgAtom(am_integer), + Live, + ArgWord(flags), + ArgWord(size), + ArgWord(1), + Dst}; + + const Span<ArgVal> args(match, sizeof(match) / sizeof(match[0])); + emit_i_bs_match(Fail, Ctx, args); } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RETd, address); - } else { - a.mov(RETd, address); - a.bswap(RETd); - } - - if (flags & BSF_SIGNED) { - a.movsxd(RET, RETd); - } - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); -} + Label fail = resolve_beam_label(Fail); + int unit = Unit.get(); + + /* Clobbers RET + ARG3, returns a negative result if we always + * fail and further work is redundant. */ + if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { + /* This operation can be expensive if a bignum can be + * created because there can be a garbage collection. 
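
The "potentially expensive" test that follows is conservative: extraction can only build a bignum, and thus trigger a garbage collection, when the field may need SMALL_BITS or more bits. A sketch of the predicate, assuming a 64-bit build where SMALL_BITS is 60 (word size minus the 4 tag bits):

    #include <cstdint>

    // Illustrative: true when beam_jit_bs_get_integer may have to allocate
    // a bignum, in which case live registers must be saved and the full
    // heap bookkeeping done before entering the runtime system.
    bool may_build_bignum(int64_t max_size, int64_t unit) {
        const int64_t kSmallBits = 60; /* assumption: 64-bit Erlang VM */
        return max_size >= kSmallBits || max_size * unit >= kSmallBits;
    }
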
*/ + auto max = std::get<1>(getClampedRange(Sz)); + bool potentially_expensive = + max >= SMALL_BITS || (max * Unit.get()) >= SMALL_BITS; + + mov_arg(ARG3, Ctx); + mov_imm(ARG4, flags); + if (potentially_expensive) { + mov_arg(ARG6, Live); + } else { +#ifdef DEBUG + /* Never actually used. */ + mov_imm(ARG6, 1023); +#endif + } -void BeamModuleAssembler::emit_i_bs_get_integer_64(const ArgRegister &Ctx, - const ArgWord &Flags, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; + if (potentially_expensive) { + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); + } else { + comment("simplified entering runtime because result is always " + "small"); + emit_enter_runtime(); + } - mov_arg(ARG4, Ctx); + a.mov(ARG1, c_p); + if (potentially_expensive) { + load_x_reg_array(ARG2); + } else { +#ifdef DEBUG + /* Never actually used. */ + mov_imm(ARG2, 0); +#endif + } + runtime_call<6>(beam_jit_bs_get_integer); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgWord(BIG_UINT_HEAP_SIZE), Live, ARG4); + if (potentially_expensive) { + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); + } else { + emit_leave_runtime(); + } - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 64); + emit_test_the_non_value(RET); + a.je(fail); - if (flags & BSF_LITTLE) { - a.mov(RET, address); - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RET, address); - } else { - a.mov(RET, address); - a.bswap(RET); + mov_arg(Dst, RET); } } - - a.mov(ARG1, RET); - a.mov(ARG2, RET); - - /* Speculatively make a small out of the result even though it might not - * be one, and jump to the next instruction if it is. */ - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - if (flags & BSF_SIGNED) { - a.sar(ARG2, imm(SMALL_BITS - 1)); - a.add(ARG2, imm(1)); - a.cmp(ARG2, imm(1)); - a.jbe(next); - } else { - a.shr(ARG2, imm(SMALL_BITS - 1)); - a.jz(next); - } - - emit_enter_runtime(); - - a.mov(ARG2, HTOP); - if (flags & BSF_SIGNED) { - runtime_call<2>(small_to_big); - } else { - runtime_call<2>(uword_to_big); - } - a.add(HTOP, imm(sizeof(Eterm) * BIG_UINT_HEAP_SIZE)); - - emit_leave_runtime(); - - a.bind(next); - mov_arg(Dst, RET); -} - -void BeamModuleAssembler::emit_i_bs_get_integer(const ArgRegister &Ctx, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgWord &FlagsAndUnit, - const ArgSource &Sz, - const ArgRegister &Dst) { - Label fail; - int unit; - - fail = resolve_beam_label(Fail); - unit = FlagsAndUnit.get() >> 3; - - /* Clobbers RET + ARG3, returns a negative result if we always fail and - * further work is redundant. */ - if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { - mov_arg(ARG3, Ctx); - mov_arg(ARG4, FlagsAndUnit); - mov_arg(ARG6, Live); - - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); - - a.mov(ARG1, c_p); - load_x_reg_array(ARG2); - runtime_call<6>(beam_jit_bs_get_integer); - - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); - - emit_test_the_non_value(RET); - a.je(fail); - - mov_arg(Dst, RET); - } } void BeamModuleAssembler::emit_bs_test_tail2(const ArgLabel &Fail, @@ -962,9 +800,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. 
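
emit_i_bs_get_binary_all2 below extracts whatever remains of the matched binary, and it can succeed only when the remaining bit count divides evenly by the segment unit. A portable sketch of that test, using the ErlBinMatchBuffer size/offset fields the emitted code reads:

    #include <cstdint>

    // Illustrative: the binary/all extraction is legal only when the bits
    // left over divide evenly by the segment's unit.
    bool tail_fits_unit(uint64_t size_bits, uint64_t offset_bits, uint64_t unit) {
        return (size_bits - offset_bits) % unit == 0;
    }
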
*/ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); a.mov(RET, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.size))); a.sub(RET, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.offset))); @@ -981,19 +817,19 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, a.jne(resolve_beam_label(Fail)); - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.lea(ARG2, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); a.mov(ARG1, c_p); runtime_call<2>(erts_bs_get_binary_all_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); mov_arg(Dst, RET); } void BeamGlobalAssembler::emit_bs_get_tail_shared() { - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG2, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.orig))); a.mov(ARG3, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb.base))); @@ -1006,7 +842,7 @@ void BeamGlobalAssembler::emit_bs_get_tail_shared() { a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); runtime_call<5>(erts_extract_sub_binary); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); a.ret(); } @@ -1016,9 +852,7 @@ void BeamModuleAssembler::emit_bs_get_tail(const ArgRegister &Ctx, const ArgWord &Live) { mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); safe_fragment_call(ga->get_bs_get_tail_shared()); @@ -1044,7 +878,6 @@ void BeamModuleAssembler::emit_i_bs_skip_bits2(const ArgRegister &Ctx, Label fail; fail = resolve_beam_label(Fail); - if (emit_bs_get_field_size(Bits, Unit.get(), fail, RET) >= 0) { emit_bs_skip_bits(Fail, Ctx); } @@ -1076,11 +909,12 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to - * stash it during GC. */ - emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, ARG4); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), + Live, + Ctx, + ARG4); - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); a.mov(ARG2, TMP_MEM1q); @@ -1088,7 +922,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx, a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); runtime_call<4>(erts_bs_get_binary_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); emit_test_the_non_value(RET); a.je(fail); @@ -1111,19 +945,17 @@ void BeamModuleAssembler::emit_i_bs_get_float2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. 
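
The skip instruction handled above never extracts anything; it only advances the match position. A minimal sketch of the semantics the emitted code implements:

    #include <cstdint>

    // Illustrative: skipping bumps the offset, failing if the skip would
    // run past the end of the binary (offset <= size is an invariant).
    bool skip_bits(uint64_t size, uint64_t *offset, uint64_t bits) {
        if (size - *offset < bits) return false;
        *offset += bits;
        return true;
    }
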
*/ - emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, ARG4); + emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, Ctx, ARG4); if (emit_bs_get_field_size(Sz, unit, fail, ARG2, 64) >= 0) { - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); a.mov(ARG1, c_p); mov_imm(ARG3, Flags.get()); a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); runtime_call<4>(erts_bs_get_float_2); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); emit_test_the_non_value(RET); a.je(fail); @@ -1181,19 +1013,385 @@ void BeamModuleAssembler::emit_i_bs_put_utf8(const ArgLabel &Fail, } } +/* + * ARG1 = pointer to match state + * ARG2 = position in binary in bits + * ARG3 = base pointer to binary data + * RET = number of bits left in binary + * + * This fragment is called if the binary is unaligned and/or the number + * of remaining bits is less than 32. + * + * See the comment for emit_bs_get_utf8_shared() for details about the + * return value. + */ +void BeamGlobalAssembler::emit_bs_get_utf8_short_shared() { + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + + const x86::Gp ctx = ARG1; + const x86::Gp bin_position = ARG2; + const x86::Gp bin_base = ARG3; + + Label at_least_one = a.newLabel(); + Label two = a.newLabel(); + Label three_or_more = a.newLabel(); + Label four = a.newLabel(); + Label five = a.newLabel(); + Label read_done = a.newLabel(); + Label no_masking = a.newLabel(); + Label ascii = a.newLabel(); + + /* Calculate the number of bytes remaining in the binary and error + * out if less than one. */ + a.shr(RET, imm(3)); + a.test(RET, RET); + a.short_().jne(at_least_one); + + /* ZF is is already set. */ + a.ret(); + + a.bind(at_least_one); + + /* Save number of bytes remaining in binary. */ + a.mov(ARG5, RET); + + /* If the position in the binary is not byte-aligned, we'll need + * to read one more byte. */ + a.test(bin_position, imm(7)); + a.setne(ARG4.r8()); + a.movzx(ARG4d, ARG4.r8()); + a.add(RET, ARG4); + + /* Save original position in bits and set up byte offset for + * reading. */ + a.push(bin_position); + a.shr(bin_position, imm(3)); + + a.cmp(RET, imm(2)); + a.short_().je(two); + a.short_().ja(three_or_more); + + /* Read one byte (always byte-aligned). */ + a.mov(RETb, x86::byte_ptr(bin_base, bin_position)); + a.movzx(RETd, RETb); + a.short_().jmp(read_done); + + /* Read two bytes. */ + a.bind(two); + a.mov(RET.r16(), x86::word_ptr(bin_base, bin_position)); + a.movzx(RETd, RET.r16()); + a.short_().jmp(read_done); + + a.bind(three_or_more); + a.cmp(RET, imm(4)); + a.short_().je(four); + a.short_().ja(five); + + /* Read three bytes. */ + a.mov(RET.r8(), x86::byte_ptr(bin_base, bin_position, 0, 2)); + a.movzx(RETd, RETb); + a.shl(RETd, imm(16)); + a.mov(RET.r16(), x86::word_ptr(bin_base, bin_position)); + a.short_().jmp(read_done); + + /* Read four bytes (always unaligned). */ + a.bind(four); + a.mov(RETd, x86::dword_ptr(bin_base, bin_position)); + a.short_().jmp(read_done); + + /* Read five bytes (always unaligned). */ + a.bind(five); + a.mov(RETd, x86::dword_ptr(bin_base, bin_position)); + a.mov(ARG4.r8(), x86::byte_ptr(bin_base, bin_position, 0, 4)); + a.movzx(ARG4d, ARG4.r8()); + a.shl(ARG4, imm(32)); + a.or_(RET, ARG4); + + /* Handle the bytes read. 
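
The case analysis in this fragment reads between one and five bytes. A sketch of the byte count it ends up with, given the bit position and the bits remaining (my reading of the fragment; the helper is illustrative):

    #include <cstdint>
    #include <cstddef>

    // Illustrative: whole bytes remaining, plus one extra byte when the
    // read does not start on a byte boundary. Five bytes always suffice,
    // since a UTF-8 sequence spans at most four bytes plus alignment slack.
    size_t utf8_read_bytes(uint64_t bit_offset, uint64_t bits_left) {
        size_t n = (size_t)(bits_left >> 3) + ((bit_offset & 7) != 0);
        return n > 5 ? 5 : n;
    }
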
*/ + a.bind(read_done); + a.pop(bin_position); + a.bswap(RET); + + if (x86::rcx == ctx) { + a.push(x86::rcx); + } + a.mov(x86::ecx, bin_position.r32()); + a.and_(x86::cl, imm(7)); + a.shl(RET, x86::cl); + + /* Check whether we will need to clear out trailing + * garbage not part of the binary. */ + a.mov(x86::cl, 64); + a.cmp(ARG5, imm(3)); + a.short_().ja(no_masking); + + /* Calculate a byte mask and zero out trailing garbage. */ + a.shl(ARG5d, imm(3)); + a.sub(x86::cl, ARG5.r8()); + mov_imm(ARG5, -1); + a.shl(ARG5, x86::cl); + a.and_(RET, ARG5); + + a.bind(no_masking); + if (x86::rcx == ctx) { + a.pop(x86::rcx); + } + + /* `test rax, rax` is a shorter instruction but can cause a warning + * in valgrind if there are any uninitialized bits in rax. */ + a.bt(RET, imm(63)); + a.short_().jnc(ascii); + + /* The bs_get_utf8_shared fragment expects the contents in RETd. */ + a.shr(RET, imm(32)); + a.jmp(labels[bs_get_utf8_shared]); + + /* Handle plain old ASCII (code point < 128). */ + a.bind(ascii); + a.add(x86::qword_ptr(ctx, position_offset), imm(8)); + a.shr(RET, imm(56 - _TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); /* Always clears ZF. */ + a.ret(); +} + +/* + * ARG1 = pointer to match state + * ARG2 = position in binary in bits + * RETd = 4 bytes read from the binary in big-endian order + * + * On successful return, the extracted code point is in RET, the + * position in the match state has been updated, and the ZF is clear. + * On failure, the ZF is set. + */ +void BeamGlobalAssembler::emit_bs_get_utf8_shared() { + Label error = a.newLabel(); + + x86::Gp shift_q = ARG4, shift_d = ARG4d, shift_b = ARG4.r8(); + x86::Gp original_value_d = RETd; + + x86::Gp byte_count_q = ARG2, byte_count_d = ARG2d; + x86::Gp extracted_value_d = ARG3d, extracted_value_b = ARG3.r8(); + x86::Gp control_mask_d = ARG5d; + x86::Gp error_mask_d = ARG6d; + + ASSERT(extracted_value_d != shift_d); + ASSERT(control_mask_d != shift_d); + ASSERT(error_mask_d != shift_d); + ASSERT(byte_count_d != shift_d); + + /* UTF-8 has the following layout, where 'x' are data bits: + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110xxxxx, 10xxxxxx + * 3 bytes: 1110xxxx, 10xxxxxx 10xxxxxx + * 4 bytes: 11110xxx, 10xxxxxx 10xxxxxx 10xxxxxx + * + * Note that the number of leading bits is equal to the number of bytes, + * which makes it very easy to create masks for extraction and error + * checking. */ + + /* The PEXT instruction has poor latency on some processors, so we try to + * hide that by extracting early on. Should this be a problem, it's not + * much slower to hand-roll it with shifts or BEXTR. + * + * The mask covers data bits from all variants. This includes the 23rd bit + * to support the 2-byte case, which is set on all well-formed 4-byte + * codepoints, so it must be cleared before range testing .*/ + a.mov(extracted_value_d, imm(0x1F3F3F3F)); + a.pext(extracted_value_d, original_value_d, extracted_value_d); + + /* Preserve current match buffer and bit offset. */ + a.push(ARG1); + a.push(ARG2); + + /* Byte count = leading bit count. */ + a.mov(byte_count_d, original_value_d); + a.not_(byte_count_d); + a.lzcnt(byte_count_d, byte_count_d); + + /* Mask shift = (4 - byte count) * 8 */ + a.mov(shift_d, imm(4)); + a.sub(shift_d, byte_count_d); + a.lea(shift_d, x86::qword_ptr(0, shift_q, 3)); + + /* Shift the original value and masks into place. */ + a.shrx(original_value_d, original_value_d, shift_d); + + /* Matches the '10xxxxxx' components, leaving the header byte alone. 
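
Two BMI building blocks do the heavy lifting in this fragment. A portable sketch of each using the corresponding compiler intrinsics; this mirrors the emitted code rather than replacing it:

    #include <cstdint>
    #include <immintrin.h> /* _lzcnt_u32, _pext_u32 (LZCNT/BMI2) */

    // Sequence length = count of leading one bits in the first byte,
    // computed over the whole big-endian word exactly as the lzcnt above.
    unsigned utf8_seq_bytes(uint32_t be_word) {
        return (unsigned)_lzcnt_u32(~be_word);
    }

    // Payload extraction: gather the data bits of all four candidate bytes
    // in one step. The mask keeps 5+6+6+6 bits, one bit too many for the
    // 2-byte form, which is why bit 22 is cleared again further down.
    uint32_t utf8_payload(uint32_t be_word) {
        return _pext_u32(be_word, 0x1F3F3F3Fu);
    }
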
*/ + a.mov(control_mask_d, imm(0x00C0C0C0)); + a.shrx(control_mask_d, control_mask_d, shift_d); + a.mov(error_mask_d, imm(0x00808080)); + a.shrx(error_mask_d, error_mask_d, shift_d); + + /* Extracted value shift = (4 - byte count) * 6, as the leading '10' on + * every byte has been removed through PEXT. + * + * We calculate the shift here to avoid depending on byte_count_d later on + * when it may have changed. */ + a.mov(shift_d, imm(4)); + a.sub(shift_d, byte_count_d); + a.add(shift_d, shift_d); + a.lea(shift_d, x86::qword_ptr(shift_q, shift_q, 1)); + + /* Assert that the header bits of each '10xxxxxx' component is correct, + * signalling errors by trashing the byte count with a guaranteed-illegal + * value. */ + a.and_(original_value_d, control_mask_d); + a.cmp(original_value_d, error_mask_d); + a.cmovne(byte_count_d, error_mask_d); + + /* Shift the extracted value into place. */ + a.shrx(RETd, extracted_value_d, shift_d); + + /* The extraction mask is a bit too wide, see above for details. */ + a.and_(RETd, imm(~(1 << 22))); + + /* Check for too large code point. */ + a.cmp(RETd, imm(0x10FFFF)); + a.cmova(byte_count_d, error_mask_d); + + /* Check for the illegal range 16#D800 - 16#DFFF. */ + a.mov(shift_d, RETd); + a.and_(shift_d, imm(-0x800)); + a.cmp(shift_d, imm(0xD800)); + a.cmove(byte_count_d, error_mask_d); + + /* Test for overlong UTF-8 sequence. That can be done by testing + * that the bits marked y below are all zero. + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110yyyyx, 10xxxxxx + * 3 bytes: 1110yyyy, 10yxxxxx 10xxxxxx + * 4 bytes: 11110yyy, 10yyxxxx 10xxxxxx 10xxxxxx + * + * 1 byte: xx'xxxxx + * 2 bytes: y'yyyxx'xxxxx + * 3 bytes: y'yyyyx'xxxxx'xxxxx + * 4 bytes: y'yyyyx'xxxxx'xxxxx'xxxxx + * + * The y bits can be isolated by shifting down by the number of bits + * shown in this table: + * + * 2: 7 (byte_count * 4 - 1) + * 3: 11 (byte_count * 4 - 1) + * 4: 16 (byte_count * 4) + */ + + /* Calculate number of bits to shift. */ + a.lea(shift_d, x86::qword_ptr(0, byte_count_q, 2)); + a.cmp(byte_count_d, imm(4)); + a.setne(extracted_value_b); + a.sub(shift_b, extracted_value_b); + a.movzx(shift_q, shift_b); + + /* Now isolate the y bits and compare to zero. */ + a.shrx(extracted_value_d, RETd, shift_d); + a.test(extracted_value_d, extracted_value_d); + a.cmove(byte_count_d, error_mask_d); + + /* Restore current bit offset and match buffer. */ + ASSERT(ARG1 != byte_count_q && ARG3 != byte_count_q); + a.pop(ARG3); + a.pop(ARG1); + + /* Advance our current position. */ + a.lea(ARG3, x86::qword_ptr(ARG3, byte_count_q, 3)); + + /* Byte count must be 2, 3, or 4. */ + a.sub(byte_count_d, imm(2)); + a.cmp(byte_count_d, imm(2)); + a.ja(error); + + a.mov(x86::qword_ptr(ARG1, offsetof(ErlBinMatchBuffer, offset)), ARG3); + + a.shl(RETd, imm(_TAG_IMMED1_SIZE)); + a.or_(RETd, imm(_TAG_IMMED1_SMALL)); /* Always clears ZF. */ + + a.ret(); + + a.bind(error); + { + /* Signal error by setting ZF. 
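
The fragment folds all of its validity checks into the byte-count register. In portable form, the conditions it enforces are these (a sketch; the minimum-value table follows from the overlong analysis in the comments above):

    #include <cstdint>

    // Illustrative: a decoded code point is valid for an n-byte sequence
    // (n in 2..4) iff it is in range, not a UTF-16 surrogate, and not an
    // overlong encoding of a shorter form.
    bool valid_utf8_decode(uint32_t cp, unsigned n) {
        static const uint32_t min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
        if (cp > 0x10FFFF) return false;                     /* beyond Unicode  */
        if ((cp & ~UINT32_C(0x7FF)) == 0xD800) return false; /* surrogate range */
        return cp >= min_cp[n];                              /* reject overlong */
    }
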
*/ + a.xor_(RET, RET); + a.ret(); + } +} + void BeamModuleAssembler::emit_bs_get_utf8(const ArgRegister &Ctx, const ArgLabel &Fail) { - mov_arg(ARG1, Ctx); + const int base_offset = offsetof(ErlBinMatchBuffer, base); + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + const int size_offset = offsetof(ErlBinMatchBuffer, size); + + const x86::Gp ctx = ARG1; + const x86::Gp bin_position = ARG2; + const x86::Gp bin_base = ARG3; + + Label multi_byte = a.newLabel(), fallback = a.newLabel(), + check = a.newLabel(), done = a.newLabel(); + + mov_arg(ctx, Ctx); + a.lea(ctx, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); + + a.mov(bin_position, x86::qword_ptr(ctx, position_offset)); + a.mov(RET, x86::qword_ptr(ctx, size_offset)); + a.mov(bin_base, x86::qword_ptr(ctx, base_offset)); + a.sub(RET, bin_position); + a.cmp(RET, imm(32)); + a.short_().jb(fallback); + + a.test(bin_position, imm(7)); + a.short_().jnz(fallback); + + /* We're byte-aligned and can read at least 32 bits. */ + a.mov(RET, bin_position); + a.shr(RET, 3); + + /* The most significant bits come first, so we'll read the the next four + * bytes as big-endian so we won't have to reorder them later. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, x86::dword_ptr(bin_base, RET)); + } else { + a.mov(RETd, x86::dword_ptr(bin_base, RET)); + a.bswap(RETd); + } + a.test(RETd, RETd); + a.short_().js(multi_byte); - emit_enter_runtime(); + /* Handle plain old ASCII (code point < 128). */ + a.add(x86::qword_ptr(ctx, position_offset), imm(8)); + a.shr(RETd, imm(24 - _TAG_IMMED1_SIZE)); + a.or_(RETd, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(done); - a.lea(ARG1, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); - runtime_call<1>(erts_bs_get_utf8); + a.bind(multi_byte); - emit_leave_runtime(); + if (hasCpuFeature(CpuFeatures::X86::kBMI2)) { + /* This CPU supports the PEXT and SHRX instructions. */ + safe_fragment_call(ga->get_bs_get_utf8_shared()); + a.short_().jmp(check); + } - emit_test_the_non_value(RET); + /* Take care of unaligned binaries and binaries with less than 32 + * bits left. */ + a.bind(fallback); + if (hasCpuFeature(CpuFeatures::X86::kBMI2)) { + /* This CPU supports the PEXT and SHRX instructions. 
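
The ASCII fast path earlier in this function tags its result inline: after the big-endian load the code point sits in bits 31..24, and a small is the value shifted left by the tag size with the small tag in the low bits. A sketch, assuming the 4-bit immediate tag and a small tag of 0xF as on x86-64:

    #include <cstdint>

    // Illustrative: make_small(cp) for the one-byte case, starting from the
    // big-endian 32-bit word whose top byte is the ASCII character.
    uint32_t tag_ascii_small(uint32_t be_word) {
        return (be_word >> (24 - 4)) | 0xF; /* 0xF: assumed _TAG_IMMED1_SMALL */
    }
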
*/ + safe_fragment_call(ga->get_bs_get_utf8_short_shared()); + } else { + emit_enter_runtime(); + + runtime_call<1>(erts_bs_get_utf8); + + emit_leave_runtime(); + + emit_test_the_non_value(RET); + } + + a.bind(check); a.je(resolve_beam_label(Fail)); + + a.bind(done); } void BeamModuleAssembler::emit_i_bs_get_utf8(const ArgRegister &Ctx, @@ -1286,8 +1484,8 @@ void BeamModuleAssembler::emit_validate_unicode(Label next, Label fail, x86::Gp value) { a.mov(ARG3d, value.r32()); - a.and_(ARG3d, imm(_TAG_IMMED1_MASK)); - a.cmp(ARG3d, imm(_TAG_IMMED1_SMALL)); + a.and_(ARG3d.r8(), imm(_TAG_IMMED1_MASK)); + a.cmp(ARG3d.r8(), imm(_TAG_IMMED1_SMALL)); a.jne(fail); a.cmp(value, imm(make_small(0xD800UL))); @@ -1485,13 +1683,13 @@ void BeamModuleAssembler::emit_i_bs_append(const ArgLabel &Fail, mov_arg(ArgXRegister(Live.get()), Bin); - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); load_x_reg_array(ARG2); runtime_call<6>(erts_bs_append); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); emit_test_the_non_value(RET); @@ -1544,18 +1742,18 @@ void BeamModuleAssembler::emit_i_bs_private_append(const ArgLabel &Fail, } void BeamModuleAssembler::emit_bs_init_writable() { - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); a.mov(ARG1, c_p); a.mov(ARG2, getXRef(0)); runtime_call<2>(erts_bs_init_writable); a.mov(getXRef(0), RET); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); } void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { - emit_enter_runtime<Update::eStack | Update::eHeap>(); + emit_enter_runtime<Update::eHeapAlloc>(); /* ARG3 is already set by the caller */ a.mov(ARG2, ARG4); @@ -1563,7 +1761,7 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.mov(ARG1, c_p); runtime_call<4>(beam_jit_bs_construct_fail_info); - emit_leave_runtime<Update::eStack | Update::eHeap>(); + emit_leave_runtime<Update::eHeapAlloc>(); /* We must align the return address to make it a proper tagged CP, in case * we were called with `safe_fragment_call`. This is safe because we will @@ -1587,10 +1785,52 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.jmp(labels[raise_exception_shared]); } +/* + * ARG1 = tagged bignum term + * + * On return, Z is set if ARG1 is not a bignum. Otherwise, Z is clear and + * ARG1 is the 64 least significant bits of the bignum. + */ +void BeamGlobalAssembler::emit_get_sint64_shared() { + Label success = a.newLabel(); + Label fail = a.newLabel(); + + emit_is_boxed(fail, ARG1); + x86::Gp boxed_ptr = emit_ptr_val(ARG4, ARG1); + a.mov(ARG2, emit_boxed_val(boxed_ptr)); + a.mov(ARG3, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + a.and_(ARG2, imm(_TAG_HEADER_MASK)); + a.cmp(ARG2, imm(POS_BIG_SUBTAG)); + a.je(success); + + a.cmp(ARG2, imm(NEG_BIG_SUBTAG)); + a.jne(fail); + + a.neg(ARG3); + + a.bind(success); + { + a.mov(ARG1, ARG3); + /* Clear Z flag. + * + * ARG2 is known to be POS_BIG_SUBTAG or NEG_BIG_SUBTAG at this point. 
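
The contract of this fragment in portable form, assuming the usual bignum layout of a header word followed by 64-bit digits (a sketch only; the subtag constants are passed in rather than guessed):

    #include <cstdint>

    // Illustrative: the 64 least significant bits of a bignum, negated when
    // the subtag marks the number as negative. The fragment reads digit 0
    // unconditionally and fixes the sign afterwards, exactly as here.
    int64_t bignum_low64(uint64_t header, uint64_t digit0,
                         uint64_t neg_subtag, uint64_t header_mask) {
        bool negative = (header & header_mask) == neg_subtag;
        return negative ? -(int64_t)digit0 : (int64_t)digit0;
    }
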
+ */ + ERTS_CT_ASSERT(POS_BIG_SUBTAG != 0 && NEG_BIG_SUBTAG != 0); + a.test(ARG2, ARG2); + a.ret(); + } + + a.bind(fail); + { + a.xor_(ARG2, ARG2); /* Set Z flag */ + a.ret(); + } +} + struct BscSegment { BscSegment() : type(am_false), unit(1), flags(0), src(ArgNil()), size(ArgNil()), - error_info(0), effectiveSize(-1) { + error_info(0), effectiveSize(-1), action(action::DIRECT) { } Eterm type; @@ -1601,8 +1841,482 @@ struct BscSegment { Uint error_info; Sint effectiveSize; + + /* Here are sub actions for storing integer segments. + * + * We use the ACCUMULATE_FIRST and ACCUMULATE actions to shift the + * values of segments with known, small sizes (no more than 64 bits) + * into an accumulator register. + * + * When no more segments can be accumulated, the STORE action is + * used to store the value of the accumulator into the binary. + * + * The DIRECT action is used when it is not possible to use the + * accumulator (for unknown or too large sizes). + */ + enum class action { DIRECT, ACCUMULATE_FIRST, ACCUMULATE, STORE } action; }; +static std::vector<BscSegment> bs_combine_segments( + const std::vector<BscSegment> segments) { + std::vector<BscSegment> segs; + + for (auto seg : segments) { + switch (seg.type) { + case am_integer: { + if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) { + /* Unknown or too large size. Handle using the default + * DIRECT action. */ + segs.push_back(seg); + continue; + } + + if (seg.flags & BSF_LITTLE || segs.size() == 0 || + segs.back().action == BscSegment::action::DIRECT) { + /* There are no previous compatible ACCUMULATE / STORE + * actions. Create the first ones. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + auto prev = segs.back(); + if (prev.flags & BSF_LITTLE) { + /* Little-endian segments cannot be combined with other + * segments. Create new ACCUMULATE_FIRST / STORE actions. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + /* The current segment is compatible with the previous + * segment. Try combining them. */ + if (prev.effectiveSize + seg.effectiveSize <= 64) { + /* The combined values of the segments fits in the + * accumulator. Insert an ACCUMULATE action for the + * current segment before the pre-existing STORE + * action. */ + segs.pop_back(); + prev.effectiveSize += seg.effectiveSize; + seg.action = BscSegment::action::ACCUMULATE; + segs.push_back(seg); + segs.push_back(prev); + } else { + /* The size exceeds 64 bits. Can't combine. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + } + break; + } + default: + segs.push_back(seg); + break; + } + } + return segs; +} + +/* + * In: + * bin_offset = if valid, register to store the lower 32 bits + * of the bit offset into the binary + * bin_ptr = register to store pointer to current byte in + * bit_offset = current bit offset into binary, or -1 if unknown + * size = size of segment to be constructed + * (ignored if size_reg is valid register) + * size_reg = if a valid register, it contains the size of + * the segment to be constructed + * + * Out: + * bin_offset register = the lower 32 bits of the bit offset + * into the binary + * bin_ptr register = pointer to current byte + * + * Preserves all other registers except RET. 
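
Before moving on to update_bin_state, a worked example makes the combining pass in bs_combine_segments above concrete. For three fixed-size big-endian integer segments whose total fits in 64 bits, the rewrite is (trace is illustrative, not code from the tree):

    // <<A:16/big, B:8, C:8>>  --- all sizes known, total 32 bits <= 64
    //
    //   before: DIRECT(A,16) DIRECT(B,8) DIRECT(C,8)
    //   after:  ACCUMULATE_FIRST(A,16) ACCUMULATE(B,8) ACCUMULATE(C,8)
    //           STORE(32)
    //
    // Each ACCUMULATE shifts the accumulator left by the segment size and
    // ors the value in, so it ends up holding (A << 16) | (B << 8) | C,
    // and the STORE action writes it with a single 32-bit store.
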
+ */ +void BeamModuleAssembler::update_bin_state(x86::Gp bin_offset, + x86::Gp current_byte, + Sint bit_offset, + Sint size, + x86::Gp size_reg) { + const int x_reg_offset = offsetof(ErtsSchedulerRegisters, x_reg_array.d); + const int cur_bin_base = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_current_bin_); + const int cur_bin_offset = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_bin_offset_); + + x86::Mem mem_bin_base = + x86::Mem(registers, cur_bin_base - x_reg_offset, sizeof(UWord)); + x86::Mem mem_bin_offset = + x86::Mem(registers, cur_bin_offset - x_reg_offset, sizeof(UWord)); + + if (bit_offset % 8 != 0 || !Support::isInt32(bit_offset + size)) { + /* The bit offset is unknown or not byte-aligned. Alternatively, + * the sum of bit_offset and size does not fit in an immediate. */ + a.mov(current_byte, mem_bin_offset); + a.mov(RET, mem_bin_base); + + if (bin_offset.isValid()) { + a.mov(bin_offset.r32(), current_byte.r32()); + } + if (size_reg.isValid()) { + a.add(mem_bin_offset, size_reg); + } else { + a.add(mem_bin_offset, imm(size)); + } + a.shr(current_byte, imm(3)); + a.add(current_byte, RET); + } else { + ASSERT(size >= 0 || size_reg.isValid()); + ASSERT(bit_offset % 8 == 0); + + comment("optimized updating of binary construction state"); + a.mov(current_byte, mem_bin_base); + if (bit_offset) { + a.add(current_byte, imm(bit_offset >> 3)); + } + if (size_reg.isValid()) { + a.add(mem_bin_offset, size_reg); + } else { + a.mov(mem_bin_offset, imm(bit_offset + size)); + } + } +} + +bool BeamModuleAssembler::need_mask(const ArgVal Val, Sint size) { + if (size == 64) { + return false; + } else { + auto [min, max] = getClampedRange(Val); + return !(0 <= min && max >> size == 0); + } +} + +/* + * The size of the segment is assumed to be in ARG3. + */ +void BeamModuleAssembler::set_zero(Sint effectiveSize) { + update_bin_state(ARG2, ARG1, -1, -1, ARG3); + + mov_imm(RET, 0); + + if (effectiveSize < 0 || effectiveSize > 128) { + /* Size is unknown or greater than 128. Modern CPUs have an + * enhanced "rep stosb" instruction that in most circumstances + * is the fastest way to clear blocks of more than 128 + * bytes. */ + Label done = a.newLabel(); + + if (effectiveSize < 0) { + a.test(ARG3, ARG3); + a.short_().jz(done); + } + + if (ARG1 != x86::rdi) { + a.mov(x86::rdi, ARG1); + } + a.mov(x86::rcx, ARG3); + a.add(x86::rcx, imm(7)); + a.shr(x86::rcx, imm(3)); + a.rep().stosb(); + + a.bind(done); + } else { + /* The size is known and it is at most 128 bits. */ + Uint offset = 0; + + ASSERT(0 <= effectiveSize && effectiveSize <= 128); + + if (effectiveSize == 128) { + a.mov(x86::Mem(ARG1, offset, 8), RET); + offset += 8; + } + + if (effectiveSize >= 64) { + a.mov(x86::Mem(ARG1, offset, 8), RET); + offset += 8; + } + + if ((effectiveSize & 63) >= 32) { + a.mov(x86::Mem(ARG1, offset, 4), RETd); + offset += 4; + } + + if ((effectiveSize & 31) >= 16) { + a.mov(x86::Mem(ARG1, offset, 2), RET.r16()); + offset += 2; + } + + if ((effectiveSize & 15) >= 8) { + a.mov(x86::Mem(ARG1, offset, 1), RET.r8()); + offset += 1; + } + + if ((effectiveSize & 7) > 0) { + a.mov(x86::Mem(ARG1, offset, 1), RET.r8()); + } + } +} + +/* + * In: + * + * ARG3 = valid unicode code point (=> 0x80) to encode + * + * Out: + * + * ARG3d = the code point encoded in UTF-8. + * ARG2 = number of bits of result (16, 24, or 32) + * + * Clobbers RET and the other ARG* registers. 
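
Note that this fragment produces the encoded bytes in memory (store) order, with the first UTF-8 byte in the least significant position. A portable sketch of the two-byte case it computes; as a check, U+00E9 encodes to 0xA9C3, which a little-endian 16-bit store writes as the bytes C3 A9:

    #include <cstdint>

    // Illustrative: two-byte UTF-8 (0x80 <= cp < 0x800), assembled so that
    // a plain little-endian store writes the bytes in the right order.
    uint32_t encode_utf8_2(uint32_t cp) {
        return 0x80C0u | ((cp & 0x3F) << 8) | (cp >> 6);
    }
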
+ */ +void BeamGlobalAssembler::emit_construct_utf8_shared() { + Label more_than_two_bytes = a.newLabel(); + Label four_bytes = a.newLabel(); + const x86::Gp tmp1 = ARG1; + const x86::Gp tmp2 = ARG2; + const x86::Gp value = ARG3; + const x86::Gp num_bits = ARG2; + + a.mov(RETd, value.r32()); + a.and_(RETd, imm(0x3f)); + + a.cmp(value.r32(), imm(0x800)); + a.jae(more_than_two_bytes); + + a.shl(RETd, imm(8)); + + a.shr(value, imm(6)); + + a.or_(value.r32(), RETd); + a.or_(value.r32(), imm(0x80c0)); + + mov_imm(num_bits, 16); + a.ret(); + + /* Test whether the value should be encoded in four bytes. */ + a.bind(more_than_two_bytes); + a.cmp(value.r32(), imm(0x10000)); + a.jae(four_bytes); + + /* Encode Unicode code point in three bytes. */ + a.shl(RETd, imm(16)); + + a.lea(tmp1.r32(), x86::Mem(0ULL, ARG3, 2, 0)); + a.and_(tmp1.r32(), imm(0x3f00)); + + a.shr(value.r32(), imm(12)); + a.or_(value.r32(), tmp1.r32()); + a.or_(value.r32(), RETd); + a.or_(value.r32(), imm(0x8080e0)); + + mov_imm(num_bits, 24); + a.ret(); + + /* Encode Unicode code point in four bytes. */ + a.bind(four_bytes); + a.shl(RETd, imm(24)); + + a.mov(tmp1.r32(), value.r32()); + a.shl(tmp1.r32(), imm(10)); + a.and_(tmp1.r32(), imm(0x3f0000)); + + a.mov(tmp2.r32(), value.r32()); + a.shr(tmp2.r32(), imm(4)); + a.and_(tmp2.r32(), imm(0x3f00)); + + a.shr(value.r32(), imm(18)); + + a.or_(value.r32(), RETd); + a.or_(value.r32(), tmp1.r32()); + a.or_(value.r32(), tmp2.r32()); + a.or_(value.r32(), imm(0xffffffff808080f0)); + + mov_imm(num_bits, 32); + a.ret(); +} + +void BeamModuleAssembler::emit_construct_utf8(const ArgVal &Src, + Sint bit_offset, + bool is_byte_aligned) { + Label prepare_store = a.newLabel(); + Label store = a.newLabel(); + Label next = a.newLabel(); + +#ifdef WIN32 + const x86::Gp bin_ptr = ARG4; + const x86::Gp bin_offset = is_byte_aligned ? x86::Gp() : ARG1; +#else + const x86::Gp bin_ptr = ARG1; + const x86::Gp bin_offset = is_byte_aligned ? x86::Gp() : ARG4; +#endif + ASSERT(!bin_offset.isValid() || bin_offset == x86::rcx); + + /* The following two registers must be the same as + * emit_construct_utf8_shared() expects. */ + const x86::Gp code_point = ARG3; + const x86::Gp size_reg = ARG2; + + comment("construct utf8 segment"); + + mov_arg(code_point, Src); + a.shr(code_point.r32(), imm(_TAG_IMMED1_SIZE)); + mov_imm(size_reg, 8); + a.cmp(code_point, imm(0x80)); + a.jb(prepare_store); + + safe_fragment_call(ga->get_construct_utf8_shared()); + + a.bind(prepare_store); + + update_bin_state(bin_offset, bin_ptr, bit_offset, -1, size_reg); + + if (!is_byte_aligned) { + /* Bit offset is unknown and is not known to be + * byte aligned. Must test alignment. */ + a.and_(bin_offset.r32(), imm(7)); + a.je(store); + + /* We must combine the last partial byte with the UTF-8 + * encoded code point. */ + + a.movzx(RETd, x86::byte_ptr(bin_ptr)); + + a.bswap(code_point); + a.shr(code_point, bin_offset.r8()); + a.bswap(code_point); + + a.shl(RETd, bin_offset.r8()); + a.and_(RETd, imm(~0xff)); + a.shr(RETd, bin_offset.r8()); + + a.or_(code_point, RET); + + a.add(size_reg.r32(), imm(8)); + } + + a.bind(store); + if (bit_offset % (4 * 8) == 0) { + /* This segment is aligned on a 4-byte boundary. This implies + * that a 4-byte write will be inside the allocated binary. */ + a.mov(x86::dword_ptr(bin_ptr), code_point.r32()); + } else { + Label do_store_1 = a.newLabel(); + Label do_store_2 = a.newLabel(); + + /* Unsuitable or unknown alignment. We must be careful not + * to write beyound the allocated end of the binary. 
*/ + a.cmp(size_reg.r8(), imm(8)); + a.short_().jne(do_store_1); + + a.mov(x86::byte_ptr(bin_ptr), code_point.r8()); + a.short_().jmp(next); + + a.bind(do_store_1); + a.cmp(size_reg.r8(), imm(24)); + a.ja(do_store_2); + + a.mov(x86::word_ptr(bin_ptr), code_point.r16()); + a.cmp(size_reg.r8(), imm(16)); + a.short_().je(next); + + a.shr(code_point.r32(), imm(16)); + a.mov(x86::byte_ptr(bin_ptr, 2), code_point.r8()); + a.short_().jmp(next); + + a.bind(do_store_2); + a.mov(x86::dword_ptr(bin_ptr), code_point.r32()); + + if (!is_byte_aligned) { + a.cmp(size_reg.r8(), imm(32)); + a.je(next); + + a.shr(code_point, imm(32)); + a.mov(x86::byte_ptr(bin_ptr, 4), code_point.r8()); + } + } + + a.bind(next); +} +/* + * In: + * ARG1 = pointer to current byte + * ARG3 = bit offset + * ARG4 = number of bits to write + * ARG5 = data to write + */ +void BeamGlobalAssembler::emit_store_unaligned() { + Label loop = a.newLabel(); + Label done = a.newLabel(); + const x86::Gp bin_ptr = ARG1; + const x86::Gp left_bit_offset = ARG3; + const x86::Gp right_bit_offset = ARG2; + const x86::Gp num_bits = ARG4; + const x86::Gp bitdata = ARG5; + + a.movzx(RETd, x86::byte_ptr(bin_ptr)); + + a.xchg(left_bit_offset, x86::rcx); + + a.mov(right_bit_offset, bitdata); + a.and_(right_bit_offset, imm(0xff)); + a.shr(right_bit_offset, x86::cl); + + a.shl(RETd, x86::cl); + a.and_(RETd, imm(~0xff)); + a.shr(RETd, x86::cl); + + a.xchg(left_bit_offset, x86::rcx); + + a.or_(RETd, ARG2d); + a.mov(byte_ptr(ARG1), RETb); + a.add(ARG1, imm(1)); + + mov_imm(right_bit_offset, 8); + a.sub(right_bit_offset, left_bit_offset); + + a.xchg(right_bit_offset, x86::rcx); + a.bswap(bitdata); + a.shl(bitdata, x86::cl); + a.xchg(right_bit_offset, x86::rcx); + + a.sub(ARG4, right_bit_offset); + a.jle(done); + + a.bind(loop); + a.rol(bitdata, imm(8)); + a.mov(byte_ptr(ARG1), bitdata.r8()); + a.add(ARG1, imm(1)); + a.sub(num_bits, imm(8)); + a.jg(loop); + + a.bind(done); + a.ret(); +} + +bool BeamModuleAssembler::bs_maybe_enter_runtime(bool entered) { + if (!entered) { + comment("enter runtime"); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(); + } + return true; +} + +void BeamModuleAssembler::bs_maybe_leave_runtime(bool entered) { + if (entered) { + comment("leave runtime"); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(); + } +} + void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, const ArgWord &Alloc, const ArgWord &Live0, @@ -1611,10 +2325,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, Uint num_bits = 0; std::size_t n = args.size(); std::vector<BscSegment> segments; - Label error = a.newLabel(); - Label past_error = a.newLabel(); + Label error; /* Intentionally uninitialized */ ArgWord Live = Live0; x86::Gp sizeReg; + Sint allocated_size = -1; + bool need_error_handler = false; + bool runtime_entered = false; /* * Collect information about each segment and calculate sizes of @@ -1660,12 +2376,45 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, seg.error_info = beam_jit_set_bsc_segment_op(bsc_segment, bsc_op); /* + * Test whether we can omit the code for the error handler. 
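
Back in the emit_store_unaligned fragment above, the first byte is a read-modify-write: the bits already written to the binary are kept and the top of the new data is spliced in below them. A portable sketch, where off is the bit offset within the byte (1..7):

    #include <cstdint>

    // Illustrative: preserve the `off` most significant bits already in the
    // binary and fill the rest from the first byte of the new data.
    unsigned char merge_first_byte(unsigned char in_bin,
                                   unsigned char first_data_byte,
                                   unsigned off) {
        unsigned char keep = (unsigned char)(0xFFu << (8 - off));
        return (unsigned char)((in_bin & keep) | (first_data_byte >> off));
    }
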
+ */ + switch (seg.type) { + case am_append: + if (!(exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_binary: + if (!(seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && + exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_integer: + if (!exact_type<BeamTypeId::Integer>(seg.src)) { + need_error_handler = true; + } + break; + case am_private_append: + case am_string: + break; + default: + need_error_handler = true; + break; + } + + /* * As soon as we have entered runtime mode, Y registers can no * longer be accessed in the usual way. Therefore, if the source - * and/or size are in Y register, copy them to X registers. + * and/or size are in Y registers, copy them to X registers. Be + * careful to preserve any associated type information. */ if (seg.src.isYRegister()) { - ArgVal reg = ArgXRegister(Live.get()); + auto reg = + seg.src.as<ArgYRegister>().copy<ArgXRegister>(Live.get()); + ASSERT(reg.typeIndex() == seg.src.as<ArgYRegister>().typeIndex()); mov_arg(reg, seg.src); Live = Live + 1; @@ -1673,7 +2422,9 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } if (seg.size.isYRegister()) { - ArgVal reg = ArgXRegister(Live.get()); + auto reg = + seg.size.as<ArgYRegister>().copy<ArgXRegister>(Live.get()); + ASSERT(reg.typeIndex() == seg.size.as<ArgYRegister>().typeIndex()); mov_arg(reg, seg.size); Live = Live + 1; @@ -1694,16 +2445,64 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.effectiveSize < 0 && seg.type != am_append && seg.type != am_private_append) { sizeReg = FCALLS; + need_error_handler = true; } segments.insert(segments.end(), seg); } - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); + /* + * Test whether a heap binary of fixed size will result from the + * construction. If so, allocate and construct the binary now + * before entering the runtime mode. + */ + if (!sizeReg.isValid() && num_bits % 8 == 0 && + num_bits / 8 <= ERL_ONHEAP_BIN_LIMIT && segments[0].type != am_append && + segments[0].type != am_private_append) { + const int x_reg_offset = + offsetof(ErtsSchedulerRegisters, x_reg_array.d); + const int cur_bin_base = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_current_bin_); + const int cur_bin_offset = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_bin_offset_); + x86::Mem mem_bin_base = + x86::qword_ptr(registers, cur_bin_base - x_reg_offset); + x86::Mem mem_bin_offset = + x86::qword_ptr(registers, cur_bin_offset - x_reg_offset); + Uint num_bytes = num_bits / 8; + + comment("allocate heap binary"); + allocated_size = (num_bytes + 7) & (-8); + + /* Ensure that there is enough room on the heap. */ + Uint need = heap_bin_size(num_bytes) + Alloc.get(); + emit_gc_test(ArgWord(0), ArgWord(need), Live); + + /* Create the heap binary. */ + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + a.mov(TMP_MEM1q, RET); + a.mov(x86::qword_ptr(HTOP, 0), imm(header_heap_bin(num_bytes))); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), imm(num_bytes)); + + /* Initialize the erl_bin_state struct. */ + a.add(HTOP, imm(sizeof(Eterm[2]))); + a.mov(mem_bin_base, HTOP); + a.mov(mem_bin_offset, imm(0)); + + /* Update HTOP. 
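
The inline allocation above lays out a heap binary by hand. A sketch of the shape it builds, assuming the conventional two-word header on a 64-bit system (struct is illustrative, mirroring ErlHeapBin):

    #include <cstdint>

    // Illustrative mirror of the words written above: header, byte size,
    // then the data area rounded up to whole 8-byte words (allocated_size),
    // after which HTOP is advanced past the object.
    struct HeapBinSketch {
        uint64_t thing_word;   /* header_heap_bin(num_bytes) */
        uint64_t size;         /* number of data bytes */
        unsigned char data[8]; /* really (num_bytes + 7) & -8 bytes */
    };
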
*/ + a.add(HTOP, imm(allocated_size)); + } + + if (!need_error_handler) { + comment("(cannot fail)"); + } else { + Label past_error = a.newLabel(); + + runtime_entered = bs_maybe_enter_runtime(false); + a.short_().jmp(past_error); - a.short_().jmp(past_error); - a.bind(error); - { /* * ARG1 = optional bad size value; valid if BSC_VALUE_ARG1 is set in * ARG4 @@ -1713,17 +2512,18 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, * * ARG4 = packed error information */ + error = a.newLabel(); + a.bind(error); + bs_maybe_leave_runtime(runtime_entered); comment("handle error"); - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap>(); if (Fail.get() != 0) { a.jmp(resolve_beam_label(Fail)); } else { safe_fragment_call(ga->get_bs_create_bin_error_shared()); } - } - a.bind(past_error); + a.bind(past_error); + } /* We count the total number of bits in an unsigned integer. To * avoid having to check for overflow when adding to the counter, @@ -1748,12 +2548,50 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && seg.type == am_binary) { comment("size of an entire binary"); + runtime_entered = bs_maybe_enter_runtime(runtime_entered); mov_arg(ARG1, seg.src); - runtime_call<1>(beam_jit_bs_bit_size); - if (exact_type(seg.src, BEAM_TYPE_BITSTRING)) { - comment("skipped check for success since the source " - "is always a bit string"); + + if (exact_type<BeamTypeId::Bitstring>(seg.src)) { + auto unit = getSizeUnit(seg.src); + bool is_bitstring = unit == 0 || std::gcd(unit, 8) != 8; + x86::Gp boxed_ptr = emit_ptr_val(ARG1, ARG1); + + if (is_bitstring) { + comment("inlined size code because the value is always " + "a bitstring"); + } else { + comment("inlined size code because the value is always " + "a binary"); + } + + a.mov(ARG2, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + + if (is_bitstring) { + a.mov(RETd, emit_boxed_val(boxed_ptr, 0, sizeof(Uint32))); + } + + a.lea(sizeReg, x86::Mem(sizeReg, ARG2, 3, 0, 1)); + + if (is_bitstring) { + Label not_sub_bin = a.newLabel(); + const auto diff_mask = + _TAG_HEADER_SUB_BIN - _TAG_HEADER_REFC_BIN; + ERTS_CT_ASSERT((_TAG_HEADER_SUB_BIN & diff_mask) != 0 && + (_TAG_HEADER_REFC_BIN & diff_mask) == 0 && + (_TAG_HEADER_HEAP_BIN & diff_mask) == 0); + a.test(RETb, imm(diff_mask)); + a.short_().jz(not_sub_bin); + + a.movzx(RETd, + emit_boxed_val(boxed_ptr, + offsetof(ErlSubBin, bitsize), + 1)); + a.add(sizeReg, RET); + + a.bind(not_sub_bin); + } } else { + runtime_call<1>(beam_jit_bs_bit_size); if (Fail.get() == 0) { mov_arg(ARG1, seg.src); mov_imm(ARG4, @@ -1764,17 +2602,15 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } a.test(RET, RET); a.js(error); + a.add(sizeReg, RET); } - a.add(sizeReg, RET); } else if (seg.unit != 0) { bool can_fail = true; comment("size binary/integer/float/string"); - if (always_small(seg.size)) { - auto min = std::get<0>(getIntRange(seg.size)); - if (min >= 0) { - can_fail = false; - } + if (std::get<0>(getClampedRange(seg.size)) >= 0) { + /* Can't fail if size is always positive. 
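
The inlined size code above rests on a simple identity: a bitstring's size in bits is eight times its byte size plus, for a sub binary only, the trailing bitsize field. In portable form (a sketch; field meanings as used via offsetof above):

    #include <cstdint>

    // Illustrative: bit size of a bitstring given its byte size and, for
    // sub binaries, the extra 0..7 trailing bits (zero for refc/heap bins).
    uint64_t bitstring_bit_size(uint64_t byte_size, unsigned trailing_bits) {
        return byte_size * 8 + trailing_bits;
    }
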
*/ + can_fail = false; } if (can_fail && Fail.get() == 0) { @@ -1789,10 +2625,9 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (always_small(seg.size)) { comment("skipped test for small size since it is always small"); - } else if (always_one_of(seg.size, - BEAM_TYPE_FLOAT | BEAM_TYPE_INTEGER)) { + } else if (always_one_of<BeamTypeId::Number>(seg.size)) { comment("simplified test for small size since it is a number"); - a.test(ARG1d, imm(TAG_PRIMARY_LIST)); + a.test(ARG1.r8(), imm(TAG_PRIMARY_LIST)); a.je(error); } else { a.mov(RETd, ARG1d); @@ -1827,23 +2662,59 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, comment("size utf8"); mov_arg(ARG1, seg.src); + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG1)); + } + + if (always_small(seg.src)) { + comment("skipped test for small value since it is always " + "small"); + } else if (always_one_of<BeamTypeId::Integer, + BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified test for small operand since other " + "types are boxed"); + emit_is_not_boxed(error, ARG1); + } else { + a.mov(RETd, ARG1d); + a.and_(RETb, imm(_TAG_IMMED1_MASK)); + a.cmp(RETb, imm(_TAG_IMMED1_SMALL)); + a.jne(error); + } + mov_imm(RET, 0); - a.mov(RETb, imm(1 * 8)); + a.mov(RETb, imm(1)); a.cmp(ARG1, imm(make_small(0x80UL))); - a.short_().jl(next); + a.short_().jb(next); - a.mov(RETb, imm(2 * 8)); + a.mov(RETb, imm(2)); a.cmp(ARG1, imm(make_small(0x800UL))); - a.short_().jl(next); + a.short_().jb(next); - a.mov(RETb, imm(3 * 8)); - a.cmp(ARG1, imm(make_small(0x10000UL))); - a.short_().jl(next); + /* Ensure that the value is not in the invalid range + * 0xD800 through 0xDFFF. */ + a.mov(ARG2, ARG1); + a.sar(ARG2, imm(11 + _TAG_IMMED1_SIZE)); + a.cmp(ARG2, imm(0x1b)); + a.je(error); - a.mov(RETb, imm(4 * 8)); + a.cmp(ARG1, imm(make_small(0x10000UL))); + a.setae(RETb); + a.add(RETb, imm(3)); + + auto [min, max] = getClampedRange(seg.src); + if (0 <= min && max < 0x110000) { + comment("skipped range check for unicode code point"); + } else { + a.cmp(ARG1, imm(make_small(0x110000))); + a.jae(error); + } a.bind(next); - a.add(sizeReg, RET); + a.lea(sizeReg, x86::Mem(sizeReg, RET, 3, 0, 1)); break; } case am_utf16: { @@ -1891,9 +2762,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } } + segments = bs_combine_segments(segments); + /* Allocate the binary. */ if (segments[0].type == am_append) { BscSegment seg = segments[0]; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("append to binary"); mov_arg(ARG3, Live); if (sizeReg.isValid()) { @@ -1907,18 +2781,28 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(ARG1, c_p); load_x_reg_array(ARG2); runtime_call<6>(erts_bs_append_checked); - if (Fail.get() == 0) { - mov_arg(ARG1, ArgXRegister(Live.get())); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_FVALUE, - BSC_VALUE_ARG1)); + + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) { + /* There is no way the call can fail with a system_limit + * exception on a 64-bit architecture. 
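
The utf8 size pass above computes the encoded length straight from the tagged small, rejecting surrogates on the way (cp >> 11 == 0x1B covers exactly 0xD800..0xDFFF). A portable equivalent of the length computation (sketch):

    #include <cstdint>

    // Illustrative: UTF-8 encoded length in bytes of a valid code point;
    // the emitted code additionally rejects surrogates and cp >= 0x110000
    // unless the type range makes that impossible.
    unsigned utf8_encoded_bytes(uint32_t cp) {
        if (cp < 0x80) return 1;
        if (cp < 0x800) return 2;
        return cp < 0x10000 ? 3 : 4;
    }
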
*/ + comment("skipped test for success because units are compatible"); + } else { + if (Fail.get() == 0) { + mov_arg(ARG1, ArgXRegister(Live.get())); + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_FVALUE, + BSC_VALUE_ARG1)); + } + emit_test_the_non_value(RET); + a.je(error); } - emit_test_the_non_value(RET); - a.je(error); + a.mov(TMP_MEM1q, RET); } else if (segments[0].type == am_private_append) { BscSegment seg = segments[0]; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("private append to binary"); ASSERT(Alloc.get() == 0); mov_arg(ARG2, seg.src); @@ -1931,38 +2815,53 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(ARG1, c_p); runtime_call<4>(erts_bs_private_append_checked); /* There is no way the call can fail on a 64-bit architecture. */ + a.mov(TMP_MEM1q, RET); + } else if (allocated_size >= 0) { + /* The binary has already been allocated. */ } else { comment("allocate binary"); + runtime_entered = bs_maybe_enter_runtime(runtime_entered); mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); load_erl_bits_state(ARG3); load_x_reg_array(ARG2); a.mov(ARG1, c_p); if (sizeReg.isValid()) { - comment("(size in bits)"); a.mov(ARG4, sizeReg); runtime_call<6>(beam_jit_bs_init_bits); - } else if (num_bits % 8 == 0) { - comment("(size in bytes)"); - mov_imm(ARG4, num_bits / 8); - runtime_call<6>(beam_jit_bs_init); } else { + allocated_size = (num_bits + 7) / 8; + if (allocated_size <= ERL_ONHEAP_BIN_LIMIT) { + allocated_size = (allocated_size + 7) & (-8); + } mov_imm(ARG4, num_bits); runtime_call<6>(beam_jit_bs_init_bits); } + a.mov(TMP_MEM1q, RET); } - a.mov(TMP_MEM1q, RET); + + /* Keep track of the bit offset from the being of the binary. + * Set to -1 if offset is not known (when a segment of unknown + * size has been seen). */ + Sint bit_offset = 0; + + /* Keep track of whether the current segment is byte-aligned. (A + * segment can be known to be byte-aligned even if the bit offset + * is unknown.) */ + bool is_byte_aligned = true; /* Build each segment of the binary. */ for (auto seg : segments) { switch (seg.type) { case am_append: case am_private_append: + bit_offset = -1; break; case am_binary: { Uint error_info; bool can_fail = true; + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("construct a binary segment"); if (seg.effectiveSize >= 0) { /* The segment has a literal size. */ @@ -1986,8 +2885,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, BSC_REASON_BADARG, BSC_INFO_UNIT, BSC_VALUE_FVALUE); - if (seg.unit == 1) { - comment("skipped test for success because unit =:= 1"); + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) { + comment("skipped test for success because units are " + "compatible"); can_fail = false; } } else { @@ -2021,6 +2922,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, break; } case am_float: + runtime_entered = bs_maybe_enter_runtime(runtime_entered); comment("construct float segment"); if (seg.effectiveSize >= 0) { mov_imm(ARG3, seg.effectiveSize); @@ -2049,42 +2951,292 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.jne(error); break; case am_integer: - comment("construct integer segment"); - if (seg.effectiveSize >= 0) { - mov_imm(ARG3, seg.effectiveSize); - } else { - mov_arg(ARG3, seg.size); - a.sar(ARG3, imm(_TAG_IMMED1_SIZE)); - if (seg.unit != 1) { - mov_imm(RET, seg.unit); - a.mul(ARG3); /* CLOBBERS RDX = ARG3! 
*/ - a.mov(ARG3, RET); + switch (seg.action) { + case BscSegment::action::ACCUMULATE_FIRST: + case BscSegment::action::ACCUMULATE: { + /* Shift an integer of known size (no more than 64 bits) + * into a word-size accumulator. */ + Label accumulate = a.newLabel(); + Label value_is_small = a.newLabel(); + x86::Gp tmp = ARG4; + x86::Gp bin_data = ARG5; + + comment("accumulate value for integer segment"); + if (seg.action == BscSegment::action::ACCUMULATE_FIRST) { + mov_imm(bin_data, 0); + } else if (seg.effectiveSize < 64) { + a.shl(bin_data, imm(seg.effectiveSize)); + } + mov_arg(ARG1, seg.src); + + if (!always_small(seg.src)) { + if (always_one_of<BeamTypeId::Integer, + BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified small test since all other types " + "are boxed"); + emit_is_boxed(value_is_small, seg.src, ARG1); + } else { + a.mov(ARG2d, ARG1d); + a.and_(ARG2d, imm(_TAG_IMMED1_MASK)); + a.cmp(ARG2d, imm(_TAG_IMMED1_SMALL)); + a.short_().je(value_is_small); + } + + /* The value is boxed. If it is a bignum, extract the + * least significant 64 bits. */ + safe_fragment_call(ga->get_get_sint64_shared()); + if (exact_type<BeamTypeId::Integer>(seg.src)) { + a.short_().jmp(accumulate); + } else { + a.short_().jne(accumulate); + + /* Not a bignum. Signal error. */ + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info( + seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG1)); + } + a.jmp(error); + } } + + a.bind(value_is_small); + a.sar(ARG1, imm(_TAG_IMMED1_SIZE)); + + /* Mask (if needed) and accumulate. */ + a.bind(accumulate); + if (seg.effectiveSize == 64) { + a.mov(bin_data, ARG1); + } else if (!need_mask(seg.src, seg.effectiveSize)) { + comment("skipped masking because the value always fits"); + a.or_(bin_data, ARG1); + } else if (seg.effectiveSize == 32) { + a.mov(ARG1d, ARG1d); + a.or_(bin_data, ARG1); + } else if (seg.effectiveSize < 32) { + a.and_(ARG1, (1ULL << seg.effectiveSize) - 1); + a.or_(bin_data, ARG1); + } else { + mov_imm(tmp, (1ULL << seg.effectiveSize) - 1); + a.and_(ARG1, tmp); + a.or_(bin_data, ARG1); + } + break; } - mov_arg(ARG2, seg.src); - mov_imm(ARG4, seg.flags); - load_erl_bits_state(ARG1); - runtime_call<4>(erts_new_bs_put_integer); - if (exact_type(seg.src, BEAM_TYPE_INTEGER)) { - comment("skipped test for success because construction can't " - "fail"); - } else { - if (Fail.get() == 0) { - mov_arg(ARG1, seg.src); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_TYPE, - BSC_VALUE_ARG1)); + case BscSegment::action::STORE: { + /* The accumulator is now full or the next segment is + * not possible to accumulate, so it's time to store + * the accumulator to the current position in the + * binary. */ + Label store = a.newLabel(); + Label done = a.newLabel(); + x86::Gp bin_ptr = ARG1; + x86::Gp bin_offset = ARG3; + x86::Gp tmp = ARG4; + x86::Gp bin_data = ARG5; + + comment("construct integer segment from accumulator"); + + /* First we'll need to ensure that the value in the + * accumulator is in little endian format. 
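+ * For example, a 16-bit big-endian segment holding 0x1234
+ * is byte-swapped to 0x3412 (bswap followed by a right
+ * shift in the byte-sized case below), so the two-byte
+ * store writes 0x12 followed by 0x34 in memory order.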
*/ + if (seg.effectiveSize % 8 != 0) { + Uint complete_bytes = 8 * (seg.effectiveSize / 8); + Uint num_partial = seg.effectiveSize % 8; + if ((seg.flags & BSF_LITTLE) == 0) { + a.shl(bin_data, imm(64 - seg.effectiveSize)); + a.bswap(bin_data); + } else { + Sint mask = (1ll << complete_bytes) - 1; + a.mov(RET, bin_data); + a.shr(RET, imm(complete_bytes)); + a.and_(RETd, imm((1ull << num_partial) - 1)); + a.shl(RET, imm(complete_bytes + 8 - num_partial)); + if (Support::isInt32(mask)) { + a.and_(bin_data, imm(mask)); + } else { + mov_imm(tmp, mask); + a.and_(bin_data, tmp); + } + a.or_(bin_data, RET); + } + } else if ((seg.flags & BSF_LITTLE) == 0) { + switch (seg.effectiveSize) { + case 8: + break; + case 32: + a.bswap(bin_data.r32()); + break; + case 64: + a.bswap(bin_data); + break; + default: + a.bswap(bin_data); + a.shr(bin_data, imm(64 - seg.effectiveSize)); + break; + } } - a.test(RETd, RETd); - a.je(error); + + update_bin_state(bin_offset, + bin_ptr, + bit_offset, + seg.effectiveSize, + x86::Gp()); + + if (!is_byte_aligned) { + if (bit_offset < 0) { + /* Bit offset is unknown. Must test alignment. */ + a.and_(bin_offset, imm(7)); + a.short_().je(store); + } else if (bit_offset >= 0) { + /* Alignment is known to be unaligned. */ + mov_imm(bin_offset, bit_offset & 7); + } + + /* Bit offset is tested or known to be unaligned. */ + mov_imm(ARG4, seg.effectiveSize); + safe_fragment_call(ga->get_store_unaligned()); + + if (bit_offset < 0) { + /* The bit offset is unknown, which implies + * that there exists store code that we will + * need to branch past. */ + a.short_().jmp(done); + } + } + + a.bind(store); + + if (bit_offset <= 0 || is_byte_aligned) { + /* Bit offset is tested or known to be + * byte-aligned. Emit inline code to store the + * value of the accumulator into the binary. */ + int num_bytes = (seg.effectiveSize + 7) / 8; + + /* If more than one instruction is required for + * doing the store, test whether it would be safe + * to do a single 32 or 64 bit store. */ + switch (num_bytes) { + case 3: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 32) { + comment("simplified complicated store"); + num_bytes = 4; + } + break; + case 5: + case 6: + case 7: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 64) { + comment("simplified complicated store"); + num_bytes = 8; + } + break; + } + + do { + switch (num_bytes) { + case 1: + a.mov(x86::Mem(bin_ptr, 0, 1), bin_data.r8()); + break; + case 2: + a.mov(x86::Mem(bin_ptr, 0, 2), bin_data.r16()); + break; + case 3: + a.mov(x86::Mem(bin_ptr, 0, 2), bin_data.r16()); + a.shr(bin_data, imm(16)); + a.mov(x86::Mem(bin_ptr, 2, 1), bin_data.r8()); + break; + case 4: + a.mov(x86::Mem(bin_ptr, 0, 4), bin_data.r32()); + break; + case 5: + case 6: + case 7: + a.mov(x86::Mem(bin_ptr, 0, 4), bin_data.r32()); + a.add(bin_ptr, imm(4)); + a.shr(bin_data, imm(32)); + break; + case 8: + a.mov(x86::Mem(bin_ptr, 0, 8), bin_data); + num_bytes = 0; + break; + default: + ASSERT(0); + } + num_bytes -= 4; + } while (num_bytes > 0); + } + + a.bind(done); + break; + } + case BscSegment::action::DIRECT: + /* This segment either has a size exceeding the maximum + * accumulator size of 64 bits or has a variable size. + * + * First load the effective size (size * unit) into ARG3. 
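+ * For example, a variable size expressed in units of 16
+ * bits is strength-reduced below to a left shift by 4
+ * (the number of trailing zero bits in 16), while a
+ * non-power-of-two unit such as 24 falls back to a mul.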
+ */
+ comment("construct integer segment");
+ if (seg.effectiveSize >= 0) {
+ mov_imm(ARG3, seg.effectiveSize);
+ } else {
+ mov_arg(ARG3, seg.size);
+ a.sar(ARG3, imm(_TAG_IMMED1_SIZE));
+ if (Support::isPowerOf2(seg.unit)) {
+ Uint trailing_bits = Support::ctz<Eterm>(seg.unit);
+ if (trailing_bits) {
+ a.shl(ARG3, imm(trailing_bits));
+ }
+ } else {
+ mov_imm(RET, seg.unit);
+ a.mul(ARG3); /* CLOBBERS RDX = ARG3! */
+ a.mov(ARG3, RET);
+ }
+ }
+
+ if (is_byte_aligned && seg.src.isSmall() &&
+ seg.src.as<ArgSmall>().getSigned() == 0) {
+ /* Optimize the special case of setting a known
+ * byte-aligned segment to zero. */
+ comment("optimized setting segment to 0");
+ set_zero(seg.effectiveSize);
+ } else {
+ /* Call the helper function to fetch and store the
+ * integer into the binary. */
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
+ mov_arg(ARG2, seg.src);
+ mov_imm(ARG4, seg.flags);
+ load_erl_bits_state(ARG1);
+ runtime_call<4>(erts_new_bs_put_integer);
+ if (exact_type<BeamTypeId::Integer>(seg.src)) {
+ comment("skipped test for success because construction "
+ "can't fail");
+ } else {
+ if (Fail.get() == 0) {
+ mov_arg(ARG1, seg.src);
+ mov_imm(ARG4,
+ beam_jit_update_bsc_reason_info(
+ seg.error_info,
+ BSC_REASON_BADARG,
+ BSC_INFO_TYPE,
+ BSC_VALUE_ARG1));
+ }
+ a.test(RETd, RETd);
+ a.je(error);
+ }
+ }
+ break;
}
break;
case am_string: {
ArgBytePtr string_ptr(
ArgVal(ArgVal::BytePtr, seg.src.as<ArgWord>().get()));
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
comment("insert string");
ASSERT(seg.effectiveSize >= 0);
mov_imm(ARG3, seg.effectiveSize / 8);
@@ -2092,22 +3244,13 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
load_erl_bits_state(ARG1);
runtime_call<3>(erts_new_bs_put_string);
} break;
- case am_utf8:
- mov_arg(ARG2, seg.src);
- load_erl_bits_state(ARG1);
- runtime_call<2>(erts_bs_put_utf8);
- if (Fail.get() == 0) {
- mov_arg(ARG1, seg.src);
- mov_imm(ARG4,
- beam_jit_update_bsc_reason_info(seg.error_info,
- BSC_REASON_BADARG,
- BSC_INFO_TYPE,
- BSC_VALUE_ARG1));
- }
- a.test(RETd, RETd);
- a.je(error);
+ case am_utf8: {
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
+ emit_construct_utf8(seg.src, bit_offset, is_byte_aligned);
break;
+ }
case am_utf16:
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
mov_arg(ARG2, seg.src);
a.mov(ARG3, seg.flags);
load_erl_bits_state(ARG1);
@@ -2124,6 +3267,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
a.je(error);
break;
case am_utf32:
+ runtime_entered = bs_maybe_enter_runtime(runtime_entered);
mov_arg(ARG2, seg.src);
mov_imm(ARG3, 4 * 8);
a.mov(ARG4, seg.flags);
@@ -2144,10 +3288,1098 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail,
ASSERT(0);
break;
}
+
+ /* Try to keep track of the bit offset. */
+ if (bit_offset >= 0 && (seg.action == BscSegment::action::DIRECT ||
+ seg.action == BscSegment::action::STORE)) {
+ if (seg.effectiveSize >= 0) {
+ bit_offset += seg.effectiveSize;
+ } else {
+ bit_offset = -1;
+ }
+ }
+
+ /* Try to keep track of whether the next segment is
+ * byte-aligned. 
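+ * For example, a single literal 4-bit segment leaves the
+ * offset in mid-byte, so the next segment is unaligned,
+ * while two adjacent 4-bit segments add up to a whole
+ * byte and preserve byte alignment.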
*/ + if (seg.type == am_append || seg.type == am_private_append) { + if (!exact_type<BeamTypeId::Bitstring>(seg.src) || + std::gcd(getSizeUnit(seg.src), 8) != 8) { + is_byte_aligned = false; + } + } else if (bit_offset % 8 == 0) { + is_byte_aligned = true; + } else if (seg.effectiveSize >= 0) { + if (seg.effectiveSize % 8 != 0) { + is_byte_aligned = false; + } + } else if (std::gcd(seg.unit, 8) != 8) { + is_byte_aligned = false; + } } + bs_maybe_leave_runtime(runtime_entered); comment("done"); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); a.mov(RET, TMP_MEM1q); mov_arg(Dst, RET); } + +/* + * Here follows the bs_match instruction and friends. + */ + +struct BsmSegment { + BsmSegment() + : action(action::TEST_HEAP), live(ArgNil()), size(0), unit(1), + flags(0), dst(ArgXRegister(0)){}; + + enum class action { + TEST_HEAP, + ENSURE_AT_LEAST, + ENSURE_EXACTLY, + READ, + EXTRACT_BINARY, + EXTRACT_INTEGER, + READ_INTEGER, + GET_INTEGER, + GET_BINARY, + SKIP, + DROP, + GET_TAIL, + EQ + } action; + ArgVal live; + Uint size; + Uint unit; + Uint flags; + ArgRegister dst; +}; + +void BeamModuleAssembler::emit_read_bits(Uint bits, + const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp bitdata) { + Label read_done = a.newLabel(); + auto num_partial = bits % 8; + auto num_bytes_to_read = (bits + 7) / 8; + + ASSERT(bin_offset == x86::rcx); + + a.mov(RET, bin_offset); + a.shr(RET, imm(3)); + if (num_bytes_to_read != 1) { + a.add(bin_base, RET); + } + a.and_(bin_offset.r32(), imm(7)); + + /* + * Special-case handling of reading 8 or 9 bytes. + */ + if (num_bytes_to_read == 8) { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + a.bswap(bitdata); + } + + a.shl(bitdata, bin_offset.r8()); + + a.test(x86::cl, imm(7)); + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, this + * segment always needs an additional byte. */ + a.jz(read_done); + } else if (num_partial > 1) { + /* Non-byte-sized segment. Test whether we will need an + * additional byte. */ + a.cmp(bin_offset.r32(), imm(8 - num_partial)); + a.jle(read_done); + } + + if (num_partial != 1) { + /* Read an extra byte. */ + a.movzx(RETd, x86::byte_ptr(bin_base, 8)); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + a.or_(bitdata, RET); + } + + a.bind(read_done); + + return; + } + + /* + * Handle reading of up to 7 bytes. + */ + Label handle_partial = a.newLabel(); + Label swap = a.newLabel(); + Label shift = a.newLabel(); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, this + * segment always needs an additional byte. */ + a.jnz(handle_partial); + } else if (num_partial > 1) { + /* Non-byte-sized segment. Test whether we will need an + * additional byte. */ + a.cmp(bin_offset.r32(), imm(8 - num_partial)); + a.jg(handle_partial); + } + + /* We don't need an extra byte. 
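+ * For example, a 32-bit read at a byte-aligned offset
+ * loads exactly four bytes, while the same read at bit
+ * offset 3 straddles five bytes and takes the
+ * handle_partial path instead.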
*/ + if (num_bytes_to_read == 1) { + a.movzx(bitdata.r32(), x86::byte_ptr(bin_base, RET)); + if (num_partial == 0) { + a.bswap(bitdata); + a.short_().jmp(read_done); + } else if (num_partial > 1) { + a.short_().jmp(swap); + } + } else if (num_bytes_to_read <= 4) { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + } else { + a.mov(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + a.bswap(bitdata.r32()); + } + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + a.short_().jmp(shift); + } else { + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + a.bswap(bitdata); + } + ASSERT(num_bytes_to_read < 8); + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + a.short_().jmp(shift); + } + + /* We'll need an extra byte and we will need to shift. */ + a.bind(handle_partial); + if (num_partial != 1) { + if (num_bytes_to_read == 1) { + a.mov(bitdata.r16(), x86::word_ptr(bin_base, RET)); + } else { + ASSERT(num_bytes_to_read < 8); + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 7)); + a.shr(bitdata, imm(64 - 8 * (num_bytes_to_read + 1))); + } + } + + a.bind(swap); + a.bswap(bitdata); + + /* Shift the read data into the most significant bits of the + * word. */ + a.bind(shift); + a.shl(bitdata, bin_offset.r8()); + + a.bind(read_done); +} + +/* + * Read an integer and store as a term. This function only handles + * integers of certain common sizes. This is a special optimization + * when only one integer is to be extracted from a binary. + * + * Input: bin_base, bin_offset + * + * Clobbers: bin_base, bin_offset, tmp, RET + */ +void BeamModuleAssembler::emit_read_integer(const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + Label handle_unaligned = a.newLabel(); + Label store = a.newLabel(); + x86::Mem address; + + a.mov(tmp, bin_offset); + a.shr(tmp, imm(3)); + a.and_(bin_offset.r32(), imm(7)); + + switch (bits) { + case 8: + address = x86::Mem(bin_base, tmp, 0, 0, 1); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.mov(x86::al, x86::ah); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RETb); + } else { + a.movsx(RET, RETb); + } + break; + case 16: + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if ((flags & BSF_LITTLE) != 0) { + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + } else { + /* Big-endian segment. 
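+ * The two bytes are loaded with a byte swap (movbe, or
+ * mov followed by exchanging al and ah), so the memory
+ * bytes 0x12,0x34 yield the value 0x1234.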
*/ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsx(RET, RET.r16()); + } else { + a.movzx(RET, RET.r16()); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -1, 4); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RET.r16()); + } else { + a.movsx(RET, RET.r16()); + } + break; + case 32: + address = x86::Mem(bin_base, tmp, 0, 0, 4); + if ((flags & BSF_LITTLE) != 0) { + /* Little-endian segment. */ + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, address); + } else { + a.movsxd(RET, address); + } + } else { + /* Big-endian segment. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsxd(RET, RETd); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -3, 8); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET, address); + } else { + a.mov(RET, address); + a.bswap(RET); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RET, bin_offset.r8()); + a.shr(RET, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, RETd); + } else { + a.movsxd(RET, RETd); + } + break; + default: + ASSERT(0); + break; + } + + a.bind(store); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, RET); +} + +void BeamModuleAssembler::emit_extract_integer(const x86::Gp bitdata, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + if (bits == 0) { + /* Necessary for correctness when matching a zero-size + * signed segment. + */ + mov_arg(Dst, make_small(0)); + return; + } + + Label big = a.newLabel(); + Label done = a.newLabel(); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + if (bits <= 8) { + /* Endian does not matter for values that fit in a byte. */ + flags &= ~BSF_LITTLE; + } + + if ((flags & BSF_LITTLE) == 0) { + /* Big-endian segment. */ + a.mov(RET, bitdata); + } else if ((flags & BSF_LITTLE) != 0) { + /* Reverse endianness for this little-endian segment. */ + if (num_partial == 0) { + a.mov(RET, bitdata); + a.bswap(RET); + if (bits < 64) { + a.shl(RET, imm(64 - num_complete)); + } + } else { + Uint shifted_mask = ((1 << num_partial) - 1) << (8 - num_partial); + a.mov(tmp, bitdata); + a.shr(tmp, imm(64 - num_complete)); + a.bswap(tmp); + a.shr(tmp, imm(num_partial)); + + a.mov(RET, bitdata); + a.rol(RET, imm(num_complete + 8)); + a.and_(RETd, imm(shifted_mask)); + a.ror(RET, imm(8)); + a.or_(RET, tmp); + } + } + + /* Now the extracted data is in RET. */ + if (bits >= SMALL_BITS) { + /* Handle segments whose values might not fit in a small + * integer. */ + Label small = a.newLabel(); + comment("test whether this integer is a small"); + if (bits < 64) { + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits)); + } else { + /* Signed segment. 
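+ * The arithmetic shift replicates the sign bit, so, for
+ * example, a 63-bit segment of all one-bits comes out as
+ * -1, which the small test below keeps as an immediate
+ * small instead of building a bignum.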
*/ + a.sar(RET, imm(64 - bits)); + } + } + a.mov(tmp, RET); + a.shr(tmp, imm(SMALL_BITS - 1)); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.jnz(big); + } else { + /* Signed segment. */ + a.jz(small); + a.cmp(tmp.r32(), imm(_TAG_IMMED1_MASK << 1 | 1)); + a.jnz(big); + } + + comment("store extracted integer as a small"); + a.bind(small); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(done); + } else { + /* This segment always fits in a small. */ + comment("store extracted integer as a small"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } else { + /* Signed segment. */ + a.sar(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == (1 << _TAG_IMMED1_SIZE) - 1); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } + + a.bind(big); + if (bits >= SMALL_BITS) { + comment("store extracted integer as a bignum"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } else { + Label negative = a.newLabel(); + Label sign_done = a.newLabel(); + + /* Signed segment. */ + a.test(RET, RET); + a.short_().jl(negative); + + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.short_().jmp(sign_done); + + a.bind(negative); + a.mov(x86::qword_ptr(HTOP), make_neg_bignum_header(1)); + a.neg(RET); + + a.bind(sign_done); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + a.add(HTOP, imm(sizeof(Eterm[2]))); + } + + a.bind(done); + mov_arg(Dst, RET); +} + +/* + * Clobbers: RET + */ +void BeamModuleAssembler::emit_extract_binary(const x86::Gp bitdata, + Uint bits, + const ArgRegister &Dst) { + Uint num_bytes = bits / 8; + + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + mov_arg(Dst, RET); + a.mov(x86::qword_ptr(HTOP), header_heap_bin(num_bytes)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), imm(num_bytes)); + a.mov(RET, bitdata); + a.bswap(RET); + if (num_bytes == 0) { + a.add(HTOP, imm(sizeof(Eterm[2]))); + } else { + a.mov(x86::qword_ptr(HTOP, 2 * sizeof(Eterm)), RET); + a.add(HTOP, imm(sizeof(Eterm[3]))); + } +} + +static std::vector<BsmSegment> opt_bsm_segments( + const std::vector<BsmSegment> segments, + const ArgWord &Need, + const ArgWord &Live) { + std::vector<BsmSegment> segs; + + Uint heap_need = Need.get(); + + /* + * First calculate the total number of heap words needed for + * bignums and binaries. 
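+ * For example, a 64-bit integer segment may need to be
+ * boxed as a bignum (BIG_NEED_FOR_BITS(64) words: a
+ * header plus one digit), and a 128-bit binary segment
+ * needs heap_bin_size(16) words for its heap binary.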
+ */ + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + if (seg.size >= SMALL_BITS) { + heap_need += BIG_NEED_FOR_BITS(seg.size); + } + break; + case BsmSegment::action::GET_BINARY: + heap_need += heap_bin_size((seg.size + 7) / 8); + break; + case BsmSegment::action::GET_TAIL: + heap_need += EXTRACT_SUB_BIN_HEAP_NEED; + break; + default: + break; + } + } + + int read_action_pos = -1; + int seg_index = 0; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + if (heap_need != 0 && seg.live.isWord()) { + BsmSegment s = seg; + + read_action_pos = -1; + s.action = BsmSegment::action::TEST_HEAP; + s.size = heap_need; + segs.push_back(s); + heap_need = 0; + seg_index++; + } + + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + case BsmSegment::action::GET_BINARY: { + bool is_common_size; + switch (seg.size) { + case 8: + case 16: + case 32: + is_common_size = true; + break; + default: + is_common_size = false; + break; + } + + if (seg.size > 64) { + read_action_pos = -1; + } else if (seg.action == BsmSegment::action::GET_BINARY && + seg.size % 8 != 0) { + read_action_pos = -1; + } else if ((seg.flags & BSF_LITTLE) != 0 && is_common_size) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else if (read_action_pos < 0 && + seg.action == BsmSegment::action::GET_INTEGER && + is_common_size && i + 1 == count) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else { + if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = seg_index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + seg_index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + seg.action = BsmSegment::action::EXTRACT_INTEGER; + break; + case BsmSegment::action::GET_BINARY: + seg.action = BsmSegment::action::EXTRACT_BINARY; + break; + default: + break; + } + } + segs.push_back(seg); + break; + } + case BsmSegment::action::EQ: { + if (read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = seg_index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + seg_index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + auto &prev = segs.back(); + if (prev.action == BsmSegment::action::EQ && + prev.size + seg.size <= 64) { + /* Coalesce with the previous EQ instruction. */ + prev.size += seg.size; + prev.unit = prev.unit << seg.size | seg.unit; + seg_index--; + } else { + segs.push_back(seg); + } + break; + } + case BsmSegment::action::SKIP: + if (read_action_pos >= 0 && + seg.size + segs.at(read_action_pos).size <= 64) { + segs.at(read_action_pos).size += seg.size; + seg.action = BsmSegment::action::DROP; + } else { + read_action_pos = -1; + } + segs.push_back(seg); + break; + default: + read_action_pos = -1; + segs.push_back(seg); + break; + } + seg_index++; + } + + /* Handle a trailing test_heap instruction (for the + * i_bs_match_test_heap instruction). 
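+ * This happens when the heap words requested through Need
+ * are not covered by any segment carrying live register
+ * information, for example a match that only skips bits.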
*/ + if (heap_need) { + BsmSegment seg; + + seg.action = BsmSegment::action::TEST_HEAP; + seg.size = heap_need; + seg.live = Live; + segs.push_back(seg); + } + return segs; +} + +UWord BeamModuleAssembler::bs_get_flags(const ArgVal &val) { + if (val.isNil()) { + return 0; + } else if (val.isLiteral()) { + Eterm term = beamfile_get_literal(beam, val.as<ArgLiteral>().get()); + UWord flags = 0; + + while (is_list(term)) { + Eterm *consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + case am_native: + flags |= BSF_LITTLE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + return flags; + } else if (val.isWord()) { + /* Originates from bs_get_integer2 instruction. */ + return val.as<ArgWord>().get(); + } else { + ASSERT(0); /* Should not happen. */ + return 0; + } +} + +void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail, + ArgRegister const &Ctx, + Span<ArgVal> const &List) { + emit_i_bs_match_test_heap(Fail, Ctx, ArgWord(0), ArgWord(0), List); +} + +void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, + ArgRegister const &Ctx, + ArgWord const &Need, + ArgWord const &Live, + Span<ArgVal> const &List) { + const int orig_offset = offsetof(ErlBinMatchState, mb.orig); + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + + std::vector<BsmSegment> segments; + + auto current = List.begin(); + auto end = List.begin() + List.size(); + + while (current < end) { + auto cmd = current++->as<ArgImmed>().get(); + BsmSegment seg; + + switch (cmd) { + case am_ensure_at_least: { + seg.action = BsmSegment::action::ENSURE_AT_LEAST; + seg.size = current[0].as<ArgWord>().get(); + seg.unit = current[1].as<ArgWord>().get(); + current += 2; + break; + } + case am_ensure_exactly: { + seg.action = BsmSegment::action::ENSURE_EXACTLY; + seg.size = current[0].as<ArgWord>().get(); + current += 1; + break; + } + case am_binary: + case am_integer: { + auto size = current[2].as<ArgWord>().get(); + auto unit = current[3].as<ArgWord>().get(); + + switch (cmd) { + case am_integer: + seg.action = BsmSegment::action::GET_INTEGER; + break; + case am_binary: + seg.action = BsmSegment::action::GET_BINARY; + break; + } + + seg.live = current[0]; + seg.size = size * unit; + seg.unit = unit; + seg.flags = bs_get_flags(current[1]); + seg.dst = current[4].as<ArgRegister>(); + current += 5; + break; + } + case am_get_tail: { + seg.action = BsmSegment::action::GET_TAIL; + seg.live = current[0].as<ArgWord>(); + seg.dst = current[2].as<ArgRegister>(); + current += 3; + break; + } + case am_skip: { + seg.action = BsmSegment::action::SKIP; + seg.size = current[0].as<ArgWord>().get(); + seg.flags = 0; + current += 1; + break; + } + case am_Eq: { + seg.action = BsmSegment::action::EQ; + seg.live = current[0]; + seg.size = current[1].as<ArgWord>().get(); + seg.unit = current[2].as<ArgWord>().get(); + current += 3; + break; + } + default: + abort(); + break; + } + segments.push_back(seg); + } + + segments = opt_bsm_segments(segments, Need, Live); + + /* Constraints: + * + * bin_position must be RCX because only CL can be used for + * a variable shift without using the SHLX instruction from BMI2. 
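+ * ARG1 is RCX in the Windows calling convention, while
+ * ARG4 is RCX in the System V convention, which is why
+ * the register assignments below differ.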
+ */ +#ifdef WIN32 + const x86::Gp bin_position = ARG1; + const x86::Gp bitdata = ARG2; + const x86::Gp bin_base = ARG3; + const x86::Gp ctx = ARG4; +#else + const x86::Gp bin_position = ARG4; + const x86::Gp bitdata = ARG3; + const x86::Gp bin_base = ARG1; + const x86::Gp ctx = ARG2; +#endif + ASSERT(bin_position == x86::rcx); + const x86::Gp tmp = ARG5; + + bool is_ctx_valid = false; + bool is_position_valid = false; + bool next_instr_clobbers = false; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + + /* Find out whether the next sub instruction clobbers + * registers or is the last. */ + next_instr_clobbers = + i == count - 1 || + (i < count - 1 && + segments[i + 1].action == BsmSegment::action::TEST_HEAP); + + switch (seg.action) { + case BsmSegment::action::ENSURE_AT_LEAST: { + auto size = seg.size; + auto unit = seg.unit; + comment("ensure_at_least %ld %ld", size, seg.unit); + mov_arg(ctx, Ctx); + if (unit == 1) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.lea(RET, qword_ptr(bin_position, size)); + a.cmp(RET, emit_boxed_val(ctx, size_offset)); + a.ja(resolve_beam_label(Fail)); + } else if (size == 0 && next_instr_clobbers) { + a.mov(RET, emit_boxed_val(ctx, size_offset)); + a.sub(RET, emit_boxed_val(ctx, position_offset)); + is_ctx_valid = is_position_valid = false; + } else { + a.mov(RET, emit_boxed_val(ctx, size_offset)); + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + cmp(RET, size, tmp); + a.jl(resolve_beam_label(Fail)); + } + + is_ctx_valid = is_position_valid = true; + + if (unit != 1) { + if (size % unit != 0) { + sub(RET, size, tmp); + } + + if ((unit & (unit - 1))) { + /* Clobbers ARG3 */ + a.cqo(); + mov_imm(tmp, unit); + a.div(tmp); + a.test(x86::rdx, x86::rdx); + is_ctx_valid = is_position_valid = false; + } else { + a.test(RETb, imm(unit - 1)); + } + a.jnz(resolve_beam_label(Fail)); + } + break; + } + case BsmSegment::action::ENSURE_EXACTLY: { + auto size = seg.size; + comment("ensure_exactly %ld", size); + + mov_arg(ctx, Ctx); + a.mov(RET, emit_boxed_val(ctx, size_offset)); + if (next_instr_clobbers) { + a.sub(RET, emit_boxed_val(ctx, position_offset)); + is_ctx_valid = is_position_valid = false; + } else { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + is_ctx_valid = is_position_valid = true; + } + if (size != 0) { + cmp(RET, size, tmp); + } + a.jne(resolve_beam_label(Fail)); + break; + } + case BsmSegment::action::EQ: { + comment("=:= %ld %ld", seg.size, seg.unit); + auto bits = seg.size; + x86::Gp extract_reg; + + if (next_instr_clobbers) { + extract_reg = bitdata; + } else { + extract_reg = RET; + a.mov(extract_reg, bitdata); + } + if (bits != 0 && bits != 64) { + a.shr(extract_reg, imm(64 - bits)); + } + + if (seg.size <= 32) { + cmp(extract_reg.r32(), seg.unit, tmp); + } else { + cmp(extract_reg, seg.unit, tmp); + } + + a.jne(resolve_beam_label(Fail)); + + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + + /* bin_position is clobbered. 
*/ + is_position_valid = false; + break; + } + case BsmSegment::action::TEST_HEAP: { + comment("test_heap %ld", seg.size); + emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::READ: { + comment("read %ld", seg.size); + if (seg.size == 0) { + comment("(nothing to do)"); + } else { + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + + emit_read_bits(seg.size, bin_base, bin_position, bitdata); + } + + is_position_valid = false; + break; + } + case BsmSegment::action::EXTRACT_BINARY: { + auto bits = seg.size; + auto Dst = seg.dst; + + comment("extract binary %ld", bits); + emit_extract_binary(bitdata, bits, Dst); + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + break; + } + case BsmSegment::action::EXTRACT_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("extract integer %ld", bits); + if (next_instr_clobbers && flags == 0 && bits < SMALL_BITS) { + a.shr(bitdata, imm(64 - bits - _TAG_IMMED1_SIZE)); + a.or_(bitdata, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, bitdata); + } else { + emit_extract_integer(bitdata, tmp, flags, bits, Dst); + if (!next_instr_clobbers && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + } + + /* bin_position is clobbered. */ + is_position_valid = false; + break; + } + case BsmSegment::action::READ_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("read integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + emit_read_integer(bin_base, bin_position, tmp, flags, bits, Dst); + + is_position_valid = false; + break; + } + case BsmSegment::action::GET_INTEGER: { + Uint flags = seg.flags; + auto bits = seg.size; + auto Dst = seg.dst; + + comment("get integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + } + + a.lea(ARG4, emit_boxed_val(ctx, offsetof(ErlBinMatchState, mb))); + + if (bits >= SMALL_BITS) { + emit_enter_runtime<Update::eReductions | + Update::eHeapOnlyAlloc>(); + } else { + emit_enter_runtime(); + } + + a.mov(ARG1, c_p); + a.mov(ARG2, bits); + a.mov(ARG3, flags); + /* ARG4 set above */ + runtime_call<4>(erts_bs_get_integer_2); + + if (bits >= SMALL_BITS) { + emit_leave_runtime<Update::eReductions | + Update::eHeapOnlyAlloc>(); + } else { + emit_leave_runtime(); + } + + mov_arg(Dst, RET); + + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::GET_BINARY: { + comment("get binary %ld", seg.size); + if (is_ctx_valid) { + a.mov(RET, ctx); + } else { + mov_arg(RET, Ctx); + } + emit_enter_runtime<Update::eHeapOnlyAlloc>(); + a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); + a.mov(ARG2, emit_boxed_val(RET, orig_offset)); + a.mov(ARG3, emit_boxed_val(RET, base_offset)); + a.mov(ARG4, emit_boxed_val(RET, position_offset)); + mov_imm(ARG5, seg.size); + a.add(emit_boxed_val(RET, position_offset), ARG5); + + runtime_call<5>(erts_extract_sub_binary); + + 
emit_leave_runtime<Update::eHeapOnlyAlloc>();
+ mov_arg(seg.dst, RET);
+
+ is_ctx_valid = is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::GET_TAIL: {
+ comment("get_tail");
+ if (is_ctx_valid) {
+ a.mov(ARG1, ctx);
+ } else {
+ mov_arg(ARG1, Ctx);
+ }
+ safe_fragment_call(ga->get_bs_get_tail_shared());
+ mov_arg(seg.dst, RET);
+ is_ctx_valid = is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::SKIP: {
+ comment("skip %ld", seg.size);
+ if (!is_ctx_valid) {
+ mov_arg(ctx, Ctx);
+ is_ctx_valid = true;
+ }
+ /* The compiler limits the size of any segment in a bs_match
+ * instruction to 24 bits. */
+ ASSERT((seg.size >> 24) == 0);
+ a.add(emit_boxed_val(ctx, position_offset), imm(seg.size));
+ is_position_valid = false;
+ break;
+ }
+ case BsmSegment::action::DROP:
+ auto bits = seg.size;
+ comment("drop %ld", bits);
+ if (bits != 0 && bits != 64) {
+ a.shl(bitdata, imm(bits));
+ }
+ break;
+ }
+ }
+}