diff options
Diffstat (limited to 'erts/emulator/beam/jit/arm/instr_bs.cpp')
-rw-r--r-- | erts/emulator/beam/jit/arm/instr_bs.cpp | 2579 |
1 files changed, 2359 insertions, 220 deletions
diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index 06873cd709..d7e8f70d83 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2020-2022. All Rights Reserved. + * Copyright Ericsson AB 2020-2023. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ */ #include "beam_asm.hpp" +#include <numeric> extern "C" { @@ -29,8 +30,6 @@ extern "C" /* Clobbers TMP1+TMP2 * - * If max_size > 0, we jump to the fail label when Size > max_size - * * Returns -1 when the field check always fails, 1 if it may fail, and 0 if it * never fails. */ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, @@ -55,18 +54,40 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, return -1; } else { auto size_reg = load_source(Size, TMP2); + bool can_fail = true; + + if (always_small(Size)) { + auto [min, max] = getClampedRange(Size); + can_fail = + !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0); + } /* Negating the tag bits lets us guard against non-smalls, negative * numbers, and overflow with a single `tst` instruction. 
*/ ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == _TAG_IMMED1_MASK); ASSERT(unit <= 1024); - a.eor(out, size_reg.reg, imm(_TAG_IMMED1_SMALL)); - a.tst(out, imm(0xFFF0000000000000UL | _TAG_IMMED1_MASK)); + if (!can_fail) { + comment("simplified segment size checks because " + "the types are known"); + } + + if (unit == 1 && !can_fail) { + a.lsr(out, size_reg.reg, imm(_TAG_IMMED1_SIZE)); + } else { + a.eor(out, size_reg.reg, imm(_TAG_IMMED1_SMALL)); + } + + if (can_fail) { + a.tst(out, imm(0xFFF0000000000000UL | _TAG_IMMED1_MASK)); + } if (unit == 0) { /* Silly but legal.*/ mov_imm(out, 0); + } else if (unit == 1 && !can_fail) { + /* The result is already in the out register. */ + ; } else if (Support::isPowerOf2(unit)) { int trailing_bits = Support::ctz<Eterm>(unit); @@ -88,9 +109,11 @@ int BeamModuleAssembler::emit_bs_get_field_size(const ArgSource &Size, a.mul(out, out, TMP1); } - a.b_ne(fail); + if (can_fail) { + a.b_ne(fail); + } - return 1; + return can_fail; } } @@ -102,7 +125,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); a.mov(ARG1, c_p); @@ -110,7 +133,7 @@ void BeamModuleAssembler::emit_i_bs_init_heap(const ArgWord &Size, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); mov_arg(Dst, ARG1); @@ -148,7 +171,7 @@ void BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size, mov_arg(ARG5, Heap); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); a.mov(ARG1, c_p); @@ -156,7 +179,7 @@ void 
BeamModuleAssembler::emit_i_bs_init_fail_heap(const ArgSource &Size, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); mov_arg(Dst, ARG1); @@ -207,7 +230,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); a.mov(ARG1, c_p); @@ -215,7 +238,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_heap(const ArgWord &NumBits, load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); mov_arg(Dst, ARG1); @@ -248,7 +271,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( mov_arg(ARG5, Alloc); mov_arg(ARG6, Live); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); a.mov(ARG1, c_p); @@ -256,7 +279,7 @@ void BeamModuleAssembler::emit_i_bs_init_bits_fail_heap( load_erl_bits_state(ARG3); runtime_call<6>(beam_jit_bs_init_bits); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get()); mov_arg(Dst, ARG1); @@ -541,19 +564,18 @@ void BeamModuleAssembler::emit_i_bs_start_match3(const ArgRegister &Src, a.bind(is_binary); { - /* Src is not guaranteed to be inside the live range, so we need to - * stash it during GC. 
*/ - emit_gc_test_preserve(ArgVal(ArgVal::Word, ERL_BIN_MATCHSTATE_SIZE(0)), + emit_gc_test_preserve(ArgWord(ERL_BIN_MATCHSTATE_SIZE(0)), Live, + Src, ARG2); - emit_enter_runtime<Update::eStack | Update::eHeap>(Live.get()); + emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get()); a.mov(ARG1, c_p); /* ARG2 was set above */ runtime_call<2>(erts_bs_start_match_3); - emit_leave_runtime<Update::eStack | Update::eHeap>(Live.get()); + emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get()); a.add(ARG2, ARG1, imm(TAG_PRIMARY_BOXED)); } @@ -585,9 +607,8 @@ void BeamModuleAssembler::emit_i_bs_match_string(const ArgRegister &Ctx, a.and_(ARG4, TMP2, imm(7)); /* ARG3 = mb->base + (mb->offset >> 3) */ - a.lsr(TMP2, TMP2, imm(3)); a.ldur(TMP1, emit_boxed_val(ctx_reg.reg, base_offset)); - a.add(ARG3, TMP1, TMP2); + a.add(ARG3, TMP1, TMP2, arm::lsr(3)); } emit_enter_runtime(); @@ -624,77 +645,89 @@ void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx, flush_var(dst_reg); } -void BeamModuleAssembler::emit_i_bs_get_fixed_integer(const ArgRegister &Ctx, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgWord &Flags, - const ArgWord &Bits, - const ArgRegister &Dst) { - auto ctx = load_source(Ctx, TMP1); - int flags, bits; - - flags = Flags.get(); - bits = Bits.get(); +void BeamModuleAssembler::emit_bs_get_integer2(const ArgLabel &Fail, + const ArgRegister &Ctx, + const ArgWord &Live, + const ArgSource &Sz, + const ArgWord &Unit, + const ArgWord &Flags, + const ArgRegister &Dst) { + Uint size; + Uint flags = Flags.get(); - if (bits >= SMALL_BITS) { - emit_gc_test_preserve(ArgVal(ArgVal::Word, BIG_NEED_FOR_BITS(bits)), - Live, - ctx.reg); + if (flags & BSF_NATIVE) { + flags &= ~BSF_NATIVE; + flags |= BSF_LITTLE; } - lea(ARG4, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb))); - - if (bits >= SMALL_BITS) { - emit_enter_runtime<Update::eHeap>(Live.get()); + if (Sz.isSmall() && Sz.as<ArgSmall>().getUnsigned() < 8 * sizeof(Uint) && + (size = 
Sz.as<ArgSmall>().getUnsigned() * Unit.get()) < + 8 * sizeof(Uint)) { + /* Segment of a fixed size supported by bs_match. */ + const ArgVal match[] = {ArgAtom(am_ensure_at_least), + ArgWord(size), + ArgWord(1), + ArgAtom(am_integer), + Live, + ArgWord(flags), + ArgWord(size), + ArgWord(1), + Dst}; + + const Span<ArgVal> args(match, sizeof(match) / sizeof(match[0])); + emit_i_bs_match(Fail, Ctx, args); } else { - emit_enter_runtime(Live.get()); - } - - a.mov(ARG1, c_p); - a.mov(ARG2, bits); - a.mov(ARG3, flags); - /* ARG4 set above. */ - runtime_call<4>(erts_bs_get_integer_2); - - if (bits >= SMALL_BITS) { - emit_leave_runtime<Update::eHeap>(Live.get()); - } else { - emit_leave_runtime(Live.get()); - } - - emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); - mov_arg(Dst, ARG1); -} - -void BeamModuleAssembler::emit_i_bs_get_integer(const ArgRegister &Ctx, - const ArgLabel &Fail, - const ArgWord &Live, - const ArgWord &FlagsAndUnit, - const ArgSource &Sz, - const ArgRegister &Dst) { - Label fail; - int unit; - - fail = resolve_beam_label(Fail, dispUnknown); - unit = FlagsAndUnit.get() >> 3; - - if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { - mov_arg(ARG3, Ctx); - mov_arg(ARG4, FlagsAndUnit); - mov_arg(ARG6, Live); + Label fail = resolve_beam_label(Fail, dispUnknown); + int unit = Unit.get(); + + if (emit_bs_get_field_size(Sz, unit, fail, ARG5) >= 0) { + /* This operation can be expensive if a bignum can be + * created because there can be a garbage collection. */ + auto max = std::get<1>(getClampedRange(Sz)); + bool potentially_expensive = + max >= SMALL_BITS || (max * Unit.get()) >= SMALL_BITS; + + mov_arg(ARG3, Ctx); + mov_imm(ARG4, flags); + if (potentially_expensive) { + mov_arg(ARG6, Live); + } else { +#ifdef DEBUG + /* Never actually used. 
*/ + mov_imm(ARG6, 1023); +#endif + } - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | - Update::eReductions>(Live.get()); + if (potentially_expensive) { + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | + Update::eReductions>(Live.get()); + } else { + comment("simplified entering runtime because result is always " + "small"); + emit_enter_runtime(Live.get()); + } - a.mov(ARG1, c_p); - load_x_reg_array(ARG2); - runtime_call<6>(beam_jit_bs_get_integer); + a.mov(ARG1, c_p); + if (potentially_expensive) { + load_x_reg_array(ARG2); + } else { +#ifdef DEBUG + /* Never actually used. */ + mov_imm(ARG2, 0); +#endif + } + runtime_call<6>(beam_jit_bs_get_integer); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | - Update::eReductions>(Live.get()); + if (potentially_expensive) { + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | + Update::eReductions>(Live.get()); + } else { + emit_leave_runtime(Live.get()); + } - emit_branch_if_not_value(ARG1, fail); - mov_arg(Dst, ARG1); + emit_branch_if_not_value(ARG1, fail); + mov_arg(Dst, ARG1); + } } } @@ -738,11 +771,7 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED), - Live, - ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); /* Make field fetching slightly more compact by pre-loading the match * buffer into the right argument slot for `erts_bs_get_binary_all_2`. */ @@ -770,13 +799,13 @@ void BeamModuleAssembler::emit_i_bs_get_binary_all2(const ArgRegister &Ctx, } } - emit_enter_runtime<Update::eHeap>(Live.get()); + emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get()); a.mov(ARG1, c_p); /* ARG2 was set above. 
*/ runtime_call<2>(erts_bs_get_binary_all_2); - emit_leave_runtime<Update::eHeap>(Live.get()); + emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get()); mov_arg(Dst, ARG1); } @@ -796,11 +825,11 @@ void BeamGlobalAssembler::emit_bs_get_tail_shared() { a.sub(ARG5, TMP1, ARG4); emit_enter_runtime_frame(); - emit_enter_runtime<Update::eHeap>(); + emit_enter_runtime<Update::eHeapOnlyAlloc>(); runtime_call<5>(erts_extract_sub_binary); - emit_leave_runtime<Update::eHeap>(); + emit_leave_runtime<Update::eHeapOnlyAlloc>(); emit_leave_runtime_frame(); a.ret(a64::x30); @@ -811,11 +840,7 @@ void BeamModuleAssembler::emit_bs_get_tail(const ArgRegister &Ctx, const ArgWord &Live) { mov_arg(ARG1, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. */ - emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED), - Live, - ARG1); + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, Ctx, ARG1); fragment_call(ga->get_bs_get_tail_shared()); @@ -841,12 +866,34 @@ void BeamModuleAssembler::emit_bs_skip_bits(const ArgLabel &Fail, } void BeamModuleAssembler::emit_i_bs_skip_bits2(const ArgRegister &Ctx, - const ArgRegister &Bits, + const ArgRegister &Size, const ArgLabel &Fail, const ArgWord &Unit) { Label fail = resolve_beam_label(Fail, dispUnknown); - if (emit_bs_get_field_size(Bits, Unit.get(), fail, ARG1) >= 0) { + bool can_fail = true; + + if (always_small(Size)) { + auto [min, max] = getClampedRange(Size); + can_fail = !(0 <= min && (max >> (SMALL_BITS - ERL_UNIT_BITS)) == 0); + } + + if (!can_fail && Unit.get() == 1) { + comment("simplified skipping because the types are known"); + + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + auto [ctx, size] = load_sources(Ctx, TMP1, Size, TMP2); + + a.ldur(TMP3, emit_boxed_val(ctx.reg, position_offset)); + a.ldur(TMP4, emit_boxed_val(ctx.reg, size_offset)); + + a.add(TMP3, TMP3, size.reg, 
arm::lsr(_TAG_IMMED1_SIZE)); + a.cmp(TMP3, TMP4); + a.b_hi(resolve_beam_label(Fail, disp1MB)); + + a.stur(TMP3, emit_boxed_val(ctx.reg, position_offset)); + } else if (emit_bs_get_field_size(Size, Unit.get(), fail, ARG1) >= 0) { emit_bs_skip_bits(Fail, Ctx); } } @@ -875,22 +922,21 @@ void BeamModuleAssembler::emit_i_bs_get_binary2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to - * stash it during GC. */ - emit_gc_test_preserve(ArgVal(ArgVal::Word, EXTRACT_SUB_BIN_HEAP_NEED), + emit_gc_test_preserve(ArgWord(EXTRACT_SUB_BIN_HEAP_NEED), Live, + Ctx, ARG4); lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); - emit_enter_runtime<Update::eHeap>(Live.get()); + emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get()); a.mov(ARG1, c_p); a.ldr(ARG2, TMP_MEM1q); mov_imm(ARG3, Flags.get()); runtime_call<4>(erts_bs_get_binary_2); - emit_leave_runtime<Update::eHeap>(Live.get()); + emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get()); emit_branch_if_not_value(ARG1, fail); @@ -912,20 +958,18 @@ void BeamModuleAssembler::emit_i_bs_get_float2(const ArgRegister &Ctx, mov_arg(ARG4, Ctx); - /* Ctx is not guaranteed to be inside the live range, so we need to stash - * it during GC. 
*/ - emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, ARG4); + emit_gc_test_preserve(ArgWord(FLOAT_SIZE_OBJECT), Live, Ctx, ARG4); if (emit_bs_get_field_size(Sz, unit, fail, ARG2) >= 0) { lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); - emit_enter_runtime<Update::eHeap>(Live.get()); + emit_enter_runtime<Update::eHeapOnlyAlloc>(Live.get()); a.mov(ARG1, c_p); mov_imm(ARG3, Flags.get()); runtime_call<4>(erts_bs_get_float_2); - emit_leave_runtime<Update::eHeap>(Live.get()); + emit_leave_runtime<Update::eHeapOnlyAlloc>(Live.get()); emit_branch_if_not_value(ARG1, fail); @@ -983,18 +1027,283 @@ void BeamModuleAssembler::emit_i_bs_put_utf8(const ArgLabel &Fail, } } +/* + * ARG1 = pointer to match state + * ARG2 = number of bits left in binary (< 32) + * ARG3 = position in binary in bits + * ARG4 = base pointer to binary data + * + * See the comment for emit_bs_get_utf8_shared() for details about the + * return value. + */ +void BeamGlobalAssembler::emit_bs_get_utf8_short_shared() { + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + + const arm::Gp match_state = ARG1; + const arm::Gp bitdata = ARG2; + const arm::Gp bin_position = ARG3; + const arm::Gp bin_base = ARG4; + + Label two = a.newLabel(); + Label three_or_more = a.newLabel(); + Label four = a.newLabel(); + Label read_done = a.newLabel(); + Label ascii = a.newLabel(); + Label error = a.newLabel(); + + /* Calculate the number of bytes remaining in the binary and error + * out if less than one. */ + a.lsr(bitdata, bitdata, imm(3)); + a.cbz(bitdata, error); + + /* Calculate a byte mask so we can zero out trailing garbage. */ + a.neg(TMP5, bitdata, arm::lsl(3)); + mov_imm(TMP4, -1); + a.lsl(TMP4, TMP4, TMP5); + + /* If the position in the binary is not byte-aligned, we'll need + * to read one more byte. */ + a.ands(TMP1, bin_position, imm(7)); + a.cinc(bitdata, bitdata, imm(arm::CondCode::kNE)); + + /* Set up pointer to the first byte to read. 
*/ + a.add(TMP2, bin_base, bin_position, arm::lsr(3)); + + a.cmp(bitdata, 2); + a.b_eq(two); + a.b_hi(three_or_more); + + /* Read one byte (always byte-aligned). */ + a.ldrb(bitdata.w(), arm::Mem(TMP2)); + a.b(read_done); + + /* Read two bytes. */ + a.bind(two); + a.ldrh(bitdata.w(), arm::Mem(TMP2)); + a.b(read_done); + + a.bind(three_or_more); + a.cmp(bitdata, 3); + a.b_ne(four); + + /* Read three bytes. */ + a.ldrh(bitdata.w(), arm::Mem(TMP2)); + a.ldrb(TMP3.w(), arm::Mem(TMP2, 2)); + a.orr(bitdata, bitdata, TMP3, arm::lsl(16)); + a.b(read_done); + + /* Read four bytes (always unaligned). */ + a.bind(four); + a.ldr(bitdata.w(), arm::Mem(TMP2)); + + /* Handle the bytes read. */ + a.bind(read_done); + a.rev64(bitdata, bitdata); + a.lsl(bitdata, bitdata, TMP1); + a.and_(bitdata, bitdata, TMP4); + a.tbz(bitdata, imm(63), ascii); + a.b(labels[bs_get_utf8_shared]); + + /* Handle plain old ASCII (code point < 128). */ + a.bind(ascii); + a.add(bin_position, bin_position, imm(8)); + a.str(bin_position, arm::Mem(match_state, position_offset)); + a.mov(ARG1, imm(_TAG_IMMED1_SMALL)); + a.orr(ARG1, ARG1, bitdata, arm::lsr(56 - _TAG_IMMED1_SIZE)); + a.ret(a64::x30); + + /* Signal error. */ + a.bind(error); + mov_imm(ARG1, 0); + a.ret(a64::x30); +} + +/* + * ARG1 = pointer to match state + * ARG2 = 4 bytes read from the binary in big-endian order + * ARG3 = position in binary in bits + * + * On successful return, the extracted code point is a term tagged + * small in ARG1 and the position in the match state has been updated. On + * failure, ARG1 contains an invalid term where the tag bits are zero.
+ */ +void BeamGlobalAssembler::emit_bs_get_utf8_shared() { + const int position_offset = offsetof(ErlBinMatchBuffer, offset); + + const arm::Gp match_state = ARG1; + const arm::Gp bitdata = ARG2; + const arm::Gp bin_position = ARG3; + + const arm::Gp byte_count = ARG4; + + const arm::Gp shift = TMP4; + const arm::Gp control_mask = TMP5; + const arm::Gp error_mask = TMP6; + + /* UTF-8 has the following layout, where 'x' are data bits: + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110xxxxx, 10xxxxxx + * 3 bytes: 1110xxxx, 10xxxxxx 10xxxxxx + * 4 bytes: 11110xxx, 10xxxxxx 10xxxxxx 10xxxxxx + * + * Note that the number of leading bits is equal to the number of bytes, + * which makes it very easy to create masks for extraction and error + * checking. */ + + /* Calculate the number of bytes. */ + a.cls(byte_count, bitdata); + a.add(byte_count, byte_count, imm(1)); + + /* Get rid of the prefix bits. */ + a.lsl(bitdata, bitdata, byte_count); + a.lsr(bitdata, bitdata, byte_count); + + /* Calculate the bit shift now before we start to corrupt the + * byte_count. */ + mov_imm(shift, 64); + a.sub(shift, shift, byte_count, arm::lsl(3)); + + /* Shift down the value to the least significant part of the word. */ + a.lsr(bitdata, bitdata, shift); + + /* Matches the '10xxxxxx' components, leaving the header byte alone. */ + mov_imm(error_mask, 0x00808080ull << 32); + a.lsr(error_mask, error_mask, shift); + + /* Construct the control mask '0x00C0C0C0' (already shifted). */ + a.orr(control_mask, error_mask, error_mask, arm::lsr(1)); + + /* Assert that the header bits of each '10xxxxxx' component are correct, + * signaling errors by trashing the byte count with an illegal + * value (0). 
*/ + a.and_(TMP3, bitdata, control_mask); + a.cmp(TMP3, error_mask); + + a.ubfx(TMP1, bitdata, imm(8), imm(6)); + a.ubfx(TMP2, bitdata, imm(16), imm(6)); + a.ubfx(TMP3, bitdata, imm(24), imm(3)); + a.ubfx(bitdata, bitdata, imm(0), imm(6)); + + a.orr(bitdata, bitdata, TMP1, arm::lsl(6)); + a.orr(bitdata, bitdata, TMP2, arm::lsl(12)); + a.orr(bitdata, bitdata, TMP3, arm::lsl(18)); + + /* Check for too large code point. */ + mov_imm(TMP1, 0x10FFFF); + a.ccmp(bitdata, TMP1, imm(NZCV::kCF), arm::CondCode::kEQ); + + /* Check for the illegal range 16#D800 - 16#DFFF. */ + a.lsr(TMP1, bitdata, imm(11)); + a.ccmp(TMP1, imm(0xD800 >> 11), imm(NZCV::kZF), arm::CondCode::kLS); + a.csel(byte_count, byte_count, ZERO, imm(arm::CondCode::kNE)); + + /* Test for overlong UTF-8 sequence. That can be done by testing + * that the bits marked y below are all zero. + * + * 1 byte: 0xxxxxxx (not handled by this path) + * 2 bytes: 110yyyyx, 10xxxxxx + * 3 bytes: 1110yyyy, 10yxxxxx 10xxxxxx + * 4 bytes: 11110yyy, 10yyxxxx 10xxxxxx 10xxxxxx + * + * 1 byte: xx'xxxxx + * 2 bytes: y'yyyxx'xxxxx + * 3 bytes: y'yyyyx'xxxxx'xxxxx + * 4 bytes: y'yyyyx'xxxxx'xxxxx'xxxxx + * + * The y bits can be isolated by shifting down by the number of bits + * shown in this table: + * + * 2: 7 (byte_count * 4 - 1) + * 3: 11 (byte_count * 4 - 1) + * 4: 16 (byte_count * 4) + */ + + /* Calculate number of bits to shift. */ + a.lsl(TMP1, byte_count, imm(2)); + a.cmp(byte_count, imm(4)); + a.csetm(TMP2, imm(arm::CondCode::kNE)); + a.add(TMP1, TMP1, TMP2); + + /* Pre-fill the tag bits so that we can clear them on error. */ + mov_imm(TMP2, _TAG_IMMED1_SMALL); + + /* Now isolate the y bits and compare to zero. This check will + * be used in a CCMP further down. */ + a.lsr(TMP1, bitdata, TMP1); + a.cmp(TMP1, 0); + + /* Byte count must be 2, 3, or 4. 
*/ + a.sub(TMP1, byte_count, imm(2)); + a.ccmp(TMP1, imm(2), imm(NZCV::kCF), imm(arm::CondCode::kNE)); + + /* If we have failed, we set byte_count to zero to ensure that the + * position update nops, and set the pre-tagged result to zero so + * that we can check for error in module code by testing the tag + * bits. */ + a.csel(byte_count, byte_count, ZERO, imm(arm::CondCode::kLS)); + a.csel(TMP2, TMP2, ZERO, imm(arm::CondCode::kLS)); + + a.add(bin_position, bin_position, byte_count, arm::lsl(3)); + a.str(bin_position, arm::Mem(match_state, position_offset)); + a.orr(ARG1, TMP2, bitdata, arm::lsl(_TAG_IMMED1_SIZE)); + + a.ret(a64::x30); +} + void BeamModuleAssembler::emit_bs_get_utf8(const ArgRegister &Ctx, const ArgLabel &Fail) { - mov_arg(ARG1, Ctx); - lea(ARG1, emit_boxed_val(ARG1, offsetof(ErlBinMatchState, mb))); + const int base_offset = offsetof(ErlBinMatchBuffer, base); + const int position_offset = offsetof(ErlBinMatchBuffer, offset); - emit_enter_runtime(); + const arm::Gp match_state = ARG1; + const arm::Gp bitdata = ARG2; + const arm::Gp bin_position = ARG3; + const arm::Gp bin_base = ARG4; + const arm::Gp bin_size = ARG5; - runtime_call<1>(erts_bs_get_utf8); + auto ctx = load_source(Ctx, ARG6); - emit_leave_runtime(); + Label non_ascii = a.newLabel(); + Label fallback = a.newLabel(); + Label check = a.newLabel(); + Label done = a.newLabel(); - emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); + lea(match_state, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb))); + ERTS_CT_ASSERT_FIELD_PAIR(ErlBinMatchBuffer, offset, size); + a.ldp(bin_position, bin_size, arm::Mem(ARG1, position_offset)); + a.ldr(bin_base, arm::Mem(ARG1, base_offset)); + a.sub(bitdata, bin_size, bin_position); + a.cmp(bitdata, imm(32)); + a.b_lo(fallback); + + emit_read_bits(32, bin_base, bin_position, bitdata); + a.tbnz(bitdata, imm(63), non_ascii); + + /* Handle plain old ASCII (code point < 128). 
*/ + a.add(bin_position, bin_position, imm(8)); + a.str(bin_position, arm::Mem(ARG1, position_offset)); + a.mov(ARG1, imm(_TAG_IMMED1_SMALL)); + a.orr(ARG1, ARG1, bitdata, arm::lsr(56 - _TAG_IMMED1_SIZE)); + a.b(done); + + /* Handle code point >= 128. */ + a.bind(non_ascii); + fragment_call(ga->get_bs_get_utf8_shared()); + a.b(check); + + /* + * Handle the case that there are not 4 bytes available in the binary. + */ + + a.bind(fallback); + fragment_call(ga->get_bs_get_utf8_short_shared()); + + a.bind(check); + ERTS_CT_ASSERT((_TAG_IMMED1_SMALL & 1) != 0); + a.tbz(ARG1, imm(0), resolve_beam_label(Fail, disp32K)); + + a.bind(done); } void BeamModuleAssembler::emit_i_bs_get_utf8(const ArgRegister &Ctx, @@ -1291,14 +1600,14 @@ void BeamModuleAssembler::emit_i_bs_append(const ArgLabel &Fail, mov_arg(ArgXRegister(Live.get()), Bin); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get() + 1); a.mov(ARG1, c_p); load_x_reg_array(ARG2); runtime_call<6>(erts_bs_append); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get() + 1); if (Fail.get() != 0) { @@ -1355,11 +1664,11 @@ void BeamModuleAssembler::emit_bs_init_writable() { /* We have an implicit liveness of 0, so we don't need to stash X * registers. 
*/ - emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(0); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc>(0); runtime_call<2>(erts_bs_init_writable); - emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap>(0); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc>(0); a.mov(XREG0, ARG1); } @@ -1367,7 +1676,7 @@ void BeamModuleAssembler::emit_bs_init_writable() { void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.mov(XREG0, a64::x30); - emit_enter_runtime<Update::eStack | Update::eHeap>(0); + emit_enter_runtime<Update::eHeapAlloc>(0); /* ARG3 is already set by the caller */ a.mov(ARG2, ARG4); @@ -1375,7 +1684,7 @@ void BeamGlobalAssembler::emit_bs_create_bin_error_shared() { a.mov(ARG1, c_p); runtime_call<4>(beam_jit_bs_construct_fail_info); - emit_leave_runtime<Update::eStack | Update::eHeap>(0); + emit_leave_runtime<Update::eHeapAlloc>(0); a.mov(ARG4, ZERO); a.mov(ARG2, XREG0); @@ -1429,10 +1738,49 @@ void BeamGlobalAssembler::emit_bs_bit_size_shared() { a.ret(a64::x30); } +/* + * ARG1 = tagged bignum term + */ +void BeamGlobalAssembler::emit_get_sint64_shared() { + Label success = a.newLabel(); + Label fail = a.newLabel(); + + emit_is_boxed(fail, ARG1); + arm::Gp boxed_ptr = emit_ptr_val(TMP3, ARG1); + a.ldr(TMP1, emit_boxed_val(boxed_ptr)); + a.ldr(TMP2, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + a.and_(TMP1, TMP1, imm(_TAG_HEADER_MASK)); + a.cmp(TMP1, imm(POS_BIG_SUBTAG)); + a.b_eq(success); + + a.cmp(TMP1, imm(NEG_BIG_SUBTAG)); + a.b_ne(fail); + + a.neg(TMP2, TMP2); + + a.bind(success); + { + a.mov(ARG1, TMP2); + /* Clear Z flag. + * + * TMP1 is known to be POS_BIG_SUBTAG or NEG_BIG_SUBTAG at this point. 
+ */ + ERTS_CT_ASSERT(POS_BIG_SUBTAG != 0 && NEG_BIG_SUBTAG != 0); + a.tst(TMP1, TMP1); + a.ret(a64::x30); + } + + a.bind(fail); + { + a.tst(ZERO, ZERO); + a.ret(a64::x30); + } +} + struct BscSegment { BscSegment() : type(am_false), unit(1), flags(0), src(ArgNil()), size(ArgNil()), - error_info(0), effectiveSize(-1) { + error_info(0), effectiveSize(-1), action(action::DIRECT) { } Eterm type; @@ -1443,19 +1791,443 @@ struct BscSegment { Uint error_info; Sint effectiveSize; + + /* Here are sub actions for storing integer segments. + * + * We use the ACCUMULATE_FIRST and ACCUMULATE actions to shift the + * values of segments with known, small sizes (no more than 64 bits) + * into an accumulator register. + * + * When no more segments can be accumulated, the STORE action is + * used to store the value of the accumulator into the binary. + * + * The DIRECT action is used when it is not possible to use the + * accumulator (for unknown or too large sizes). + */ + enum class action { DIRECT, ACCUMULATE_FIRST, ACCUMULATE, STORE } action; }; +static std::vector<BscSegment> bs_combine_segments( + const std::vector<BscSegment> segments) { + std::vector<BscSegment> segs; + + for (auto seg : segments) { + switch (seg.type) { + case am_integer: { + if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) { + /* Unknown or too large size. Handle using the default + * DIRECT action. */ + segs.push_back(seg); + continue; + } + + if (seg.flags & BSF_LITTLE || segs.size() == 0 || + segs.back().action == BscSegment::action::DIRECT) { + /* There are no previous compatible ACCUMULATE / STORE + * actions. Create the first ones. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + auto prev = segs.back(); + if (prev.flags & BSF_LITTLE) { + /* Little-endian segments cannot be combined with other + * segments. Create new ACCUMULATE_FIRST / STORE actions. 
*/ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + continue; + } + + /* The current segment is compatible with the previous + * segment. Try combining them. */ + if (prev.effectiveSize + seg.effectiveSize <= 64) { + /* The combined values of the segments fit in the + * accumulator. Insert an ACCUMULATE action for the + * current segment before the pre-existing STORE + * action. */ + segs.pop_back(); + prev.effectiveSize += seg.effectiveSize; + seg.action = BscSegment::action::ACCUMULATE; + segs.push_back(seg); + segs.push_back(prev); + } else { + /* The size exceeds 64 bits. Can't combine. */ + seg.action = BscSegment::action::ACCUMULATE_FIRST; + segs.push_back(seg); + seg.action = BscSegment::action::STORE; + segs.push_back(seg); + } + break; + } + default: + segs.push_back(seg); + break; + } + } + return segs; +} + +/* + * In: + * bin_offset = register to store the bit offset into the binary + * bit_offset = current bit offset into binary, or -1 if unknown + * size = size of segment to be constructed + * (ignored if size_reg is valid register) + * size_reg = if a valid register, it contains the size of + * the segment to be constructed + * + * Out: + * bin_offset register = if bit_offset is not byte aligned, the bit + * offset into the binary + * TMP1 = pointer to the current byte in the binary + * + * Preserves all other ARG* registers. + */ +void BeamModuleAssembler::update_bin_state(arm::Gp bin_offset, + Sint bit_offset, + Sint size, + arm::Gp size_reg) { + int cur_bin_offset = offsetof(ErtsSchedulerRegisters, + aux_regs.d.erl_bits_state.erts_current_bin_); + arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset); + arm::Mem mem_bin_offset = + arm::Mem(scheduler_registers, cur_bin_offset + sizeof(Eterm)); + + if (bit_offset % 8 != 0) { + /* The bit offset is unknown or not byte-aligned. 
*/ + ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state, + erts_current_bin_, + erts_bin_offset_); + a.ldp(TMP2, bin_offset, mem_bin_base); + + if (size_reg.isValid()) { + a.add(TMP1, bin_offset, size_reg); + } else { + add(TMP1, bin_offset, size); + } + a.str(TMP1, mem_bin_offset); + + a.add(TMP1, TMP2, bin_offset, arm::lsr(3)); + } else { + comment("optimized updating of binary construction state"); + ASSERT(size >= 0 || size_reg.isValid()); + ASSERT(bit_offset % 8 == 0); + a.ldr(TMP1, mem_bin_base); + if (size_reg.isValid()) { + if (bit_offset == 0) { + a.str(size_reg, mem_bin_offset); + } else { + add(TMP2, size_reg, bit_offset); + a.str(TMP2, mem_bin_offset); + } + } else { + mov_imm(TMP2, bit_offset + size); + a.str(TMP2, mem_bin_offset); + } + if (bit_offset != 0) { + add(TMP1, TMP1, bit_offset >> 3); + } + } +} + +/* + * The size of the segment is assumed to be in ARG3. + */ +void BeamModuleAssembler::set_zero(Sint effectiveSize) { + Label store_units = a.newLabel(); + Label less_than_a_store_unit = a.newLabel(); + Sint store_unit = 1; + + update_bin_state(ARG2, -1, -1, ARG3); + + if (effectiveSize >= 256) { + /* Store four 64-bit machine words when the size is + * known and at least 256 bits. */ + store_unit = 4; + a.movi(a64::d31, 0); + } else if (effectiveSize >= 128) { + /* Store two 64-bit machine words when the size is + * known and at least 128 bits. */ + store_unit = 2; + } + + if (effectiveSize < Sint(store_unit * 8 * sizeof(Eterm))) { + /* The size is either not known or smaller than a word.
*/ + a.cmp(ARG3, imm(store_unit * 8 * sizeof(Eterm))); + a.b_lt(less_than_a_store_unit); + } + + a.bind(store_units); + if (store_unit == 4) { + a.stp(a64::q31, a64::q31, arm::Mem(TMP1).post(sizeof(Eterm[4]))); + } else if (store_unit == 2) { + a.stp(ZERO, ZERO, arm::Mem(TMP1).post(sizeof(Eterm[2]))); + } else { + a.str(ZERO, arm::Mem(TMP1).post(sizeof(Eterm))); + } + a.sub(ARG3, ARG3, imm(store_unit * 8 * sizeof(Eterm))); + + a.cmp(ARG3, imm(store_unit * 8 * sizeof(Eterm))); + a.b_ge(store_units); + + a.bind(less_than_a_store_unit); + if (effectiveSize < 0) { + /* Unknown size. */ + Label byte_loop = a.newLabel(); + Label done = a.newLabel(); + + ASSERT(store_unit = 1); + + a.cbz(ARG3, done); + + a.bind(byte_loop); + a.strb(ZERO.w(), arm::Mem(TMP1).post(1)); + a.subs(ARG3, ARG3, imm(8)); + a.b_gt(byte_loop); + + a.bind(done); + } else if (effectiveSize % (store_unit * 8 * sizeof(Eterm)) != 0) { + /* The size is known, and we know that there are less than + * 256 bits to initialize. */ + if (store_unit == 4 && (effectiveSize & 255) >= 128) { + a.stp(ZERO, ZERO, arm::Mem(TMP1).post(16)); + } + + if ((effectiveSize & 127) >= 64) { + a.str(ZERO, arm::Mem(TMP1).post(8)); + } + + if ((effectiveSize & 63) >= 32) { + a.str(ZERO.w(), arm::Mem(TMP1).post(4)); + } + + if ((effectiveSize & 31) >= 16) { + a.strh(ZERO.w(), arm::Mem(TMP1).post(2)); + } + + if ((effectiveSize & 15) >= 8) { + a.strb(ZERO.w(), arm::Mem(TMP1).post(1)); + } + + if ((effectiveSize & 7) > 0) { + a.strb(ZERO.w(), arm::Mem(TMP1)); + } + } +} + +/* + * In: + * + * ARG1 = valid unicode code point (>= 0x80) to encode + * + * Out: + * + * ARG1 = the code point encoded in UTF-8.
+ * ARG4 = number of bits of result (16, 24, or 32) + * + * The first byte of the encoding is placed in the least significant + * byte of ARG1, the second byte in the next byte, and so on. + * + * Preserves other ARG* registers, clobbers TMP* registers + */ +void BeamGlobalAssembler::emit_construct_utf8_shared() { + Label more_than_two_bytes = a.newLabel(); + Label four_bytes = a.newLabel(); + const arm::Gp value = ARG1; + const arm::Gp num_bits = ARG4; + + a.cmp(value, imm(0x800)); + a.b_hs(more_than_two_bytes); + + /* Encode Unicode code point in two bytes. */ + a.ubfiz(TMP1, value, imm(8), imm(6)); + mov_imm(TMP2, 0x80c0); + a.orr(TMP1, TMP1, value, arm::lsr(6)); + mov_imm(num_bits, 16); + a.orr(value, TMP1, TMP2); + a.ret(a64::x30); + + /* Test whether the value should be encoded in four bytes. */ + a.bind(more_than_two_bytes); + a.lsr(TMP1, value, imm(16)); + a.cbnz(TMP1, four_bytes); + + /* Encode Unicode code point in three bytes. */ + a.lsl(TMP1, value, imm(2)); + a.ubfiz(TMP2, value, imm(16), imm(6)); + a.and_(TMP1, TMP1, imm(0x3f00)); + mov_imm(num_bits, 24); + a.orr(TMP1, TMP1, value, arm::lsr(12)); + a.orr(TMP1, TMP1, TMP2); + mov_imm(TMP2, 0x8080e0); + a.orr(value, TMP1, TMP2); + a.ret(a64::x30); + + /* Encode Unicode code point in four bytes.
*/ + a.bind(four_bytes); + a.lsl(TMP1, value, imm(10)); + a.lsr(TMP2, value, imm(4)); + a.and_(TMP1, TMP1, imm(0x3f0000)); + a.and_(TMP2, TMP2, imm(0x3f00)); + a.bfxil(TMP1, value, imm(18), imm(14)); + mov_imm(num_bits, 32); + a.bfi(TMP1, value, imm(24), imm(6)); + a.orr(TMP1, TMP1, TMP2); + mov_imm(TMP2, 0x808080f0); + a.orr(value, TMP1, TMP2); + a.ret(a64::x30); +} + +/* + * Emit code that stores the code point in Src, encoded in UTF-8, at + * the current position of the binary under construction. + * + * bit_offset is tested for being a multiple of 32 in order to select + * a plain 4-byte store. is_byte_aligned tells whether the position + * is known to be on a byte boundary; when it is not, the alignment + * is tested at run time. + * + * NOTE(review): Src is untagged without any type or range test, so + * it is presumably validated before this code runs -- confirm + * against the caller. + */ +void BeamModuleAssembler::emit_construct_utf8(const ArgVal &Src, + Sint bit_offset, + bool is_byte_aligned) { + Label prepare_store = a.newLabel(); + Label store = a.newLabel(); + Label next = a.newLabel(); + + comment("construct utf8 segment"); + auto src = load_source(Src, ARG1); + + /* Code points below 0x80 are stored as they are in a single + * byte; the shared fragment handles all other code points. */ + a.lsr(ARG1, src.reg, imm(_TAG_IMMED1_SIZE)); + mov_imm(ARG4, 8); + a.cmp(ARG1, imm(0x80)); + a.b_lo(prepare_store); + + fragment_call(ga->get_construct_utf8_shared()); + + a.bind(prepare_store); + arm::Gp bin_offset = ARG3; + update_bin_state(bin_offset, bit_offset, -1, ARG4); + + if (!is_byte_aligned) { + /* Not known to be byte-aligned. Must test alignment. */ + a.ands(TMP2, bin_offset, imm(7)); + a.b_eq(store); + + /* We must combine the last partial byte with the UTF-8 + * encoded code point. */ + a.ldrb(TMP5.w(), arm::Mem(TMP1)); + + a.rev64(TMP4, ARG1); + a.lsr(TMP4, TMP4, TMP2); + a.rev64(TMP4, TMP4); + + a.lsl(TMP5, TMP5, TMP2); + a.and_(TMP5, TMP5, imm(~0xff)); + a.lsr(TMP5, TMP5, TMP2); + + a.orr(ARG1, TMP4, TMP5); + + /* The shifted encoding occupies one more byte. */ + a.add(ARG4, ARG4, imm(8)); + } + + a.bind(store); + if (bit_offset % (4 * 8) == 0) { + /* This segment is aligned on a 4-byte boundary. This implies + * that a 4-byte write will be inside the allocated binary. */ + a.str(ARG1.w(), arm::Mem(TMP1)); + } else { + Label do_store_1 = a.newLabel(); + Label do_store_2 = a.newLabel(); + + /* Unsuitable or unknown alignment. We must be careful not + * to write beyond the allocated end of the binary.
*/ + a.cmp(ARG4, imm(8)); + a.b_ne(do_store_1); + + a.strb(ARG1.w(), arm::Mem(TMP1)); + a.b(next); + + a.bind(do_store_1); + a.cmp(ARG4, imm(24)); + a.b_hi(do_store_2); + + a.strh(ARG1.w(), arm::Mem(TMP1)); + a.cmp(ARG4, imm(16)); + a.b_eq(next); + + a.lsr(ARG1, ARG1, imm(16)); + a.strb(ARG1.w(), arm::Mem(TMP1, 2)); + a.b(next); + + a.bind(do_store_2); + a.str(ARG1.w(), arm::Mem(TMP1)); + + if (!is_byte_aligned) { + /* A four-byte encoding that was shifted into place + * needs a fifth byte. */ + a.cmp(ARG4, imm(32)); + a.b_eq(next); + + a.lsr(ARG1, ARG1, imm(32)); + a.strb(ARG1.w(), arm::Mem(TMP1, 4)); + } + } + + a.bind(next); +} + +/* + * Store ARG4 bits from ARG8 starting at a bit position that is not + * on a byte boundary, keeping the bits that precede that position + * in the first byte. + * + * In: + * TMP1 = pointer to current byte + * ARG3 = bit offset + * ARG4 = number of bits to write + * ARG8 = data to write + * + * TMP1 is advanced past the bytes written. ARG4, ARG8, TMP4, TMP5, + * and TMP6 are overwritten. + */ +void BeamGlobalAssembler::emit_store_unaligned() { + Label loop = a.newLabel(); + Label done = a.newLabel(); + const arm::Gp left_bit_offset = ARG3; + const arm::Gp right_bit_offset = TMP6; + const arm::Gp num_bits = ARG4; + const arm::Gp bitdata = ARG8; + + a.ldrb(TMP5.w(), arm::Mem(TMP1)); + + a.and_(TMP4, bitdata, imm(0xff)); + a.lsr(TMP4, TMP4, left_bit_offset); + + a.lsl(TMP5, TMP5, left_bit_offset); + a.and_(TMP5, TMP5, imm(~0xff)); + a.lsr(TMP5, TMP5, left_bit_offset); + + a.orr(TMP5, TMP4, TMP5); + + a.strb(TMP5.w(), arm::Mem(TMP1).post(1)); + + mov_imm(right_bit_offset, 8); + a.sub(right_bit_offset, right_bit_offset, left_bit_offset); + + a.rev64(bitdata, bitdata); + a.lsl(bitdata, bitdata, right_bit_offset); + + a.subs(num_bits, num_bits, right_bit_offset); + a.b_le(done); + + a.bind(loop); + a.ror(bitdata, bitdata, imm(56)); + a.strb(bitdata.w(), arm::Mem(TMP1).post(1)); + a.subs(num_bits, num_bits, imm(8)); + a.b_gt(loop); + + a.bind(done); + a.ret(a64::x30); +} + void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, const ArgWord &Alloc, const ArgWord &Live0, const ArgRegister &Dst, const Span<ArgVal> &args) { Uint num_bits = 0; + Uint estimated_num_bits = 0; std::size_t n = args.size(); std::vector<BscSegment> segments; - Label error; + Label error; /*
Intentionally uninitialized */ ArgWord Live = Live0; arm::Gp sizeReg; + Sint allocated_size = -1; + bool need_error_handler = false; /* * Collect information about each segment and calculate sizes of @@ -1501,17 +2273,67 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, seg.error_info = beam_jit_set_bsc_segment_op(bsc_segment, bsc_op); /* + * Test whether we can omit the code for the error handler. + */ + switch (seg.type) { + case am_append: + if (!(exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_binary: + if (!(seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && + exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit)) { + need_error_handler = true; + } + break; + case am_integer: + if (!exact_type<BeamTypeId::Integer>(seg.src)) { + need_error_handler = true; + } + break; + case am_private_append: + case am_string: + break; + default: + need_error_handler = true; + break; + } + + /* * Attempt to calculate the effective size of this segment. - * Give up is variable or invalid. + * Give up if variable or invalid. */ if (seg.size.isSmall() && seg.unit != 0) { Uint unsigned_size = seg.size.as<ArgSmall>().getUnsigned(); - if ((unsigned_size >> (sizeof(Eterm) - 1) * 8) == 0) { + if ((unsigned_size >> (sizeof(Eterm) - 1) * 8) != 0) { + /* Suppress creation of heap binary. */ + estimated_num_bits += (ERL_ONHEAP_BIN_LIMIT + 1) * 8; + } else { /* This multiplication cannot overflow. 
*/ Uint seg_size = seg.unit * unsigned_size; seg.effectiveSize = seg_size; num_bits += seg_size; + estimated_num_bits += seg_size; + } + } else if (seg.unit > 0) { + auto max = std::min(std::get<1>(getClampedRange(seg.size)), + Sint((ERL_ONHEAP_BIN_LIMIT + 1) * 8)); + estimated_num_bits += max * seg.unit; + } else { + switch (seg.type) { + case am_utf8: + case am_utf16: + case am_utf32: + estimated_num_bits += 32; + break; + default: + /* Suppress creation of heap binary. */ + estimated_num_bits += (ERL_ONHEAP_BIN_LIMIT + 1) * 8; + break; } } @@ -1520,14 +2342,15 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, /* At least one segment will need a dynamic size * calculation. */ sizeReg = ARG8; + need_error_handler = true; } segments.insert(segments.end(), seg); } - if (Fail.get() != 0) { + if (need_error_handler && Fail.get() != 0) { error = resolve_beam_label(Fail, dispUnknown); - } else { + } else if (need_error_handler) { Label past_error = a.newLabel(); a.b(past_error); @@ -1550,6 +2373,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, } a.bind(past_error); + } else { + comment("(cannot fail)"); } /* We count the total number of bits in an unsigned integer. 
To @@ -1575,13 +2400,49 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.size.isAtom() && seg.size.as<ArgAtom>().get() == am_all && seg.type == am_binary) { comment("size of an entire binary"); - mov_arg(ARG1, seg.src); - a.mov(ARG3, ARG1); - fragment_call(ga->get_bs_bit_size_shared()); - if (exact_type(seg.src, BEAM_TYPE_BITSTRING)) { - comment("skipped check for success since the source " - "is always a bit string"); + if (exact_type<BeamTypeId::Bitstring>(seg.src)) { + auto src = load_source(seg.src, ARG1); + arm::Gp boxed_ptr = emit_ptr_val(ARG1, src.reg); + auto unit = getSizeUnit(seg.src); + bool is_bitstring = unit == 0 || std::gcd(unit, 8) != 8; + + if (is_bitstring) { + comment("inlined size code because the value is always " + "a bitstring"); + } else { + comment("inlined size code because the value is always " + "a binary"); + } + + a.ldur(TMP2, emit_boxed_val(boxed_ptr, sizeof(Eterm))); + + if (is_bitstring) { + a.ldur(TMP1, emit_boxed_val(boxed_ptr)); + } + + a.add(sizeReg, sizeReg, TMP2, arm::lsl(3)); + + if (is_bitstring) { + Label not_sub_bin = a.newLabel(); + const int bit_number = 3; + ERTS_CT_ASSERT( + (_TAG_HEADER_SUB_BIN & (1 << bit_number)) != 0 && + (_TAG_HEADER_REFC_BIN & (1 << bit_number)) == 0 && + (_TAG_HEADER_HEAP_BIN & (1 << bit_number)) == 0); + + a.tbz(TMP1, imm(bit_number), not_sub_bin); + + a.ldurb(TMP2.w(), + emit_boxed_val(boxed_ptr, + offsetof(ErlSubBin, bitsize))); + a.add(sizeReg, sizeReg, TMP2); + + a.bind(not_sub_bin); + } } else { + mov_arg(ARG1, seg.src); + a.mov(ARG3, ARG1); + fragment_call(ga->get_bs_bit_size_shared()); if (Fail.get() == 0) { mov_imm(ARG4, beam_jit_update_bsc_reason_info(seg.error_info, @@ -1590,14 +2451,14 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, BSC_VALUE_ARG3)); } a.b_mi(resolve_label(error, disp1MB)); + a.add(sizeReg, sizeReg, ARG1); } - a.add(sizeReg, sizeReg, ARG1); } else if (seg.unit != 0) { bool can_fail = true; comment("size 
binary/integer/float/string"); if (always_small(seg.size)) { - auto [min, _] = getIntRange(seg.size); + auto min = std::get<0>(getClampedRange(seg.size)); if (min >= 0) { can_fail = false; } @@ -1615,8 +2476,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (always_small(seg.size)) { comment("skipped test for small size since it is always small"); - } else if (always_one_of(seg.size, - BEAM_TYPE_FLOAT | BEAM_TYPE_INTEGER)) { + } else if (always_one_of<BeamTypeId::Number>(seg.size)) { comment("simplified test for small size since it is a number"); emit_is_not_boxed(error, ARG3); } else { @@ -1627,10 +2487,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (can_fail) { a.tbnz(ARG3, 63, resolve_label(error, disp32K)); } - a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE)); if (seg.unit == 1) { - a.add(sizeReg, sizeReg, TMP1); + a.add(sizeReg, sizeReg, ARG3, arm::asr(_TAG_IMMED1_SIZE)); } else { + a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE)); if (Fail.get() == 0) { mov_imm(ARG4, beam_jit_update_bsc_reason_info( @@ -1639,7 +2499,7 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, BSC_INFO_SIZE, BSC_VALUE_ARG3)); } - a.tst(TMP1, imm(0xffful << 52)); + a.tst(TMP1, imm(0xffful << (SMALL_BITS - ERL_UNIT_BITS))); a.b_ne(resolve_label(error, disp1MB)); mov_imm(TMP2, seg.unit); a.madd(sizeReg, TMP1, TMP2, sizeReg); @@ -1649,24 +2509,60 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, case am_utf8: { comment("size utf8"); Label next = a.newLabel(); - auto src_reg = load_source(seg.src, TMP1); - a.lsr(TMP1, src_reg.reg, imm(_TAG_IMMED1_SIZE)); - mov_imm(TMP2, 1 * 8); + mov_arg(ARG3, seg.src); + + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG3)); + } + + if (always_small(seg.src)) { + comment("skipped test for small value since it is always " + "small"); + } else if (always_one_of<BeamTypeId::Integer, 
+ BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified test for small operand since other " + "types are boxed"); + emit_is_not_boxed(resolve_label(error, dispUnknown), ARG3); + } else { + a.and_(TMP1, ARG3, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.b_ne(resolve_label(error, disp1MB)); + } + + a.asr(TMP1, ARG3, imm(_TAG_IMMED1_SIZE)); + mov_imm(TMP2, 1); a.cmp(TMP1, imm(0x7F)); a.b_ls(next); - mov_imm(TMP2, 2 * 8); + mov_imm(TMP2, 2); a.cmp(TMP1, imm(0x7FFUL)); a.b_ls(next); + /* Ensure that the value is not in the invalid range + * 0xD800 through 0xDFFF. */ + a.lsr(TMP3, TMP1, imm(11)); + a.cmp(TMP3, 0x1b); + a.b_eq(resolve_label(error, disp1MB)); + a.cmp(TMP1, imm(0x10000UL)); - mov_imm(TMP2, 3 * 8); - mov_imm(TMP3, 4 * 8); - a.csel(TMP2, TMP2, TMP3, arm::CondCode::kLO); + a.cset(TMP2, arm::CondCode::kHS); + a.add(TMP2, TMP2, imm(3)); + + auto [min, max] = getClampedRange(seg.src); + if (0 <= min && max < 0x110000) { + comment("skipped range check for unicode code point"); + } else { + a.cmp(TMP1, 0x110000); + a.b_hs(resolve_label(error, disp1MB)); + } a.bind(next); - a.add(sizeReg, sizeReg, TMP2); + a.add(sizeReg, sizeReg, TMP2, arm::lsl(3)); break; } case am_utf16: { @@ -1742,21 +2638,28 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(ARG1, c_p); load_x_reg_array(ARG2); - emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_enter_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get() + 1); runtime_call<6>(erts_bs_append_checked); - emit_leave_runtime<Update::eStack | Update::eHeap | Update::eXRegs | + emit_leave_runtime<Update::eHeapAlloc | Update::eXRegs | Update::eReductions>(Live.get() + 1); - if (Fail.get() == 0) { - mov_arg(ARG3, ArgXRegister(Live.get())); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_FVALUE, - BSC_VALUE_ARG3)); + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, 
getSizeUnit(seg.src)) == seg.unit) { + /* There is no way the call can fail with a system_limit + * exception on a 64-bit architecture. */ + comment("skipped test for success because units are compatible"); + } else { + if (Fail.get() == 0) { + mov_arg(ARG3, ArgXRegister(Live.get())); + mov_imm(ARG4, + beam_jit_update_bsc_reason_info(seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_FVALUE, + BSC_VALUE_ARG3)); + } + emit_branch_if_not_value(ARG1, resolve_label(error, dispUnknown)); } - emit_branch_if_not_value(ARG1, resolve_label(error, dispUnknown)); } else if (segments[0].type == am_private_append) { BscSegment seg = segments[0]; comment("private append to binary"); @@ -1773,6 +2676,82 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, runtime_call<4>(erts_bs_private_append_checked); emit_leave_runtime(Live.get()); /* There is no way the call can fail on a 64-bit architecture. */ + } else if (estimated_num_bits % 8 == 0 && + estimated_num_bits / 8 <= ERL_ONHEAP_BIN_LIMIT) { + static constexpr auto cur_bin_offset = + offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state) + + offsetof(struct erl_bits_state, erts_current_bin_); + Uint need; + + arm::Mem mem_bin_base = arm::Mem(scheduler_registers, cur_bin_offset); + + if (sizeReg.isValid()) { + Label after_gc_check = a.newLabel(); + + comment("allocate heap binary of dynamic size (=< %ld bits)", + estimated_num_bits); + + /* Calculate number of bytes to allocate. */ + need = (heap_bin_size(0) + Alloc.get() + S_RESERVED); + a.lsr(sizeReg, sizeReg, imm(3)); + a.add(TMP3, sizeReg, imm(7)); + a.and_(TMP3, TMP3, imm(-8)); + a.add(TMP1, TMP3, imm(need * sizeof(Eterm))); + + /* Do a GC test. 
*/ + a.add(ARG3, HTOP, TMP1); + a.cmp(ARG3, E); + a.b_ls(after_gc_check); + + a.stp(sizeReg, TMP3, TMP_MEM1q); + + mov_imm(ARG4, Live.get()); + fragment_call(ga->get_garbage_collect()); + + a.ldp(sizeReg, TMP3, TMP_MEM1q); + + a.bind(after_gc_check); + + mov_imm(TMP1, header_heap_bin(0)); + a.lsr(TMP4, TMP3, imm(3)); + a.add(TMP1, TMP1, TMP4, arm::lsl(_HEADER_ARITY_OFFS)); + + /* Create the heap binary. */ + a.add(ARG1, HTOP, imm(TAG_PRIMARY_BOXED)); + a.stp(TMP1, sizeReg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + + /* Initialize the erl_bin_state struct. */ + a.stp(HTOP, ZERO, mem_bin_base); + + /* Update HTOP. */ + a.add(HTOP, HTOP, TMP3); + } else { + Uint num_bytes = num_bits / 8; + + comment("allocate heap binary of static size"); + + allocated_size = (num_bytes + 7) & (-8); + + /* Ensure that there is sufficient room on the heap. */ + need = heap_bin_size(num_bytes) + Alloc.get(); + emit_gc_test(ArgWord(0), ArgWord(need), Live); + + mov_imm(TMP1, header_heap_bin(num_bytes)); + mov_imm(TMP2, num_bytes); + + /* Create the heap binary. */ + a.add(ARG1, HTOP, imm(TAG_PRIMARY_BOXED)); + a.stp(TMP1, TMP2, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + + /* Initialize the erl_bin_state struct. */ + ERTS_CT_ASSERT_FIELD_PAIR(struct erl_bits_state, + erts_current_bin_, + erts_bin_offset_); + a.stp(HTOP, ZERO, mem_bin_base); + + /* Update HTOP. 
*/ + a.add(HTOP, HTOP, imm(allocated_size)); + } } else { comment("allocate binary"); mov_arg(ARG5, Alloc); @@ -1780,30 +2759,43 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, load_erl_bits_state(ARG3); load_x_reg_array(ARG2); a.mov(ARG1, c_p); - emit_enter_runtime<Update::eReductions | Update::eStack | - Update::eHeap | Update::eXRegs>(Live.get()); + emit_enter_runtime<Update::eReductions | Update::eHeapAlloc | + Update::eXRegs>(Live.get()); if (sizeReg.isValid()) { comment("(size in bits)"); a.mov(ARG4, sizeReg); runtime_call<6>(beam_jit_bs_init_bits); - } else if (num_bits % 8 == 0) { - comment("(size in bytes)"); - mov_imm(ARG4, num_bits / 8); - runtime_call<6>(beam_jit_bs_init); } else { + allocated_size = (num_bits + 7) / 8; + if (allocated_size <= ERL_ONHEAP_BIN_LIMIT) { + allocated_size = (allocated_size + 7) & (-8); + } mov_imm(ARG4, num_bits); runtime_call<6>(beam_jit_bs_init_bits); } - emit_leave_runtime<Update::eReductions | Update::eStack | - Update::eHeap | Update::eXRegs>(Live.get()); + emit_leave_runtime<Update::eReductions | Update::eHeapAlloc | + Update::eXRegs>(Live.get()); } a.str(ARG1, TMP_MEM1q); + segments = bs_combine_segments(segments); + + /* Keep track of the bit offset from the being of the binary. + * Set to -1 if offset is not known (when a segment of unknown + * size has been seen). */ + Sint bit_offset = 0; + + /* Keep track of whether the current segment is byte-aligned. (A + * segment can be known to be byte-aligned even if the bit offset + * is unknown.) */ + bool is_byte_aligned = true; + /* Build each segment of the binary. 
*/ for (auto seg : segments) { switch (seg.type) { case am_append: case am_private_append: + bit_offset = -1; break; case am_binary: { Uint error_info; @@ -1838,8 +2830,10 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, BSC_REASON_BADARG, BSC_INFO_UNIT, BSC_VALUE_FVALUE); - if (seg.unit == 1) { - comment("skipped test for success because unit =:= 1"); + if (exact_type<BeamTypeId::Bitstring>(seg.src) && + std::gcd(seg.unit, getSizeUnit(seg.src)) == seg.unit) { + comment("skipped test for success because units are " + "compatible"); can_fail = false; } } else { @@ -1847,8 +2841,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, * the value is a non-negative small in the * appropriate range. Multiply the size with the * unit. */ - mov_arg(ARG3, seg.size); - a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE)); + auto r = load_source(seg.size, ARG3); + a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE)); if (seg.unit != 1) { mov_imm(TMP1, seg.unit); a.mul(ARG3, ARG3, TMP1); @@ -1879,8 +2873,8 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, if (seg.effectiveSize >= 0) { mov_imm(ARG3, seg.effectiveSize); } else { - mov_arg(ARG3, seg.size); - a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE)); + auto r = load_source(seg.size, ARG3); + a.asr(ARG3, r.reg, imm(_TAG_IMMED1_SIZE)); if (seg.unit != 1) { mov_imm(TMP1, seg.unit); a.mul(ARG3, ARG3, TMP1); @@ -1904,38 +2898,281 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, emit_branch_if_value(ARG1, resolve_label(error, dispUnknown)); break; case am_integer: - comment("construct integer segment"); - if (seg.effectiveSize >= 0) { - mov_imm(ARG3, seg.effectiveSize); - } else { - mov_arg(ARG3, seg.size); - a.asr(ARG3, ARG3, imm(_TAG_IMMED1_SIZE)); - if (seg.unit != 1) { - mov_imm(TMP1, seg.unit); - a.mul(ARG3, ARG3, TMP1); + switch (seg.action) { + case BscSegment::action::ACCUMULATE_FIRST: + case BscSegment::action::ACCUMULATE: { + /* Shift an integer of known size (no 
more than 64 bits) + * into a word-size accumulator. */ + Label value_is_small = a.newLabel(); + Label done = a.newLabel(); + + comment("accumulate value for integer segment"); + auto src = load_source(seg.src, ARG1); + if (seg.effectiveSize < 64 && + seg.action == BscSegment::action::ACCUMULATE) { + a.lsl(ARG8, ARG8, imm(seg.effectiveSize)); + } + + if (!always_small(seg.src)) { + if (always_one_of<BeamTypeId::Integer, + BeamTypeId::AlwaysBoxed>(seg.src)) { + comment("simplified small test since all other types " + "are boxed"); + emit_is_boxed(value_is_small, seg.src, src.reg); + } else { + a.and_(TMP1, src.reg, imm(_TAG_IMMED1_MASK)); + a.cmp(TMP1, imm(_TAG_IMMED1_SMALL)); + a.b_eq(value_is_small); + } + + /* The value is boxed. If it is a bignum, extract the + * least significant 64 bits. */ + mov_var(ARG1, src); + fragment_call(ga->get_get_sint64_shared()); + if (seg.effectiveSize == 64) { + a.mov(ARG8, ARG1); + } else { + a.bfxil(ARG8, + ARG1, + arm::lsr(0), + imm(seg.effectiveSize)); + } + + if (exact_type<BeamTypeId::Integer>(seg.src)) { + a.b(done); + } else { + a.b_ne(done); + + /* Not a bignum. Signal error. 
*/ + if (Fail.get() == 0) { + mov_imm(ARG4, + beam_jit_update_bsc_reason_info( + seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG1)); + } + a.b(resolve_label(error, disp128MB)); + } + } + + a.bind(value_is_small); + if (seg.effectiveSize == 64) { + a.asr(ARG8, src.reg, imm(_TAG_IMMED1_SIZE)); + } else if (seg.effectiveSize + _TAG_IMMED1_SIZE > 64) { + a.asr(TMP1, src.reg, imm(_TAG_IMMED1_SIZE)); + a.bfxil(ARG8, TMP1, arm::lsr(0), imm(seg.effectiveSize)); + } else { + a.bfxil(ARG8, + src.reg, + arm::lsr(_TAG_IMMED1_SIZE), + imm(seg.effectiveSize)); } + + a.bind(done); + break; } - mov_arg(ARG2, seg.src); - mov_imm(ARG4, seg.flags); - load_erl_bits_state(ARG1); + case BscSegment::action::STORE: { + /* The accumulator is now full or the next segment is + * not possible to accumulate, so it's time to store + * the accumulator to the current position in the + * binary. */ + Label store = a.newLabel(); + Label done = a.newLabel(); + + comment("construct integer segment from accumulator"); + + /* First we'll need to ensure that the value in the + * accumulator is in little endian format. 
*/ + ASSERT(seg.effectiveSize >= 0); + if (seg.effectiveSize % 8) { + Uint complete_bytes = 8 * (seg.effectiveSize / 8); + Uint num_partial = seg.effectiveSize % 8; + if (seg.flags & BSF_LITTLE) { + a.ubfx(TMP1, + ARG8, + imm(complete_bytes), + imm(num_partial)); + a.bfc(ARG8, + arm::lsr(complete_bytes), + imm(64 - complete_bytes)); + a.bfi(ARG8, + TMP1, + imm(complete_bytes + 8 - num_partial), + imm(num_partial)); + } else { + a.lsl(ARG8, ARG8, imm(64 - seg.effectiveSize)); + a.rev64(ARG8, ARG8); + } + } else if ((seg.flags & BSF_LITTLE) == 0) { + switch (seg.effectiveSize) { + case 8: + break; + case 16: + a.rev16(ARG8, ARG8); + break; + case 32: + a.rev32(ARG8, ARG8); + break; + case 64: + a.rev64(ARG8, ARG8); + break; + default: + a.rev64(ARG8, ARG8); + a.lsr(ARG8, ARG8, imm(64 - seg.effectiveSize)); + } + } - emit_enter_runtime(Live.get()); - runtime_call<4>(erts_new_bs_put_integer); - emit_leave_runtime(Live.get()); + arm::Gp bin_offset = ARG3; + arm::Gp bin_data = ARG8; + + update_bin_state(bin_offset, + bit_offset, + seg.effectiveSize, + arm::Gp()); + + if (!is_byte_aligned) { + if (bit_offset < 0) { + /* Bit offset is unknown. Must test alignment. */ + a.ands(bin_offset, bin_offset, imm(7)); + a.b_eq(store); + } else if (bit_offset >= 0) { + /* Alignment is known to be unaligned. */ + mov_imm(bin_offset, bit_offset & 7); + } + + /* Bit offset is tested or known to be unaligned. */ + mov_imm(ARG4, seg.effectiveSize); + fragment_call(ga->get_store_unaligned()); + + if (bit_offset < 0) { + /* The bit offset is unknown, which implies that + * there exists store code that we will need to + * branch past. 
*/ + a.b(done); + } + } - if (exact_type(seg.src, BEAM_TYPE_INTEGER)) { - comment("skipped test for success because construction can't " - "fail"); - } else { - if (Fail.get() == 0) { - mov_arg(ARG3, seg.src); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_TYPE, - BSC_VALUE_ARG3)); + a.bind(store); + + if (bit_offset < 0 || is_byte_aligned) { + /* Bit offset is tested or known to be + * byte-aligned. Emit inline code to store the + * value of the accumulator into the binary. */ + int num_bytes = (seg.effectiveSize + 7) / 8; + + /* If more than one instruction is required for + * doing the store, test whether it would be safe + * to do a single 32 or 64 bit store. */ + switch (num_bytes) { + case 3: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 32) { + comment("simplified complicated store"); + num_bytes = 4; + } + break; + case 5: + case 6: + case 7: + if (bit_offset >= 0 && + allocated_size * 8 - bit_offset >= 64) { + comment("simplified complicated store"); + num_bytes = 8; + } + break; + } + + do { + switch (num_bytes) { + case 1: + a.strb(bin_data.w(), arm::Mem(TMP1)); + break; + case 2: + a.strh(bin_data.w(), arm::Mem(TMP1)); + break; + case 3: + a.strh(bin_data.w(), arm::Mem(TMP1)); + a.lsr(bin_data, bin_data, imm(16)); + a.strb(bin_data.w(), arm::Mem(TMP1, 2)); + break; + case 4: + a.str(bin_data.w(), arm::Mem(TMP1)); + break; + case 5: + case 6: + case 7: + a.str(bin_data.w(), arm::Mem(TMP1).post(4)); + a.lsr(bin_data, bin_data, imm(32)); + break; + case 8: + a.str(bin_data, arm::Mem(TMP1)); + num_bytes = 0; + break; + } + num_bytes -= 4; + } while (num_bytes > 0); + } + + a.bind(done); + break; + } + case BscSegment::action::DIRECT: + /* This segment either has a size exceeding the maximum + * accumulator size of 64 bits or has a variable size. + * + * First load the effective size (size * unit) into ARG3. 
+ */ + comment("construct integer segment"); + if (seg.effectiveSize >= 0) { + mov_imm(ARG3, seg.effectiveSize); + } else { + auto size = load_source(seg.size, TMP1); + a.lsr(ARG3, size.reg, imm(_TAG_IMMED1_SIZE)); + if (Support::isPowerOf2(seg.unit)) { + Uint trailing_bits = Support::ctz<Eterm>(seg.unit); + if (trailing_bits) { + a.lsl(ARG3, ARG3, imm(trailing_bits)); + } + } else { + mov_imm(TMP1, seg.unit); + a.mul(ARG3, ARG3, TMP1); + } + } + + if (is_byte_aligned && seg.src.isSmall() && + seg.src.as<ArgSmall>().getSigned() == 0) { + /* Optimize the special case of setting a known + * byte-aligned segment to zero. */ + comment("optimized setting segment to 0"); + set_zero(seg.effectiveSize); + } else { + /* Call the helper function to fetch and store the + * integer into the binary. */ + mov_arg(ARG2, seg.src); + mov_imm(ARG4, seg.flags); + load_erl_bits_state(ARG1); + + emit_enter_runtime(Live.get()); + runtime_call<4>(erts_new_bs_put_integer); + emit_leave_runtime(Live.get()); + + if (exact_type<BeamTypeId::Integer>(seg.src)) { + comment("skipped test for success because construction " + "can't fail"); + } else { + if (Fail.get() == 0) { + mov_arg(ARG3, seg.src); + mov_imm(ARG4, + beam_jit_update_bsc_reason_info( + seg.error_info, + BSC_REASON_BADARG, + BSC_INFO_TYPE, + BSC_VALUE_ARG3)); + } + a.cbz(ARG1, resolve_label(error, disp1MB)); + } } - a.cbz(ARG1, resolve_label(error, disp1MB)); } break; case am_string: { @@ -1953,27 +3190,12 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, emit_leave_runtime(Live.get()); break; } - case am_utf8: - comment("construct utf8 segment"); - mov_arg(ARG2, seg.src); - load_erl_bits_state(ARG1); - - emit_enter_runtime(Live.get()); - runtime_call<2>(erts_bs_put_utf8); - - emit_leave_runtime(Live.get()); - if (Fail.get() == 0) { - mov_arg(ARG3, seg.src); - mov_imm(ARG4, - beam_jit_update_bsc_reason_info(seg.error_info, - BSC_REASON_BADARG, - BSC_INFO_TYPE, - BSC_VALUE_ARG3)); - } - a.cbz(ARG1, 
resolve_label(error, disp1MB)); + case am_utf8: { + emit_construct_utf8(seg.src, bit_offset, is_byte_aligned); break; + } case am_utf16: - comment("construct utf8 segment"); + comment("construct utf16 segment"); mov_arg(ARG2, seg.src); a.mov(ARG3, seg.flags); load_erl_bits_state(ARG1); @@ -2016,8 +3238,925 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, ASSERT(0); break; } + + /* Try to keep track of the bit offset. */ + if (bit_offset >= 0 && (seg.action == BscSegment::action::DIRECT || + seg.action == BscSegment::action::STORE)) { + if (seg.effectiveSize >= 0) { + bit_offset += seg.effectiveSize; + } else { + bit_offset = -1; + } + } + + /* Try to keep track whether the next segment is byte + * aligned. */ + if (seg.type == am_append || seg.type == am_private_append) { + if (!exact_type<BeamTypeId::Bitstring>(seg.src) || + std::gcd(getSizeUnit(seg.src), 8) != 8) { + is_byte_aligned = false; + } + } else if (bit_offset % 8 == 0) { + is_byte_aligned = true; + } else if (seg.effectiveSize >= 0) { + if (seg.effectiveSize % 8 != 0) { + is_byte_aligned = false; + } + } else if (std::gcd(seg.unit, 8) != 8) { + is_byte_aligned = false; + } } comment("done"); mov_arg(Dst, TMP_MEM1q); } + +/* + * Here follows the bs_match instruction and friends. 
+ */ + +/* + * Presumably one step of a bs_match instruction -- TODO confirm + * against the code that builds and consumes these segments, which + * also defines what each action means and which of the other + * fields it uses. + */ +struct BsmSegment { + /* A default-constructed segment has the action TEST_HEAP, a nil + * live, size 0, unit 1, no flags, and X register 0 as its + * destination. */ + BsmSegment() + : action(action::TEST_HEAP), live(ArgNil()), size(0), unit(1), + flags(0), dst(ArgXRegister(0)){}; + + enum class action { + TEST_HEAP, + ENSURE_AT_LEAST, + ENSURE_EXACTLY, + READ, + EXTRACT_BINARY, + EXTRACT_INTEGER, + GET_INTEGER, + GET_BINARY, + SKIP, + DROP, + GET_TAIL, + EQ + } action; + ArgVal live; + Uint size; + Uint unit; + Uint flags; + ArgRegister dst; +}; + +void BeamModuleAssembler::emit_read_bits(Uint bits, + const arm::Gp bin_base, + const arm::Gp bin_offset, + const arm::Gp bitdata) { + Label handle_partial = a.newLabel(); + Label rev64 = a.newLabel(); + Label shift = a.newLabel(); + Label read_done = a.newLabel(); + + bool need_rev64 = false; + + const arm::Gp bin_byte_ptr = TMP2; + const arm::Gp bit_offset = TMP4; + const arm::Gp tmp = TMP5; + + auto num_partial = bits % 8; + + ASSERT(1 <= bits && bits <= 64); + + a.add(bin_byte_ptr, bin_base, bin_offset, arm::lsr(3)); + + if (bits <= 8) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans two bytes. */ + a.b_ne(handle_partial); + } else if (num_partial > 1) { + /* The segment is smaller than one byte but more than one + * bit. Test whether it fits within the current byte. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* The segment fits in the current byte. */ + a.ldrb(bitdata.w(), arm::Mem(bin_byte_ptr)); + if (num_partial == 0) { + a.rev64(bitdata, bitdata); + a.b(read_done); + } else if (num_partial > 1) { + a.b(rev64); + } + + /* The segment is unaligned and spans two bytes. */ + a.bind(handle_partial); + if (num_partial != 1) { + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + } + need_rev64 = true; + } else if (bits <= 16) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read at least two bytes.
*/ + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); /* Done if segment is byte-aligned. */ + + /* The segment is unaligned. If its size is 9, it always fits + * in two bytes and we fall through to the shift instruction. */ + a.bind(handle_partial); + if (num_partial > 1) { + /* If segment size is less than 15 bits or less, it is + * possible that it fits into two bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + if (num_partial != 1) { + /* The segment spans three bytes. Read an additional byte and + * shift into place (right below the already read two bytes a + * the top of the word). */ + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2)); + a.orr(bitdata, bitdata, tmp, arm::lsl(40)); + } + } else if (bits <= 24) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans four bytes. */ + a.b_ne(handle_partial); + } else if (num_partial > 1) { + /* The segment is smaller than three bytes. Test whether + * it spans three or four bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans three bytes. */ + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2)); + a.orr(bitdata, bitdata, tmp, arm::lsl(16)); + if (num_partial == 0) { + a.rev64(bitdata, bitdata); + a.b(read_done); + } else if (num_partial > 1) { + a.b(rev64); + } + + /* This segment spans four bytes. */ + a.bind(handle_partial); + if (num_partial != 1) { + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + } + need_rev64 = true; + } else if (bits <= 32) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read at least four bytes. 
*/ + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); + + a.bind(handle_partial); + if (num_partial > 0) { + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + if (num_partial != 1) { + /* The segment spans five bytes. Read an additional byte and + * shift into place. */ + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(24)); + } + } else if (bits <= 40) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read four bytes. */ + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans six bytes. */ + a.b_ne(handle_partial); + } else if (num_partial > 1) { + /* The segment is smaller than five bytes. Test whether it + * spans five or six bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans five bytes. Read an additional byte. */ + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(24)); + if (num_partial == 0) { + a.b(read_done); + } else if (num_partial > 1) { + a.b(shift); + } + + a.bind(handle_partial); + if (num_partial != 1) { + /* This segment spans six bytes. Read two additional bytes. 
*/ + a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.rev16(tmp.w(), tmp.w()); + a.orr(bitdata, bitdata, tmp, arm::lsl(16)); + } + } else if (bits <= 48) { + a.ands(bit_offset, bin_offset, imm(7)); + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(32)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); + + a.bind(handle_partial); + if (num_partial > 1) { + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + if (num_partial != 1) { + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 6)); + a.orr(bitdata, bitdata, tmp, arm::lsl(8)); + } + } else if (bits <= 56) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans 8 bytes. */ + a.b_ne(handle_partial); + } else if (num_partial > 1) { + /* The segment is smaller than 8 bytes. Test whether it + * spans 7 or 8 bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans 7 bytes. */ + a.ldr(bitdata, arm::Mem(bin_byte_ptr, -1)); + a.lsr(bitdata, bitdata, imm(8)); + a.b(rev64); + + /* This segment spans 8 bytes. */ + a.bind(handle_partial); + if (num_partial != 1) { + a.ldr(bitdata, arm::Mem(bin_byte_ptr)); + } + need_rev64 = true; + } else if (bits <= 64) { + a.ands(bit_offset, bin_offset, imm(7)); + a.ldr(bitdata, arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + + if (num_partial == 0) { + /* Byte-sized segment. If it is aligned it spans 8 bytes + * and we are done. */ + a.b_eq(read_done); + } else if (num_partial == 1) { + /* This segment is 57 bits wide. It always spans 8 bytes. */ + a.b(shift); + } else { + /* The segment is smaller than 8 bytes. Test whether it + * spans 8 or 9 bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + /* This segments spans 9 bytes. Read an additional byte. 
*/ + a.bind(handle_partial); + if (num_partial != 1) { + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 8)); + a.lsl(bitdata, bitdata, bit_offset); + a.lsl(tmp, tmp, bit_offset); + a.orr(bitdata, bitdata, tmp, arm::lsr(8)); + a.b(read_done); + } + } + + a.bind(rev64); + if (need_rev64) { + a.rev64(bitdata, bitdata); + } + + /* Shift the read data into the most significant bits of the + * word. */ + a.bind(shift); + a.lsl(bitdata, bitdata, bit_offset); + + a.bind(read_done); +} + +void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + Label big = a.newLabel(); + Label done = a.newLabel(); + arm::Gp data_reg; + auto dst = init_destination(Dst, TMP1); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + if (bits <= 8) { + /* Endian does not matter for values that fit in a byte. */ + flags &= ~BSF_LITTLE; + } + + /* If this segment is little-endian, reverse endianness. */ + if ((flags & BSF_LITTLE) != 0) { + comment("reverse endian for a little-endian segment"); + } + data_reg = TMP2; + if ((flags & BSF_LITTLE) == 0) { + data_reg = bitdata; + } else if (bits == 16) { + a.rev16(TMP2, bitdata); + } else if (bits == 32) { + a.rev32(TMP2, bitdata); + } else if (num_partial == 0) { + a.rev64(TMP2, bitdata); + a.lsr(TMP2, TMP2, arm::lsr(64 - bits)); + } else { + a.ubfiz(TMP3, bitdata, imm(num_complete), imm(num_partial)); + a.ubfx(TMP2, bitdata, imm(num_partial), imm(num_complete)); + a.rev64(TMP2, TMP2); + a.orr(TMP2, TMP3, TMP2, arm::lsr(64 - num_complete)); + } + + /* Sign-extend the number if the segment is signed. */ + if ((flags & BSF_SIGNED) != 0) { + if (0 < bits && bits < 64) { + comment("sign extend extracted value"); + a.lsl(TMP2, data_reg, imm(64 - bits)); + a.asr(TMP2, TMP2, imm(64 - bits)); + data_reg = TMP2; + } + } + + /* Handle segments whose values might not fit in a small integer. 
*/ + if (bits >= SMALL_BITS) { + comment("test whether it fits in a small"); + if (bits < 64 && (flags & BSF_SIGNED) == 0) { + a.and_(TMP2, data_reg, imm((1ull << bits) - 1)); + data_reg = TMP2; + } + if ((flags & BSF_SIGNED) != 0) { + /* Signed segment. */ + a.adds(TMP3, ZERO, data_reg, arm::lsr(SMALL_BITS - 1)); + a.ccmp(TMP3, + imm(_TAG_IMMED1_MASK << 1 | 1), + imm(NZCV::kEqual), + imm(arm::CondCode::kNE)); + a.b_ne(big); + } else { + /* Unsigned segment. */ + a.lsr(TMP3, data_reg, imm(SMALL_BITS - 1)); + a.cbnz(TMP3, big); + } + } + + /* Tag and store the extracted small integer. */ + comment("store extracted integer as a small"); + mov_imm(dst.reg, _TAG_IMMED1_SMALL); + if ((flags & BSF_SIGNED) != 0) { + a.orr(dst.reg, dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE)); + } else { + if (bits >= SMALL_BITS) { + a.bfi(dst.reg, + data_reg, + arm::lsl(_TAG_IMMED1_SIZE), + imm(SMALL_BITS)); + } else if (bits != 0) { + a.bfi(dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE), imm(bits)); + } + } + + if (bits >= SMALL_BITS) { + a.b(done); + } + + /* Handle a bignum (up to 64 bits). */ + a.bind(big); + if (bits >= SMALL_BITS) { + comment("store extracted integer as a bignum"); + a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED)); + mov_imm(TMP3, make_pos_bignum_header(1)); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned. */ + a.stp(TMP3, data_reg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + } else { + /* Signed. 
*/ + Label store = a.newLabel(); + a.adds(TMP2, data_reg, ZERO); + a.b_pl(store); + + mov_imm(TMP3, make_neg_bignum_header(1)); + a.neg(TMP2, TMP2); + + a.bind(store); + a.stp(TMP3, TMP2, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + } + } + + a.bind(done); + flush_var(dst); +} + +void BeamModuleAssembler::emit_extract_binary(const arm::Gp bitdata, + Uint bits, + const ArgRegister &Dst) { + auto dst = init_destination(Dst, TMP1); + Uint num_bytes = bits / 8; + + a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED)); + mov_imm(TMP2, header_heap_bin(num_bytes)); + mov_imm(TMP3, num_bytes); + a.rev64(TMP4, bitdata); + a.stp(TMP2, TMP3, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + if (num_bytes != 0) { + a.str(TMP4, arm::Mem(HTOP).post(sizeof(Eterm[1]))); + } + flush_var(dst); +} + +static std::vector<BsmSegment> opt_bsm_segments( + const std::vector<BsmSegment> segments, + const ArgWord &Need, + const ArgWord &Live) { + std::vector<BsmSegment> segs; + + Uint heap_need = Need.get(); + + /* + * First calculate the total number of heap words needed for + * bignums and binaries. 
+ */ + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + if (seg.size >= SMALL_BITS) { + heap_need += BIG_NEED_FOR_BITS(seg.size); + } + break; + case BsmSegment::action::GET_BINARY: + heap_need += heap_bin_size((seg.size + 7) / 8); + break; + case BsmSegment::action::GET_TAIL: + heap_need += EXTRACT_SUB_BIN_HEAP_NEED; + break; + default: + break; + } + } + + int index = 0; + int read_action_pos = -1; + + index = 0; + for (auto seg : segments) { + if (heap_need != 0 && seg.live.isWord()) { + BsmSegment s = seg; + + read_action_pos = -1; + s.action = BsmSegment::action::TEST_HEAP; + s.size = heap_need; + segs.push_back(s); + index++; + heap_need = 0; + } + + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + case BsmSegment::action::GET_BINARY: + if (seg.size > 64) { + read_action_pos = -1; + } else if (seg.action == BsmSegment::action::GET_BINARY && + seg.size % 8 != 0) { + read_action_pos = -1; + } else { + if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + seg.action = BsmSegment::action::EXTRACT_INTEGER; + break; + case BsmSegment::action::GET_BINARY: + seg.action = BsmSegment::action::EXTRACT_BINARY; + break; + default: + break; + } + } + segs.push_back(seg); + break; + case BsmSegment::action::EQ: { + if (read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + index++; + } else { + /* Reuse previous READ action. 
*/ + segs.at(read_action_pos).size += seg.size; + } + auto &prev = segs.back(); + if (prev.action == BsmSegment::action::EQ && + prev.size + seg.size <= 64) { + /* Coalesce with the previous EQ instruction. */ + prev.size += seg.size; + prev.unit = prev.unit << seg.size | seg.unit; + index--; + } else { + segs.push_back(seg); + } + break; + } + case BsmSegment::action::SKIP: + if (read_action_pos >= 0 && + seg.size + segs.at(read_action_pos).size <= 64) { + segs.at(read_action_pos).size += seg.size; + seg.action = BsmSegment::action::DROP; + } else { + read_action_pos = -1; + } + segs.push_back(seg); + break; + default: + read_action_pos = -1; + segs.push_back(seg); + break; + } + index++; + } + + /* Handle a trailing test_heap instruction (for the + * i_bs_match_test_heap instruction). */ + if (heap_need) { + BsmSegment seg; + + seg.action = BsmSegment::action::TEST_HEAP; + seg.size = heap_need; + seg.live = Live; + segs.push_back(seg); + } + return segs; +} + +UWord BeamModuleAssembler::bs_get_flags(const ArgVal &val) { + if (val.isNil()) { + return 0; + } else if (val.isLiteral()) { + Eterm term = beamfile_get_literal(beam, val.as<ArgLiteral>().get()); + UWord flags = 0; + + while (is_list(term)) { + Eterm *consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + case am_native: + flags |= BSF_LITTLE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + return flags; + } else if (val.isWord()) { + /* Originates from bs_get_integer2 instruction. */ + return val.as<ArgWord>().get(); + } else { + ASSERT(0); /* Should not happen. 
*/ + return 0; + } +} + +void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail, + ArgRegister const &Ctx, + Span<ArgVal> const &List) { + emit_i_bs_match_test_heap(Fail, Ctx, ArgWord(0), ArgWord(0), List); +} + +void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, + ArgRegister const &Ctx, + ArgWord const &Need, + ArgWord const &Live, + Span<ArgVal> const &List) { + const int orig_offset = offsetof(ErlBinMatchState, mb.orig); + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + + std::vector<BsmSegment> segments; + + auto current = List.begin(); + auto end = List.begin() + List.size(); + + while (current < end) { + auto cmd = current++->as<ArgImmed>().get(); + BsmSegment seg; + + switch (cmd) { + case am_ensure_at_least: { + seg.action = BsmSegment::action::ENSURE_AT_LEAST; + seg.size = current[0].as<ArgWord>().get(); + seg.unit = current[1].as<ArgWord>().get(); + current += 2; + break; + } + case am_ensure_exactly: { + seg.action = BsmSegment::action::ENSURE_EXACTLY; + seg.size = current[0].as<ArgWord>().get(); + current += 1; + break; + } + case am_binary: + case am_integer: { + auto size = current[2].as<ArgWord>().get(); + auto unit = current[3].as<ArgWord>().get(); + + switch (cmd) { + case am_integer: + seg.action = BsmSegment::action::GET_INTEGER; + break; + case am_binary: + seg.action = BsmSegment::action::GET_BINARY; + break; + } + + seg.live = current[0]; + seg.size = size * unit; + seg.unit = unit; + seg.flags = bs_get_flags(current[1]); + seg.dst = current[4].as<ArgRegister>(); + current += 5; + break; + } + case am_get_tail: { + seg.action = BsmSegment::action::GET_TAIL; + seg.live = current[0].as<ArgWord>(); + seg.dst = current[2].as<ArgRegister>(); + current += 3; + break; + } + case am_skip: { + seg.action = BsmSegment::action::SKIP; + seg.size = current[0].as<ArgWord>().get(); 
+ seg.flags = 0; + current += 1; + break; + } + case am_Eq: { + seg.action = BsmSegment::action::EQ; + seg.live = current[0]; + seg.size = current[1].as<ArgWord>().get(); + seg.unit = current[2].as<ArgWord>().get(); + current += 3; + break; + } + default: + abort(); + break; + } + segments.push_back(seg); + } + + segments = opt_bsm_segments(segments, Need, Live); + + const arm::Gp bin_base = ARG2; + const arm::Gp bin_position = ARG3; + const arm::Gp bin_size = ARG4; + const arm::Gp bitdata = ARG8; + bool position_is_valid = false; + + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::ENSURE_AT_LEAST: { + comment("ensure_at_least %ld %ld", seg.size, seg.unit); + auto ctx_reg = load_source(Ctx, TMP1); + auto stride = seg.size; + auto unit = seg.unit; + + a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset)); + a.ldur(bin_size, emit_boxed_val(ctx_reg.reg, size_offset)); + a.sub(TMP5, bin_size, bin_position); + if (stride != 0) { + cmp(TMP5, stride); + a.b_lo(resolve_beam_label(Fail, disp1MB)); + } + + if (unit != 1) { + if (stride % unit != 0) { + sub(TMP5, TMP5, stride); + } + + if ((unit & (unit - 1)) != 0) { + mov_imm(TMP4, unit); + + a.udiv(TMP3, TMP5, TMP4); + a.msub(TMP5, TMP3, TMP4, TMP5); + + a.cbnz(TMP5, resolve_beam_label(Fail, disp1MB)); + } else { + a.tst(TMP5, imm(unit - 1)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } + } + + position_is_valid = true; + break; + } + case BsmSegment::action::ENSURE_EXACTLY: { + comment("ensure_exactly %ld", seg.size); + auto ctx_reg = load_source(Ctx, TMP1); + auto size = seg.size; + + a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset)); + a.ldur(TMP3, emit_boxed_val(ctx_reg.reg, size_offset)); + if (size != 0) { + a.sub(TMP1, TMP3, bin_position); + cmp(TMP1, size); + } else { + a.subs(TMP1, TMP3, bin_position); + } + a.b_ne(resolve_beam_label(Fail, disp1MB)); + position_is_valid = true; + break; + } + case BsmSegment::action::EQ: { + comment("=:= %ld %ld", 
seg.size, seg.unit); + if (seg.size != 0 && seg.size != 64) { + a.ror(bitdata, bitdata, imm(64 - seg.size)); + } + if (seg.size == 64) { + cmp(bitdata, seg.unit); + } else if (seg.size == 32) { + cmp(bitdata.w(), seg.unit); + } else if (seg.unit == 0) { + a.tst(bitdata, imm((1ull << seg.size) - 1)); + } else { + a.and_(TMP1, bitdata, imm((1ull << seg.size) - 1)); + cmp(TMP1, seg.unit); + } + a.b_ne(resolve_beam_label(Fail, disp1MB)); + break; + } + case BsmSegment::action::TEST_HEAP: { + comment("test_heap %ld", seg.size); + emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); + position_is_valid = false; + break; + } + case BsmSegment::action::READ: { + comment("read %ld", seg.size); + if (seg.size == 0) { + comment("(nothing to do)"); + } else { + auto ctx = load_source(Ctx, ARG1); + + if (!position_is_valid) { + a.ldur(bin_position, + emit_boxed_val(ctx.reg, position_offset)); + position_is_valid = true; + } + a.ldur(bin_base, emit_boxed_val(ctx.reg, base_offset)); + + emit_read_bits(seg.size, bin_base, bin_position, bitdata); + + a.add(bin_position, bin_position, imm(seg.size)); + a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + } + break; + } + case BsmSegment::action::EXTRACT_BINARY: { + auto bits = seg.size; + auto Dst = seg.dst; + + comment("extract binary %ld", bits); + emit_extract_binary(bitdata, bits, Dst); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + break; + } + case BsmSegment::action::EXTRACT_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("extract integer %ld", bits); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + emit_extract_integer(bitdata, flags, bits, Dst); + break; + } + case BsmSegment::action::GET_INTEGER: { + Uint live = seg.live.as<ArgWord>().get(); + Uint flags = seg.flags; + auto bits = seg.size; + auto Dst = seg.dst; + + comment("get integer %ld", bits); + auto ctx = load_source(Ctx, TMP1); + + 
a.mov(ARG1, c_p); + a.mov(ARG2, bits); + a.mov(ARG3, flags); + lea(ARG4, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb))); + + if (bits >= SMALL_BITS) { + emit_enter_runtime<Update::eHeapOnlyAlloc>(live); + } else { + emit_enter_runtime(live); + } + + runtime_call<4>(erts_bs_get_integer_2); + + if (bits >= SMALL_BITS) { + emit_leave_runtime<Update::eHeapOnlyAlloc>(live); + } else { + emit_leave_runtime(live); + } + + mov_arg(Dst, ARG1); + + position_is_valid = false; + break; + } + case BsmSegment::action::GET_BINARY: { + auto Live = seg.live; + comment("get binary %ld", seg.size); + auto ctx = load_source(Ctx, TMP1); + + lea(ARG1, arm::Mem(c_p, offsetof(Process, htop))); + a.ldur(ARG2, emit_boxed_val(ctx.reg, orig_offset)); + a.ldur(ARG3, emit_boxed_val(ctx.reg, base_offset)); + a.ldur(ARG4, emit_boxed_val(ctx.reg, position_offset)); + mov_imm(ARG5, seg.size); + a.add(TMP2, ARG4, ARG5); + a.stur(TMP2, emit_boxed_val(ctx.reg, position_offset)); + + emit_enter_runtime<Update::eHeapOnlyAlloc>( + Live.as<ArgWord>().get()); + + runtime_call<5>(erts_extract_sub_binary); + + emit_leave_runtime<Update::eHeapOnlyAlloc>( + Live.as<ArgWord>().get()); + + mov_arg(seg.dst, ARG1); + position_is_valid = false; + break; + } + case BsmSegment::action::GET_TAIL: { + comment("get_tail"); + + mov_arg(ARG1, Ctx); + fragment_call(ga->get_bs_get_tail_shared()); + mov_arg(seg.dst, ARG1); + position_is_valid = false; + break; + } + case BsmSegment::action::SKIP: { + comment("skip %ld", seg.size); + auto ctx = load_source(Ctx, TMP1); + if (!position_is_valid) { + a.ldur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + position_is_valid = true; + } + add(bin_position, bin_position, seg.size); + a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + break; + } + case BsmSegment::action::DROP: + auto bits = seg.size; + comment("drop %ld", bits); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + break; + } + } +} |