diff options
author | Björn Gustavsson <bjorn@erlang.org> | 2022-08-02 09:58:05 +0200 |
---|---|---|
committer | Björn Gustavsson <bjorn@erlang.org> | 2022-09-02 05:52:15 +0200 |
commit | 4f0ec73674b5c042084b528642185f968f7d9981 (patch) | |
tree | f4ab47cc625dc7fc47a4e7dfd798dbd4a04693ab /erts/emulator/beam | |
parent | f962bc3636d4f8bd25d77ace27294c274c49a6ca (diff) | |
download | erlang-4f0ec73674b5c042084b528642185f968f7d9981.tar.gz |
Optimize binary matching for fixed-width segments
Consider this function:
foo(<<A:6, B:6, C:6, D:6>>) ->
{A, B, C, D}.
The compiler in Erlang/OTP 25 and earlier would generate the following
code for doing the binary matching:
{test,bs_start_match3,{f,1},1,[{x,0}],{x,1}}.
{bs_get_position,{x,1},{x,0},2}.
{test,bs_get_integer2,
{f,3},
2,
[{x,1},
{integer,6},
1,
{field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
{x,2}}.
{test,bs_get_integer2,
{f,3},
3,
[{x,1},
{integer,6},
1,
{field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
{x,3}}.
{test,bs_get_integer2,
{f,3},
4,
[{x,1},
{integer,6},
1,
{field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
{x,4}}.
{test,bs_get_integer2,
{f,3},
5,
[{x,1},
{integer,6},
1,
{field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
{x,5}}.
{test,bs_test_tail2,{f,3},[{x,1},0]}.
That is, there would be one instruction for each segment being
matched. Having separate match instructions for each segment makes it
difficult for the JIT to do any serious optimization. Currently, when
matching a segment with a size that is not a multiple of 8, the JIT
will generate code that calls a helper function. Common sizes such as
8, 16, and 32 are specially optimized with inline code in the x86 JIT
and in the non-JIT BEAM VM.
This commit introduces a new `bs_match` instruction for matching of
integer and binary segments of fixed size. Here is the generated code
for the example:
{test,bs_start_match3,{f,1},1,[{x,0}],{x,1}}.
{bs_get_position,{x,1},{x,0},2}.
{bs_match,{f,3},
{x,1},
{commands,[{ensure_exactly,24},
{integer,2,{literal,[]},6,1,{x,2}},
{integer,3,{literal,[]},6,1,{x,3}},
{integer,4,{literal,[]},6,1,{x,4}},
{integer,5,{literal,[]},6,1,{x,5}}]}}.
Having only one instruction for the matching allows the JIT to
generate faster code. The generated code will do the following:
* Test that the size of the binary being matched is exactly 24 bits.
* Read 24 bits from the binary into a temporary CPU register.
* For each segment, extract the integer from the temporary register
by shifting and masking.
Because of the before-mentioned optimization for certain common
segment sizes, the main part of the Base64 encoding in the `base64`
module is currently implemented in the following non-intuitive way:
encode_binary(<<B1:8, B2:8, B3:8, Ls/bits>>, A) ->
BB = (B1 bsl 16) bor (B2 bsl 8) bor B3,
encode_binary(Ls,
<<A/bits,(b64e(BB bsr 18)):8,
(b64e((BB bsr 12) band 63)):8,
(b64e((BB bsr 6) band 63)):8,
(b64e(BB band 63)):8>>)
With the new optimization, it is now possible to express the Base64
encoding in a more natural way, which is also faster than before:
encode_binary(<<B1:6, B2:6, B3:6, B4:6, Ls/bits>>, A) ->
encode_binary(Ls,
<<A/bits,
(b64e(B1)):8,
(b64e(B2)):8,
(b64e(B3)):8,
(b64e(B4)):8>>)
Diffstat (limited to 'erts/emulator/beam')
-rw-r--r-- | erts/emulator/beam/atom.names | 4 | ||||
-rw-r--r-- | erts/emulator/beam/emu/beam_emu.c | 2 | ||||
-rw-r--r-- | erts/emulator/beam/emu/bs_instrs.tab | 271 | ||||
-rw-r--r-- | erts/emulator/beam/emu/generators.tab | 256 | ||||
-rw-r--r-- | erts/emulator/beam/emu/ops.tab | 56 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/beam_asm.hpp | 14 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/generators.tab | 56 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/instr_bs.cpp | 758 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/ops.tab | 17 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/beam_asm.hpp | 29 | ||||
-rwxr-xr-x | erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl | 1 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/generators.tab | 56 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/instr_bs.cpp | 1162 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/ops.tab | 14 |
14 files changed, 2484 insertions, 212 deletions
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index b04a5f6052..1332f3a0d9 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -256,6 +256,7 @@ atom enable_trace atom enabled atom endian atom env +atom ensure_at_least ensure_exactly atom eof atom eol atom Eq='=:=' @@ -324,6 +325,7 @@ atom get_all_trap atom get_internal_state_blocked atom get_seq_token atom get_size +atom get_tail atom get_tcw atom gather_gc_info_result atom gather_io_bytes @@ -645,6 +647,7 @@ atom set_tcw_fake atom short atom shutdown atom sighup +atom signed atom sigterm atom sigusr1 atom sigusr2 @@ -659,6 +662,7 @@ atom sigtstp atom sigquit atom silent atom size +atom skip atom spawn_executable atom spawn_driver atom spawn_init diff --git a/erts/emulator/beam/emu/beam_emu.c b/erts/emulator/beam/emu/beam_emu.c index dff6a274dc..5130eed5a2 100644 --- a/erts/emulator/beam/emu/beam_emu.c +++ b/erts/emulator/beam/emu/beam_emu.c @@ -313,6 +313,8 @@ void process_main(ErtsSchedulerData *esdp) #endif #endif + Uint bitdata = 0; + Uint64 start_time = 0; /* Monitor long schedule */ ErtsCodePtr start_time_i = NULL; diff --git a/erts/emulator/beam/emu/bs_instrs.tab b/erts/emulator/beam/emu/bs_instrs.tab index fe37b6fe2c..f84cb29805 100644 --- a/erts/emulator/beam/emu/bs_instrs.tab +++ b/erts/emulator/beam/emu/bs_instrs.tab @@ -1853,3 +1853,274 @@ i_bs_start_match3.execute(Live, Fail, Dst) { } %endif + +// +// New instructions introduced in OTP 26 for matching of integers and +// binaries of fixed sizes follow. +// + +// +// i_bs_ensure_bits Ctx Size Fail +// + +i_bs_ensure_bits := i_bs_ensure_bits.fetch.execute; + +i_bs_ensure_bits.head() { + Eterm context; +} + +i_bs_ensure_bits.fetch(Src) { + context = $Src; +} + +i_bs_ensure_bits.execute(NumBits, Fail) { + ErlBinMatchBuffer* mb = ms_matchbuffer(context); + Uint size = $NumBits; + if (mb->size - mb->offset < size) { + $FAIL($Fail); + } +} + +// +// i_bs_ensure_bits_unit Ctx Size Unit Fail +// + +i_bs_ensure_bits_unit := i_bs_ensure_bits_unit.fetch.execute; + +i_bs_ensure_bits_unit.head() { + Eterm context; +} + +i_bs_ensure_bits_unit.fetch(Src) { + context = $Src; +} + +i_bs_ensure_bits_unit.execute(NumBits, Unit, Fail) { + ErlBinMatchBuffer* mb = ms_matchbuffer(context); + Uint size = $NumBits; + Uint diff; + + if ((diff = mb->size - mb->offset) < size) { + $FAIL($Fail); + } + if ((diff - size) % $Unit != 0) { + $FAIL($Fail); + } +} + +// +// i_bs_read_bits Ctx Size +// i_bs_ensure_bits_read Ctx Size Fail +// + +i_bs_read_bits := i_bs_read_bits.fetch.execute; +i_bs_ensure_bits_read := i_bs_read_bits.fetch.ensure_bits.execute; + +i_bs_read_bits.head() { + ErlBinMatchBuffer* mb; + Uint size; +} + +i_bs_read_bits.fetch(Src, NumBits) { + mb = ms_matchbuffer($Src); + size = $NumBits; +} + +i_bs_read_bits.ensure_bits(Fail) { + if (mb->size - mb->offset < size) { + $FAIL($Fail); + } +} + +i_bs_read_bits.execute() { + byte *byte_ptr; + Uint bit_offset = mb->offset % 8; + Uint num_bytes_to_read = (size + 7) / 8; + Uint num_partial = size % 8; + + if ((num_partial == 0 && bit_offset != 0) || + (num_partial != 0 && bit_offset > 8 - num_partial)) { + num_bytes_to_read++; + } + + bitdata = 0; + byte_ptr = mb->base + (mb->offset >> 3); + mb->offset += size; + switch (num_bytes_to_read) { +#ifdef ARCH_64 + case 9: + case 8: + bitdata = bitdata << 8 | *byte_ptr++; + case 7: + bitdata = bitdata << 8 | *byte_ptr++; + case 6: + bitdata = bitdata << 8 | *byte_ptr++; + case 5: + bitdata = bitdata << 8 | *byte_ptr++; +#else + case 5: +#endif + case 4: + bitdata = bitdata << 8 | *byte_ptr++; + case 3: + bitdata = bitdata << 8 | *byte_ptr++; + case 2: + bitdata = bitdata << 8 | *byte_ptr++; + case 1: + bitdata = bitdata << 8 | *byte_ptr++; + } + + if (num_bytes_to_read <= sizeof(Uint)) { + bitdata <<= 8 * (sizeof(Uint) - num_bytes_to_read) + bit_offset; + } else { + bitdata <<= bit_offset; + bitdata = bitdata | ((*byte_ptr << bit_offset) >> 8); + } +} + +// i_bs_extract_integer Size Dst +i_bs_extract_integer(NumBits, Dst) { + Uint size = $NumBits; + Eterm result; + + result = bitdata >> (8 * sizeof(Uint) - size); + result = make_small(result); + bitdata <<= size; + $Dst = result; +} + +// i_bs_read_integer_8 Ctx Dst +i_bs_read_integer_8(Ctx, Dst) { + ErlBinMatchBuffer* mb = ms_matchbuffer($Ctx); + byte *byte_ptr; + Uint bit_offset = mb->offset % 8; + Eterm result; + + byte_ptr = mb->base + (mb->offset >> 3); + mb->offset += 8; + result = byte_ptr[0]; + if (bit_offset != 0) { + result = result << 8 | byte_ptr[1]; + result = ((result << bit_offset) >> 8) & 0xff; + } + result = make_small(result); + $Dst = result; +} + +// +// i_bs_get_fixed_integer Ctx Size Flags Dst +// + +i_bs_get_fixed_integer := i_bs_get_fixed_integer.fetch.execute; + +i_bs_get_fixed_integer.head() { + Eterm context; +} + +i_bs_get_fixed_integer.fetch(Src) { + context = $Src; +} + +i_bs_get_fixed_integer.execute(Size, Flags, Dst) { + ErlBinMatchBuffer* mb; + Uint size = $Size; + Eterm result; + + mb = ms_matchbuffer(context); + LIGHT_SWAPOUT; + result = erts_bs_get_integer_2(c_p, size, $Flags, mb); + LIGHT_SWAPIN; + HEAP_SPACE_VERIFIED(0); + $Dst = result; +} + +// +// i_get_fixed_binary Ctx Size Dst +// + +i_bs_get_fixed_binary := i_bs_get_fixed_binary.fetch.execute; + +i_bs_get_fixed_binary.head() { + Eterm context; +} + +i_bs_get_fixed_binary.fetch(Src) { + context = $Src; +} + +i_bs_get_fixed_binary.execute(Size, Dst) { + ErlBinMatchBuffer* mb; + Uint size = $Size; + Eterm* htop; + Eterm result; + + ASSERT(header_is_bin_matchstate(*boxed_val(context))); + + htop = HTOP; + mb = ms_matchbuffer(context); + result = erts_extract_sub_binary(&htop, mb->orig, mb->base, + mb->offset, size); + HTOP = htop; + + mb->offset += size; + + $Dst = result; +} + +// +// i_get_tail Ctx Dst +// + +i_bs_get_tail := i_bs_get_tail.fetch.execute; + +i_bs_get_tail.head() { + Eterm context; +} + +i_bs_get_tail.fetch(Src) { + context = $Src; +} + +i_bs_get_tail.execute(Dst) { + ErlBinMatchBuffer* mb; + Eterm* htop; + Eterm result; + + ASSERT(header_is_bin_matchstate(*boxed_val(context))); + + htop = HTOP; + mb = ms_matchbuffer(context); + result = erts_extract_sub_binary(&htop, mb->orig, mb->base, + mb->offset, mb->size - mb->offset); + HTOP = htop; + + $Dst = result; +} + +// +// i_bs_skip Ctx Size +// + +i_bs_skip := i_bs_skip.fetch.execute; + +i_bs_skip.head() { + Eterm context; +} + +i_bs_skip.fetch(Src) { + context = $Src; +} + +i_bs_skip.execute(Size) { + ErlBinMatchBuffer* mb; + Uint size = $Size; + + ASSERT(header_is_bin_matchstate(*boxed_val(context))); + mb = ms_matchbuffer(context); + mb->offset += size; +} + +// i_bs_drop Size +i_bs_drop(Size) { + bitdata <<= $Size; +} diff --git a/erts/emulator/beam/emu/generators.tab b/erts/emulator/beam/emu/generators.tab index 13ae1b30fa..535e5479c6 100644 --- a/erts/emulator/beam/emu/generators.tab +++ b/erts/emulator/beam/emu/generators.tab @@ -1074,3 +1074,259 @@ gen.update_record(Size, Src, Dst, N, Updates) { return begin; } + +gen.bs_match(Fail, Ctx, N, List) { + BeamOp* first_op = 0; + BeamOp** next_ptr = &first_op; + BeamOp* test_heap_op = 0; + BeamOp* read_op = 0; + int src; + + src = 0; + while (src < N.val) { + Uint unit; + Uint size; + Uint words_needed; + BeamOp* op; + + /* Calculate the number of heap words needed for this + * instruction. */ + words_needed = 0; + switch (List[src].val) { + case am_binary: + ASSERT(List[src+3].type == TAG_u); + ASSERT(List[src+4].type == TAG_u); + size = List[src+3].val * List[src+4].val; + words_needed = heap_bin_size((size + 7) / 8); + break; + case am_integer: + ASSERT(List[src+3].type == TAG_u); + ASSERT(List[src+4].type == TAG_u); + size = List[src+3].val * List[src+4].val; + if (size >= SMALL_BITS) { + words_needed = BIG_NEED_FOR_BITS(size); + } + break; + case am_get_tail: + words_needed = EXTRACT_SUB_BIN_HEAP_NEED; + break; + } + + /* Emit a test_heap instrution if needed and there is + * no previous one. */ + if (words_needed && test_heap_op == 0) { + $NewBeamOp(S, test_heap_op); + $BeamOpNameArity(test_heap_op, test_heap, 2); + + test_heap_op->a[0].type = TAG_u; + test_heap_op->a[0].val = 0; /* Number of heap words */ + + ASSERT(List[src+1].type == TAG_u); + test_heap_op->a[1] = List[src+1]; /* Live */ + + *next_ptr = test_heap_op; + next_ptr = &test_heap_op->next; + } + + if (words_needed) { + test_heap_op->a[0].val += words_needed; + } + + /* Translate this sub-instruction to a BEAM instruction. */ + op = 0; + switch (List[src].val) { + case am_ensure_at_least: { + Uint size = List[src+1].val; + unit = List[src+2].val; + if (size != 0 && unit == 1) { + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_ensure_bits, 3); + op->a[0] = Ctx; + op->a[1].type = TAG_u; + op->a[1].val = size; + op->a[2] = Fail; + } else if (size != 0 && unit != 1) { + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_ensure_bits_unit, 4); + + op->a[0] = Ctx; + op->a[1].type = TAG_u; + op->a[1].val = size; /* Size */ + op->a[2].type = TAG_u; + op->a[2].val = unit; /* Unit */ + op->a[3] = Fail; + } else if (size == 0 && unit != 1) { + $NewBeamOp(S, op); + $BeamOpNameArity(op, bs_test_unit, 3); + + op->a[0] = Fail; + op->a[1] = Ctx; + op->a[2].type = TAG_u; + op->a[2].val = unit; + } else if (size == 0 && unit == 1) { + /* This test is redundant because it always + * succeeds. This should only happen for unoptimized + * code. Generate a dummy instruction to ensure that + * we don't trigger the sanity check at the end of + * this generator. */ + $NewBeamOp(S, op); + $BeamOpNameArity(op, delete_me, 0); + } + src += 3; + break; + } + case am_ensure_exactly: { + $NewBeamOp(S, op); + $BeamOpNameArity(op, bs_test_tail2, 3); + + op->a[0] = Fail; + op->a[1] = Ctx; + op->a[2]= List[src+1]; /* Size */ + + src += 2; + break; + } + case am_binary: { + ASSERT(List[src+3].type == TAG_u); + ASSERT(List[src+4].type == TAG_u); + size = List[src+3].val; + unit = List[src+4].val; + + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_get_fixed_binary, 3); + + op->a[0] = Ctx; + op->a[1].type = TAG_u; + op->a[1].val = size * unit; /* Size */ + op->a[2] = List[src+5]; /* Dst */ + + read_op = 0; + src += 6; + break; + } + case am_integer: { + Uint flags = 0; + BeamOpArg Flags; + + /* Translate flags. */ + Flags = List[src+2]; + if (Flags.type == TAG_n) { + Flags.type = TAG_u; + Flags.val = 0; + } else if (Flags.type == TAG_q) { + Eterm term = beamfile_get_literal(&S->beam, Flags.val); + while (is_list(term)) { + Eterm* consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + flags |= BSF_LITTLE; + break; + case am_native: + flags |= BSF_NATIVE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + Flags.type = TAG_u; + Flags.val = flags; + $NativeEndian(Flags); + } + + ASSERT(List[src+3].type == TAG_u); + ASSERT(List[src+4].type == TAG_u); + size = List[src+3].val * List[src+4].val; + +#define READ_OP_SIZE 1 + if (size < SMALL_BITS && flags == 0) { + /* This is a suitable segment -- an unsigned big + * endian integer that fits in a small. */ + if (read_op == 0 || read_op->a[READ_OP_SIZE].val + size > 8*sizeof(Uint)) { + /* There is either no previous i_bs_read_bits instruction or + * size of this segment don't fit into it. */ + $NewBeamOp(S, read_op); + $BeamOpNameArity(read_op, i_bs_read_bits, 2); + + read_op->a[0] = Ctx; + read_op->a[1].type = TAG_u; + read_op->a[1].val = 0; + + *next_ptr = read_op; + next_ptr = &read_op->next; + } + + read_op->a[READ_OP_SIZE].val += size; + + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_extract_integer, 2); + op->a[0].type = TAG_u; + op->a[0].val = size; + op->a[1] = List[src+5]; /* Dst */ + } else { + /* Little endian, signed, or might not fit in a small. */ + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_get_fixed_integer, 4); + + op->a[0] = Ctx; + op->a[1].type = TAG_u; + op->a[1].val = size; /* Size */ + op->a[2] = Flags; /* Flags */ + op->a[3] = List[src+5]; /* Dst */ + + read_op = 0; + } + + src += 6; + break; + } + case am_get_tail: + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_get_tail, 2); + + op->a[0] = Ctx; + op->a[1] = List[src+3]; /* Dst */ + + read_op = 0; + src += 4; + break; + case am_skip: + ASSERT(List[src+1].type == TAG_u); + size = List[src+1].val; + + $NewBeamOp(S, op); + + if (read_op && read_op->a[READ_OP_SIZE].val + size <= 8*sizeof(Uint)) { + read_op->a[READ_OP_SIZE].val += size; + $BeamOpNameArity(op, i_bs_drop, 1); + op->a[0] = List[src+1]; /* Size */ + } else { + $BeamOpNameArity(op, i_bs_skip, 2); + op->a[0] = Ctx; + op->a[1] = List[src+1]; /* Size */ + read_op = 0; + } + + src += 2; + break; + default: + abort(); + } + + if (op) { + *next_ptr = op; + next_ptr = &op->next; + } + } + + if (first_op == 0) { + erts_exit(ERTS_ERROR_EXIT, "loading bs_match in %T:%T/%d: no instructions loaded", + S->module, S->function, S->arity); + } + + ASSERT(first_op); + return first_op; +} diff --git a/erts/emulator/beam/emu/ops.tab b/erts/emulator/beam/emu/ops.tab index 2d670cbb80..90ff4af92f 100644 --- a/erts/emulator/beam/emu/ops.tab +++ b/erts/emulator/beam/emu/ops.tab @@ -1106,11 +1106,65 @@ is_function Fail=f c => jump Fail func_info M F A => i_func_info u M F A # ================================================================ -# New bit syntax matching (R11B). +# New bit syntax matching for fixed sizes (from OTP 26). # ================================================================ %warm +bs_match Fail Ctx Size Rest=* => bs_match(Fail, Ctx, Size, Rest) + +# The bs_match generator breaks the bs_match instruction into +# the instructions that follow. + +i_bs_ensure_bits xy I f +i_bs_ensure_bits_unit xy I t f + +i_bs_read_bits xy t + +i_bs_extract_integer t d + +i_bs_read_bits Ctx=x u==8 | i_bs_extract_integer u==8 Dst=x => + i_bs_read_integer_8 Ctx Dst + +i_bs_read_integer_8 x x + +i_bs_get_fixed_integer xy I t d + +i_bs_get_fixed_binary xy I d + +i_bs_get_tail xy d + +i_bs_skip xy I + +i_bs_drop I + +i_bs_ensure_bits Ctx1 Size1 Fail | i_bs_read_bits Ctx2 Size2 | + equal(Ctx1, Ctx2) | equal(Size1, Size2) => + i_bs_ensure_bits_read Ctx1 Size1 Fail + +i_bs_ensure_bits_read xy t f + +# Optimize extraction of a single segment for some popular sizes. + +i_bs_ensure_bits Ctx1 u==8 Fail | i_bs_read_bits Ctx2 u==8 | + i_bs_extract_integer u==8 Dst=x | equal(Ctx1, Ctx2) => + i_bs_get_integer_8 Ctx1 Fail Dst + +i_bs_ensure_bits Ctx1 u==16 Fail | i_bs_read_bits Ctx2 u==16 | + i_bs_extract_integer u==16 Dst=x | equal(Ctx1, Ctx2) => + i_bs_get_integer_16 Ctx1 Fail Dst + +%if ARCH_64 +i_bs_ensure_bits Ctx1 u==32 Fail | i_bs_read_bits Ctx2 u==32 | + i_bs_extract_integer u==32 Dst=x | equal(Ctx1, Ctx2) => + i_bs_get_integer_32 Ctx1 Fail Dst +%endif + + +# ================================================================ +# Bit syntax matching (from R11B). +# ================================================================ + # Matching integers bs_match_string Fail Ms Bits Val => i_bs_match_string Ms Fail Bits Val diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index b76b92368c..e2946fdbc7 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -1323,6 +1323,20 @@ protected: arm::Gp size_reg); void set_zero(Sint effectiveSize); + void emit_read_bits(Uint bits, + const arm::Gp bin_offset, + const arm::Gp bin_base, + const arm::Gp bitdata); + + void emit_extract_integer(const arm::Gp bitdata, + Uint flags, + Uint bits, + const ArgRegister &Dst); + + void emit_extract_binary(const arm::Gp bitdata, + Uint bits, + const ArgRegister &Dst); + void emit_raise_exception(); void emit_raise_exception(const ErtsCodeMFA *exp); void emit_raise_exception(Label I, const ErtsCodeMFA *exp); diff --git a/erts/emulator/beam/jit/arm/generators.tab b/erts/emulator/beam/jit/arm/generators.tab index 3df71180db..63916ce130 100644 --- a/erts/emulator/beam/jit/arm/generators.tab +++ b/erts/emulator/beam/jit/arm/generators.tab @@ -606,3 +606,59 @@ gen.create_bin(Fail, Alloc, Live, Unit, Dst, N, Segments) { return op; } + +gen.bs_match(Fail, Ctx, N, List) { + BeamOp* op; + int fixed_args; + int i; + + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_match, 2); + fixed_args = op->arity; + $BeamOpArity(op, (N.val + fixed_args)); + + op->a[0] = Fail; + op->a[1] = Ctx; + + for (i = 0; i < N.val; i++) { + BeamOpArg current; + Uint flags = 0; + + current = List[i]; + if (current.type == TAG_n) { + current.type = TAG_u; + current.val = 0; + } else if (current.type == TAG_q) { + Eterm term = beamfile_get_literal(&S->beam, current.val); + while (is_list(term)) { + Eterm* consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + flags |= BSF_LITTLE; + break; + case am_native: + flags |= BSF_NATIVE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + current.type = TAG_u; + current.val = flags; + $NativeEndian(current); + } else if (current.type == TAG_o) { + /* An overflow tag (in ensure_at_least or ensure_exactly) + * means that the match will always fail. */ + $BeamOpNameArity(op, jump, 1); + op->a[0] = Fail; + return op; + } + op->a[i+fixed_args] = current; + } + + return op; +} diff --git a/erts/emulator/beam/jit/arm/instr_bs.cpp b/erts/emulator/beam/jit/arm/instr_bs.cpp index ac57063b4f..e23b09fa80 100644 --- a/erts/emulator/beam/jit/arm/instr_bs.cpp +++ b/erts/emulator/beam/jit/arm/instr_bs.cpp @@ -2611,3 +2611,761 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, comment("done"); mov_arg(Dst, TMP_MEM1q); } + +/* + * Here follows the bs_match instruction and friends. + */ + +struct BsmSegment { + BsmSegment() + : action(action::TEST_HEAP), live(ArgNil()), size(0), unit(1), + flags(0), dst(ArgXRegister(0)){}; + + enum class action { + TEST_HEAP, + ENSURE_AT_LEAST, + ENSURE_EXACTLY, + READ, + EXTRACT_BINARY, + EXTRACT_INTEGER, + GET_INTEGER, + GET_BINARY, + SKIP, + DROP, + GET_TAIL + } action; + ArgVal live; + Uint size; + Uint unit; + Uint flags; + ArgRegister dst; +}; + +void BeamModuleAssembler::emit_read_bits(Uint bits, + const arm::Gp bin_base, + const arm::Gp bin_offset, + const arm::Gp bitdata) { + Label handle_partial = a.newLabel(); + Label shift = a.newLabel(); + Label read_done = a.newLabel(); + + const arm::Gp bin_byte_ptr = TMP2; + const arm::Gp bit_offset = TMP4; + const arm::Gp tmp = TMP5; + + auto num_partial = bits % 8; + + ASSERT(1 <= bits && bits <= 64); + + a.add(bin_byte_ptr, bin_base, bin_offset, arm::lsr(3)); + + if (bits == 1) { + a.and_(bit_offset, bin_offset, imm(7)); + a.ldrb(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + + a.bind(handle_partial); /* Not used, but must bind. */ + } else if (bits <= 8) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans two bytes. */ + a.b_ne(handle_partial); + } else { + /* Segment smaller than one byte. Test whether the segment + * fits within the current byte. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* The segment fits in the current byte. */ + a.ldrb(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + a.b(num_partial ? shift : read_done); + + /* The segment is unaligned and spans two bytes. */ + a.bind(handle_partial); + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + } else if (bits <= 16) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read at least two bytes. */ + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); /* Done if segment is byte-aligned. */ + + /* The segment is unaligned. */ + a.bind(handle_partial); + if (num_partial != 0) { + /* If segment size is less than 15 bits or less, it is + * possible that it fits into two bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + /* The segment spans three bytes. Read an additional byte and + * shift into place (right below the already read two bytes a + * the top of the word). */ + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2)); + a.orr(bitdata, bitdata, tmp, arm::lsl(40)); + } else if (bits <= 24) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans four bytes. */ + a.b_ne(handle_partial); + } else { + /* The segment is smaller than three bytes. Test whether + * it spans three or four bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans three bytes. */ + a.ldrh(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 2)); + a.orr(bitdata, bitdata, tmp, arm::lsl(16)); + a.rev64(bitdata, bitdata); + a.b(num_partial ? shift : read_done); + + /* This segment spans four bytes. */ + a.bind(handle_partial); + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + } else if (bits <= 32) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read at least four bytes. */ + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); + + a.bind(handle_partial); + if (num_partial != 0) { + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(24)); + } else if (bits <= 40) { + a.ands(bit_offset, bin_offset, imm(7)); + + /* We always need to read four bytes. */ + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans six bytes. */ + a.b_ne(handle_partial); + } else { + /* The segment is smaller than five bytes. Test whether it + * spans five or six bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans five bytes. Read an additional byte. */ + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(24)); + a.b(num_partial ? shift : read_done); + + /* This segment spans six bytes. Read two additional bytes. */ + a.bind(handle_partial); + a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.rev16(tmp.w(), tmp.w()); + a.orr(bitdata, bitdata, tmp, arm::lsl(16)); + } else if (bits <= 48) { + a.ands(bit_offset, bin_offset, imm(7)); + a.ldr(bitdata.w(), arm::Mem(bin_byte_ptr)); + a.ldrh(tmp.w(), arm::Mem(bin_byte_ptr, 4)); + a.orr(bitdata, bitdata, tmp, arm::lsl(32)); + a.rev64(bitdata, bitdata); + a.b_eq(read_done); + + a.bind(handle_partial); + if (num_partial != 0) { + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 6)); + a.orr(bitdata, bitdata, tmp, arm::lsl(8)); + } else if (bits <= 56) { + a.ands(bit_offset, bin_offset, imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans 8 bytes. */ + a.b_ne(handle_partial); + } else { + /* The segment is smaller than 8 bytes. Test whether it + * spans 7 or 8 bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_gt(handle_partial); + } + + /* This segment spans 7 bytes. */ + a.ldr(bitdata, arm::Mem(bin_byte_ptr, -1)); + a.lsr(bitdata, bitdata, imm(8)); + a.rev64(bitdata, bitdata); + a.b(shift); + + /* This segment spans 8 bytes. */ + a.bind(handle_partial); + a.ldr(bitdata, arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + } else if (bits <= 64) { + a.ands(bit_offset, bin_offset, imm(7)); + a.ldr(bitdata, arm::Mem(bin_byte_ptr)); + a.rev64(bitdata, bitdata); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, + * this segment always spans 8 bytes. */ + a.b_eq(read_done); + } else { + /* The segment is smaller than 8 bytes. Test whether it + * spans 8 or 9 bytes. */ + a.cmp(bit_offset, imm(8 - num_partial)); + a.b_le(shift); + } + + /* This segments spans 9 bytes. Read an additional byte. */ + a.bind(handle_partial); + a.ldrb(tmp.w(), arm::Mem(bin_byte_ptr, 8)); + a.lsl(bitdata, bitdata, bit_offset); + a.lsl(tmp, tmp, bit_offset); + a.orr(bitdata, bitdata, tmp, arm::lsr(8)); + a.b(read_done); + } + + /* Shift the read data into the most significant bits of the + * word. */ + a.bind(shift); + a.lsl(bitdata, bitdata, bit_offset); + + a.bind(read_done); +} + +void BeamModuleAssembler::emit_extract_integer(const arm::Gp bitdata, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + Label big = a.newLabel(); + Label done = a.newLabel(); + arm::Gp data_reg; + auto dst = init_destination(Dst, TMP1); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + if (bits <= 8) { + /* Endian does not matter for values that fit in a byte. */ + flags &= ~BSF_LITTLE; + } + + /* If this segment is little-endian, reverse endianness. */ + if ((flags & BSF_LITTLE) != 0) { + comment("reverse endian for a little-endian segment"); + } + data_reg = TMP2; + if ((flags & BSF_LITTLE) == 0) { + data_reg = bitdata; + } else if (bits == 16) { + a.rev16(TMP2, bitdata); + } else if (bits == 32) { + a.rev32(TMP2, bitdata); + } else if (num_partial == 0) { + a.rev64(TMP2, bitdata); + a.lsr(TMP2, TMP2, arm::lsr(64 - bits)); + } else { + a.ubfiz(TMP3, bitdata, imm(num_complete), imm(num_partial)); + a.ubfx(TMP2, bitdata, imm(num_partial), imm(num_complete)); + a.rev64(TMP2, TMP2); + a.orr(TMP2, TMP3, TMP2, arm::lsr(64 - num_complete)); + } + + /* Sign-extend the number if the segment is signed. */ + if ((flags & BSF_SIGNED) != 0) { + if (bits < 64) { + comment("sign extend extracted value"); + a.lsl(TMP2, data_reg, imm(64 - bits)); + a.asr(TMP2, TMP2, imm(64 - bits)); + data_reg = TMP2; + } + } + + /* Handle segments whose values might not fit in a small integer. */ + if (bits >= SMALL_BITS) { + comment("test whether it fits in a small"); + if (bits < 64 && (flags & BSF_SIGNED) == 0) { + a.and_(TMP2, data_reg, imm((1ull << bits) - 1)); + data_reg = TMP2; + } + if ((flags & BSF_SIGNED) != 0) { + /* Signed segment. */ + a.adds(TMP3, ZERO, data_reg, arm::lsr(SMALL_BITS - 1)); + a.ccmp(TMP3, + imm(_TAG_IMMED1_MASK << 1 | 1), + imm(NZCV::kEqual), + imm(arm::CondCode::kNE)); + a.b_ne(big); + } else { + /* Unsigned segment. */ + a.lsr(TMP3, data_reg, imm(SMALL_BITS - 1)); + a.cbnz(TMP3, big); + } + } + + /* Tag and store the extracted small integer. */ + comment("store extracted integer as a small"); + mov_imm(dst.reg, _TAG_IMMED1_SMALL); + if ((flags & BSF_SIGNED) != 0) { + a.orr(dst.reg, dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE)); + } else { + if (bits >= SMALL_BITS) { + a.bfi(dst.reg, + data_reg, + arm::lsl(_TAG_IMMED1_SIZE), + imm(SMALL_BITS)); + } else { + a.bfi(dst.reg, data_reg, arm::lsl(_TAG_IMMED1_SIZE), imm(bits)); + } + } + + if (bits >= SMALL_BITS) { + a.b(done); + } + + /* Handle a bignum (up to 64 bits). */ + a.bind(big); + if (bits >= SMALL_BITS) { + comment("store extracted integer as a bignum"); + a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED)); + mov_imm(TMP3, make_pos_bignum_header(1)); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned. */ + a.stp(TMP3, data_reg, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + } else { + /* Signed. */ + Label store = a.newLabel(); + a.adds(TMP2, data_reg, ZERO); + a.b_pl(store); + + mov_imm(TMP3, make_neg_bignum_header(1)); + a.neg(TMP2, TMP2); + + a.bind(store); + a.stp(TMP3, TMP2, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + } + } + + a.bind(done); + flush_var(dst); +} + +void BeamModuleAssembler::emit_extract_binary(const arm::Gp bitdata, + Uint bits, + const ArgRegister &Dst) { + auto dst = init_destination(Dst, TMP1); + Uint num_bytes = bits / 8; + + a.add(dst.reg, HTOP, imm(TAG_PRIMARY_BOXED)); + mov_imm(TMP2, header_heap_bin(num_bytes)); + mov_imm(TMP3, num_bytes); + a.rev64(TMP4, bitdata); + a.stp(TMP2, TMP3, arm::Mem(HTOP).post(sizeof(Eterm[2]))); + a.str(TMP4, arm::Mem(HTOP).post(sizeof(Eterm[1]))); + flush_var(dst); +} + +static std::vector<BsmSegment> opt_bsm_segments( + const std::vector<BsmSegment> segments, + const ArgWord &Need, + const ArgWord &Live) { + std::vector<BsmSegment> segs; + + Uint heap_need = Need.get(); + + /* + * First calculate the total number of heap words needed for + * bignums and binaries. + */ + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + if (seg.size >= SMALL_BITS) { + heap_need += BIG_NEED_FOR_BITS(seg.size); + } + break; + case BsmSegment::action::GET_BINARY: + heap_need += heap_bin_size((seg.size + 7) / 8); + break; + case BsmSegment::action::GET_TAIL: + heap_need += EXTRACT_SUB_BIN_HEAP_NEED; + break; + default: + break; + } + } + + int index = 0; + int read_action_pos = -1; + + index = 0; + for (auto seg : segments) { + if (heap_need != 0 && seg.live.isWord()) { + BsmSegment s = seg; + + s.action = BsmSegment::action::TEST_HEAP; + s.size = heap_need; + segs.push_back(s); + index++; + heap_need = 0; + } + + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + case BsmSegment::action::GET_BINARY: + if (seg.size > 64) { + read_action_pos = -1; + } else if (seg.action == BsmSegment::action::GET_BINARY && + seg.size % 8 != 0) { + read_action_pos = -1; + } else { + if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + seg.action = BsmSegment::action::EXTRACT_INTEGER; + break; + case BsmSegment::action::GET_BINARY: + seg.action = BsmSegment::action::EXTRACT_BINARY; + break; + default: + break; + } + } + segs.push_back(seg); + break; + case BsmSegment::action::SKIP: + if (read_action_pos >= 0 && + seg.size + segs.at(read_action_pos).size <= 64) { + segs.at(read_action_pos).size += seg.size; + seg.action = BsmSegment::action::DROP; + } else { + read_action_pos = -1; + } + segs.push_back(seg); + break; + default: + read_action_pos = -1; + segs.push_back(seg); + break; + } + index++; + } + + /* Handle a trailing test_heap instruction (for the + * i_bs_match_test_heap instruction). */ + if (heap_need) { + BsmSegment seg; + + seg.action = BsmSegment::action::TEST_HEAP; + seg.size = heap_need; + seg.live = Live; + segs.push_back(seg); + } + return segs; +} + +void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail, + ArgRegister const &Ctx, + Span<ArgVal> const &List) { + emit_i_bs_match_test_heap(Fail, Ctx, ArgWord(0), ArgWord(0), List); +} + +void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, + ArgRegister const &Ctx, + ArgWord const &Need, + ArgWord const &Live, + Span<ArgVal> const &List) { + const int orig_offset = offsetof(ErlBinMatchState, mb.orig); + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + + std::vector<BsmSegment> segments; + + auto current = List.begin(); + auto end = List.begin() + List.size(); + + while (current < end) { + auto cmd = current++->as<ArgImmed>().get(); + BsmSegment seg; + + switch (cmd) { + case am_ensure_at_least: { + seg.action = BsmSegment::action::ENSURE_AT_LEAST; + seg.size = current[0].as<ArgWord>().get(); + seg.unit = current[1].as<ArgWord>().get(); + current += 2; + break; + } + case am_ensure_exactly: { + seg.action = BsmSegment::action::ENSURE_EXACTLY; + seg.size = current[0].as<ArgWord>().get(); + current += 1; + break; + } + case am_binary: + case am_integer: { + auto size = current[2].as<ArgWord>().get(); + auto unit = current[3].as<ArgWord>().get(); + + switch (cmd) { + case am_integer: + seg.action = BsmSegment::action::GET_INTEGER; + break; + case am_binary: + seg.action = BsmSegment::action::GET_BINARY; + break; + } + + seg.live = current[0]; + seg.size = size * unit; + seg.unit = unit; + seg.flags = current[1].as<ArgWord>().get(); + seg.dst = current[4].as<ArgRegister>(); + current += 5; + break; + } + case am_get_tail: { + seg.action = BsmSegment::action::GET_TAIL; + seg.live = current[0].as<ArgWord>(); + seg.dst = current[2].as<ArgRegister>(); + current += 3; + break; + } + case am_skip: { + seg.action = BsmSegment::action::SKIP; + seg.size = current[0].as<ArgWord>().get(); + seg.flags = 0; + current += 1; + break; + } + default: + abort(); + break; + } + segments.push_back(seg); + } + + segments = opt_bsm_segments(segments, Need, Live); + + const arm::Gp bin_base = ARG2; + const arm::Gp bin_position = ARG3; + const arm::Gp bin_size = ARG4; + const arm::Gp bitdata = ARG8; + bool position_is_valid = false; + + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::ENSURE_AT_LEAST: { + comment("ensure_at_least %ld %ld", seg.size, seg.unit); + auto ctx_reg = load_source(Ctx, TMP1); + auto stride = seg.size; + auto unit = seg.unit; + + a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset)); + a.ldur(bin_size, emit_boxed_val(ctx_reg.reg, size_offset)); + a.sub(TMP5, bin_size, bin_position); + cmp(TMP5, stride); + a.b_lo(resolve_beam_label(Fail, disp1MB)); + + if (unit != 1) { + if (stride % unit != 0) { + sub(TMP5, TMP5, stride); + } + + if ((unit & (unit - 1)) != 0) { + mov_imm(TMP4, unit); + + a.udiv(TMP3, TMP5, TMP4); + a.msub(TMP5, TMP3, TMP4, TMP5); + + a.cbnz(TMP5, resolve_beam_label(Fail, disp1MB)); + } else { + a.tst(TMP5, imm(unit - 1)); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } + } + + position_is_valid = true; + break; + } + case BsmSegment::action::ENSURE_EXACTLY: { + comment("ensure_exactly %ld", seg.size); + auto ctx_reg = load_source(Ctx, TMP1); + auto size = seg.size; + + a.ldur(bin_position, emit_boxed_val(ctx_reg.reg, position_offset)); + a.ldur(TMP3, emit_boxed_val(ctx_reg.reg, size_offset)); + if (size != 0) { + a.sub(TMP1, TMP3, bin_position); + cmp(TMP1, size); + } else { + a.subs(TMP1, TMP3, bin_position); + } + a.b_ne(resolve_beam_label(Fail, disp1MB)); + position_is_valid = true; + break; + } + case BsmSegment::action::TEST_HEAP: { + comment("test_heap %ld", seg.size); + emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); + position_is_valid = false; + break; + } + case BsmSegment::action::READ: { + comment("read %ld", seg.size); + if (seg.size == 0) { + comment("(nothing to do)"); + } else { + auto ctx = load_source(Ctx, ARG1); + + if (!position_is_valid) { + a.ldur(bin_position, + emit_boxed_val(ctx.reg, position_offset)); + position_is_valid = true; + } + a.ldur(bin_base, emit_boxed_val(ctx.reg, base_offset)); + + emit_read_bits(seg.size, bin_base, bin_position, bitdata); + + a.add(bin_position, bin_position, imm(seg.size)); + a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + } + break; + } + case BsmSegment::action::EXTRACT_BINARY: { + auto bits = seg.size; + auto Dst = seg.dst; + + comment("extract binary %ld", bits); + emit_extract_binary(bitdata, bits, Dst); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + break; + } + case BsmSegment::action::EXTRACT_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("extract integer %ld", bits); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + emit_extract_integer(bitdata, flags, bits, Dst); + break; + } + case BsmSegment::action::GET_INTEGER: { + Uint live = seg.live.as<ArgWord>().get(); + Uint flags = seg.flags; + auto bits = seg.size; + auto Dst = seg.dst; + + comment("get integer %ld", bits); + auto ctx = load_source(Ctx, TMP1); + + if (bits >= SMALL_BITS) { + emit_enter_runtime<Update::eHeap>(live); + } else { + emit_enter_runtime(live); + } + + a.mov(ARG1, c_p); + a.mov(ARG2, bits); + a.mov(ARG3, flags); + lea(ARG4, emit_boxed_val(ctx.reg, offsetof(ErlBinMatchState, mb))); + runtime_call<4>(erts_bs_get_integer_2); + + if (bits >= SMALL_BITS) { + emit_leave_runtime<Update::eHeap>(live); + } else { + emit_leave_runtime(live); + } + + mov_arg(Dst, ARG1); + + position_is_valid = false; + break; + } + case BsmSegment::action::GET_BINARY: { + auto Live = seg.live; + comment("get binary %ld", seg.size); + auto ctx = load_source(Ctx, TMP1); + + emit_enter_runtime<Update::eHeap>(Live.as<ArgWord>().get()); + + lea(ARG1, arm::Mem(c_p, offsetof(Process, htop))); + a.ldur(ARG2, emit_boxed_val(ctx.reg, orig_offset)); + a.ldur(ARG3, emit_boxed_val(ctx.reg, base_offset)); + a.ldur(ARG4, emit_boxed_val(ctx.reg, position_offset)); + mov_imm(ARG5, seg.size); + a.add(TMP2, ARG4, ARG5); + a.stur(TMP2, emit_boxed_val(ctx.reg, position_offset)); + runtime_call<5>(erts_extract_sub_binary); + + emit_leave_runtime<Update::eHeap>(Live.as<ArgWord>().get()); + + mov_arg(seg.dst, ARG1); + position_is_valid = false; + break; + } + case BsmSegment::action::GET_TAIL: { + comment("get_tail"); + + mov_arg(ARG1, Ctx); + fragment_call(ga->get_bs_get_tail_shared()); + mov_arg(seg.dst, ARG1); + position_is_valid = false; + break; + } + case BsmSegment::action::SKIP: { + comment("skip %ld", seg.size); + auto ctx = load_source(Ctx, TMP1); + if (!position_is_valid) { + a.ldur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + position_is_valid = true; + } + add(bin_position, bin_position, seg.size); + a.stur(bin_position, emit_boxed_val(ctx.reg, position_offset)); + break; + } + case BsmSegment::action::DROP: + auto bits = seg.size; + comment("drop %ld", bits); + if (bits != 0 && bits != 64) { + a.ror(bitdata, bitdata, imm(64 - bits)); + } + break; + } + } +} diff --git a/erts/emulator/beam/jit/arm/ops.tab b/erts/emulator/beam/jit/arm/ops.tab index e915ece301..c1c33ed309 100644 --- a/erts/emulator/beam/jit/arm/ops.tab +++ b/erts/emulator/beam/jit/arm/ops.tab @@ -275,11 +275,12 @@ load_tuple_ptr s # If positions are in consecutive memory, fetch and store two words at # once. +## FIXME: Fix this bug in maint, too. i_get_tuple_element Tuple Pos1 Dst1 | current_tuple Tuple2 | get_tuple_element Tuple3 Pos2 Dst2 | equal(Tuple, Tuple2) | equal(Tuple, Tuple3) | - consecutive_words(Pos1, Pos2) => + consecutive_words(Pos1, Pos2) | distinct(Dst1, Dst2) => get_two_tuple_elements Tuple Pos1 Dst1 Dst2 | current_tuple Tuple Dst2 @@ -881,7 +882,19 @@ i_flush_stubs i_breakpoint_trampoline # ================================================================ -# New bit syntax matching (R11B). +# New bit syntax matching for fixed sizes (from OTP 26). +# ================================================================ + +bs_match Fail Ctx Size Rest=* => bs_match(Fail, Ctx, Size, Rest) + +i_bs_match Fail Ctx Rest=* | test_heap Need Live => + i_bs_match_test_heap Fail Ctx Need Live Rest + +i_bs_match f S * +i_bs_match_test_heap f S I t * + +# ================================================================ +# Bit syntax matching (from R11B). # ================================================================ %warm diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp index 71294190b9..9f2040da00 100644 --- a/erts/emulator/beam/jit/x86/beam_asm.hpp +++ b/erts/emulator/beam/jit/x86/beam_asm.hpp @@ -1371,10 +1371,12 @@ protected: void emit_error(int code); - x86::Mem emit_bs_get_integer_prologue(Label next, - Label fail, - int flags, - int size); + void emit_bs_get_integer(const ArgRegister &Ctx, + const ArgLabel &Fail, + const ArgWord &Live, + const ArgWord Flags, + int bits, + const ArgRegister &Dst); int emit_bs_get_field_size(const ArgSource &Size, int unit, @@ -1396,6 +1398,25 @@ protected: bool bs_maybe_enter_runtime(bool entered); void bs_maybe_leave_runtime(bool entered); + void emit_read_bits(Uint bits, + const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp bitdata); + void emit_extract_integer(const x86::Gp bitdata, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst); + void emit_extract_binary(const x86::Gp bitdata, + Uint bits, + const ArgRegister &Dst); + void emit_read_integer(const x86::Gp bin_base, + const x86::Gp bin_position, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst); + void emit_raise_exception(); void emit_raise_exception(const ErtsCodeMFA *exp); void emit_raise_exception(Label I, const ErtsCodeMFA *exp); diff --git a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl index 2fa14f3ad9..bd4e6f9f43 100755 --- a/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl +++ b/erts/emulator/beam/jit/x86/beam_asm_global.hpp.pl @@ -31,7 +31,6 @@ my @beam_global_funcs = qw( bs_add_shared bs_create_bin_error_shared bs_size_check_shared - bs_fixed_integer_shared bs_get_tail_shared call_bif_shared call_light_bif_shared diff --git a/erts/emulator/beam/jit/x86/generators.tab b/erts/emulator/beam/jit/x86/generators.tab index 143e91dab7..ba896f9c0b 100644 --- a/erts/emulator/beam/jit/x86/generators.tab +++ b/erts/emulator/beam/jit/x86/generators.tab @@ -674,3 +674,59 @@ gen.create_bin(Fail, Alloc, Live, Unit, Dst, N, Segments) { return op; } + +gen.bs_match(Fail, Ctx, N, List) { + BeamOp* op; + int fixed_args; + int i; + + $NewBeamOp(S, op); + $BeamOpNameArity(op, i_bs_match, 2); + fixed_args = op->arity; + $BeamOpArity(op, (N.val + fixed_args)); + + op->a[0] = Fail; + op->a[1] = Ctx; + + for (i = 0; i < N.val; i++) { + BeamOpArg current; + Uint flags = 0; + + current = List[i]; + if (current.type == TAG_n) { + current.type = TAG_u; + current.val = 0; + } else if (current.type == TAG_q) { + Eterm term = beamfile_get_literal(&S->beam, current.val); + while (is_list(term)) { + Eterm* consp = list_val(term); + Eterm elem = CAR(consp); + switch (elem) { + case am_little: + flags |= BSF_LITTLE; + break; + case am_native: + flags |= BSF_NATIVE; + break; + case am_signed: + flags |= BSF_SIGNED; + break; + } + term = CDR(consp); + } + ASSERT(is_nil(term)); + current.type = TAG_u; + current.val = flags; + $NativeEndian(current); + } else if (current.type == TAG_o) { + /* An overflow tag (in ensure_at_least or ensure_exactly) + * means that the match will always fail. */ + $BeamOpNameArity(op, jump, 1); + op->a[0] = Fail; + return op; + } + op->a[i+fixed_args] = current; + } + + return op; +} diff --git a/erts/emulator/beam/jit/x86/instr_bs.cpp b/erts/emulator/beam/jit/x86/instr_bs.cpp index 2b9445a75a..e97678e4a6 100644 --- a/erts/emulator/beam/jit/x86/instr_bs.cpp +++ b/erts/emulator/beam/jit/x86/instr_bs.cpp @@ -652,63 +652,49 @@ void BeamModuleAssembler::emit_i_bs_get_position(const ArgRegister &Ctx, mov_arg(Dst, ARG1); } -/* ARG3 = flags | (size << 3), - * ARG4 = tagged match context */ -void BeamGlobalAssembler::emit_bs_fixed_integer_shared() { - emit_enter_runtime<Update::eStack | Update::eHeap>(); - - a.mov(ARG1, c_p); - /* Unpack size ... */ - a.mov(ARG2, ARG3); - a.shr(ARG2, imm(3)); - /* ... flags. */ - a.and_(ARG3, imm(BSF_ALIGNED | BSF_LITTLE | BSF_SIGNED)); - a.lea(ARG4, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb))); - runtime_call<4>(erts_bs_get_integer_2); - - emit_leave_runtime<Update::eStack | Update::eHeap>(); - - a.ret(); -} - -x86::Mem BeamModuleAssembler::emit_bs_get_integer_prologue(Label next, - Label fail, - int flags, - int size) { - Label aligned = a.newLabel(); - - a.mov(ARG2, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset))); - a.lea(ARG3, x86::qword_ptr(ARG2, size)); - a.cmp(ARG3, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.size))); - a.ja(fail); - - a.test(ARG2.r8(), imm(CHAR_BIT - 1)); - a.short_().je(aligned); - - /* Actually unaligned reads are quite rare, so we handle everything in a - * shared fragment. */ - mov_imm(ARG3, flags | (size << 3)); - safe_fragment_call(ga->get_bs_fixed_integer_shared()); +void BeamModuleAssembler::emit_bs_get_integer(const ArgRegister &Ctx, + const ArgLabel &Fail, + const ArgWord &Live, + const ArgWord Flags, + int bits, + const ArgRegister &Dst) { + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); - /* The above call can't fail since we work on small numbers and - * bounds-tested above. */ -#ifdef JIT_HARD_DEBUG - a.jmp(next); +#ifdef WIN32 + const x86::Gp bin_position = ARG1; + const x86::Gp bitdata = ARG4; #else - a.short_().jmp(next); + const x86::Gp bin_position = ARG4; + const x86::Gp bitdata = ARG1; #endif + ASSERT(bin_position == x86::rcx); + const x86::Gp bin_base = ARG2; + const x86::Gp ctx = ARG3; + const x86::Gp tmp = ARG5; - a.bind(aligned); - { - /* Read base address and convert offset to bytes. */ - a.mov(ARG1, emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.base))); - a.shr(ARG2, imm(3)); + mov_arg(ctx, Ctx); - /* We cannot fail from here on; bump the match context's position. */ - a.mov(emit_boxed_val(ARG4, offsetof(ErlBinMatchState, mb.offset)), - ARG3); + if (bits == 64) { + a.mov(ARG1, ctx); + emit_gc_test_preserve(ArgWord(BIG_UINT_HEAP_SIZE), Live, Ctx, ARG1); + a.mov(ARG3, ARG1); + } + + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.lea(RET, qword_ptr(bin_position, bits)); + a.cmp(RET, emit_boxed_val(ctx, size_offset)); + a.ja(resolve_beam_label(Fail)); - return x86::Mem(ARG1, ARG2, 0, 0, size / 8); + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.mov(emit_boxed_val(ctx, position_offset), RET); + + if (bits == 64) { + emit_read_bits(bits, bin_base, bin_position, bitdata); + emit_extract_integer(bitdata, tmp, Flags.get(), bits, Dst); + } else { + emit_read_integer(bin_base, bin_position, tmp, Flags.get(), bits, Dst); } } @@ -716,113 +702,21 @@ void BeamModuleAssembler::emit_i_bs_get_integer_8(const ArgRegister &Ctx, const ArgWord &Flags, const ArgLabel &Fail, const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 8); - - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); + emit_bs_get_integer(Ctx, Fail, ArgWord(0), Flags, 8, Dst); } void BeamModuleAssembler::emit_i_bs_get_integer_16(const ArgRegister &Ctx, const ArgWord &Flags, const ArgLabel &Fail, const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 16); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsx(RET, address); - } else { - a.movzx(RET, address); - } - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(x86::ax, address); - } else { - a.mov(x86::ax, address); - a.xchg(x86::al, x86::ah); - } - - if (flags & BSF_SIGNED) { - a.movsx(RET, x86::ax); - } else { - a.movzx(RET, x86::ax); - } - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); + emit_bs_get_integer(Ctx, Fail, ArgWord(0), Flags, 16, Dst); } void BeamModuleAssembler::emit_i_bs_get_integer_32(const ArgRegister &Ctx, const ArgWord &Flags, const ArgLabel &Fail, const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 32); - - if (flags & BSF_LITTLE) { - if (flags & BSF_SIGNED) { - a.movsxd(RET, address); - } else { - /* Implicitly zero-extends to 64 bits */ - a.mov(RETd, address); - } - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RETd, address); - } else { - a.mov(RETd, address); - a.bswap(RETd); - } - - if (flags & BSF_SIGNED) { - a.movsxd(RET, RETd); - } - } - - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - a.bind(next); - mov_arg(Dst, RET); + emit_bs_get_integer(Ctx, Fail, ArgWord(0), Flags, 32, Dst); } void BeamModuleAssembler::emit_i_bs_get_integer_64(const ArgRegister &Ctx, @@ -830,62 +724,7 @@ void BeamModuleAssembler::emit_i_bs_get_integer_64(const ArgRegister &Ctx, const ArgLabel &Fail, const ArgWord &Live, const ArgRegister &Dst) { - int flags = Flags.get(); - Label next = a.newLabel(); - x86::Mem address; - - mov_arg(ARG4, Ctx); - - emit_gc_test_preserve(ArgWord(BIG_UINT_HEAP_SIZE), Live, Ctx, ARG4); - - address = emit_bs_get_integer_prologue(next, - resolve_beam_label(Fail), - flags, - 64); - - if (flags & BSF_LITTLE) { - a.mov(RET, address); - } else { - if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { - a.movbe(RET, address); - } else { - a.mov(RET, address); - a.bswap(RET); - } - } - - a.mov(ARG1, RET); - a.mov(ARG2, RET); - - /* Speculatively make a small out of the result even though it might not - * be one, and jump to the next instruction if it is. */ - a.shl(RET, imm(_TAG_IMMED1_SIZE)); - a.or_(RET, imm(_TAG_IMMED1_SMALL)); - - if (flags & BSF_SIGNED) { - a.sar(ARG2, imm(SMALL_BITS - 1)); - a.add(ARG2, imm(1)); - a.cmp(ARG2, imm(1)); - a.jbe(next); - } else { - a.shr(ARG2, imm(SMALL_BITS - 1)); - a.jz(next); - } - - emit_enter_runtime(); - - a.mov(ARG2, HTOP); - if (flags & BSF_SIGNED) { - runtime_call<2>(small_to_big); - } else { - runtime_call<2>(uword_to_big); - } - a.add(HTOP, imm(sizeof(Eterm) * BIG_UINT_HEAP_SIZE)); - - emit_leave_runtime(); - - a.bind(next); - mov_arg(Dst, RET); + emit_bs_get_integer(Ctx, Fail, Live, Flags, 64, Dst); } void BeamModuleAssembler::emit_i_bs_get_integer(const ArgRegister &Ctx, @@ -1040,7 +879,6 @@ void BeamModuleAssembler::emit_i_bs_skip_bits2(const ArgRegister &Ctx, Label fail; fail = resolve_beam_label(Fail); - if (emit_bs_get_field_size(Bits, Unit.get(), fail, RET) >= 0) { emit_bs_skip_bits(Fail, Ctx); } @@ -2799,3 +2637,921 @@ void BeamModuleAssembler::emit_i_bs_create_bin(const ArgLabel &Fail, a.mov(RET, TMP_MEM1q); mov_arg(Dst, RET); } + +/* + * Here follows the bs_match instruction and friends. + */ + +struct BsmSegment { + BsmSegment() + : action(action::TEST_HEAP), live(ArgNil()), size(0), unit(1), + flags(0), dst(ArgXRegister(0)){}; + + enum class action { + TEST_HEAP, + ENSURE_AT_LEAST, + ENSURE_EXACTLY, + READ, + EXTRACT_BINARY, + EXTRACT_INTEGER, + READ_INTEGER, + GET_INTEGER, + GET_BINARY, + SKIP, + DROP, + GET_TAIL + } action; + ArgVal live; + Uint size; + Uint unit; + Uint flags; + ArgRegister dst; +}; + +void BeamModuleAssembler::emit_read_bits(Uint bits, + const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp bitdata) { + Label handle_partial = a.newLabel(); + Label shift = a.newLabel(); + Label read_done = a.newLabel(); + auto num_partial = bits % 8; + auto num_bytes_to_read = (bits + 7) / 8; + + a.mov(RET, bin_offset); + a.shr(RET, imm(3)); + a.and_(bin_offset.r32(), imm(7)); + + if (num_partial == 0) { + /* Byte-sized segment. If bit_offset is not byte-aligned, this + * segment always needs an additional byte. */ + a.jnz(handle_partial); + } else { + /* Non-byte-sized segment. Test whether we will need an + * additional byte. */ + a.cmp(bin_offset.r32(), imm(8 - num_partial)); + a.jg(handle_partial); + } + + /* We don't need an extra byte. */ + if (num_bytes_to_read == 1) { + a.movzx(bitdata.r32(), x86::byte_ptr(bin_base, RET)); + if (num_partial == 0) { + a.bswap(bitdata); + a.short_().jmp(read_done); + } else { + a.add(bin_offset.r32(), imm(56)); + a.short_().jmp(shift); + } + } else if (num_bytes_to_read <= 4) { + a.add(bin_base, RET); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + } else { + a.mov(bitdata.r32(), + x86::dword_ptr(bin_base, num_bytes_to_read - 4)); + a.bswap(bitdata.r32()); + } + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + a.short_().jmp(shift); + } else { + a.add(bin_base, RET); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 8)); + a.bswap(bitdata); + } + if (num_bytes_to_read < 8) { + a.add(bin_offset.r32(), imm(64 - 8 * num_bytes_to_read)); + } + a.short_().jmp(shift); + } + + /* We'll need an extra byte and we will need to shift. */ + a.bind(handle_partial); + + if (num_bytes_to_read == 1) { + a.mov(bitdata.r16(), x86::word_ptr(bin_base, RET)); + a.bswap(bitdata); + } else if (num_bytes_to_read < 8) { + a.add(bin_base, RET); + a.mov(bitdata, x86::qword_ptr(bin_base, num_bytes_to_read - 7)); + a.shr(bitdata, imm(64 - 8 * (num_bytes_to_read + 1))); + a.bswap(bitdata); + } else { + a.add(bin_base, RET); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(bitdata, x86::qword_ptr(bin_base)); + } else { + a.mov(bitdata, x86::qword_ptr(bin_base)); + a.bswap(bitdata); + } + ASSERT(bitdata != x86::rcx); + if (bin_offset != x86::rcx) { + a.mov(x86::cl, bin_offset.r8()); + } + a.shl(bitdata, x86::cl); + a.mov(RET.r8(), x86::byte_ptr(bin_base, 8)); + a.movzx(RET.r32(), RET.r8()); + a.shl(RET.r32(), x86::cl); + a.shr(RET.r32(), imm(8)); + a.or_(bitdata, RET); + a.short_().jmp(read_done); + } + + /* Shift the read data into the most significant bits of the + * word. */ + a.bind(shift); + ASSERT(bitdata != x86::rcx); + if (bin_offset != x86::rcx) { + a.mov(x86::cl, bin_offset.r8()); + } + a.shl(bitdata, x86::cl); + + a.bind(read_done); +} + +/* + * Read an integer and store as a term. This function only handles + * integers of certain common sizes. This is a special optimization + * when only one integer is to be extracted from a binary. + * + * Input: bin_base, bin_offset + * + * Clobbers: bin_base, bin_offset, tmp, RET + */ +void BeamModuleAssembler::emit_read_integer(const x86::Gp bin_base, + const x86::Gp bin_offset, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + Label handle_unaligned = a.newLabel(); + Label store = a.newLabel(); + x86::Mem address; + + a.mov(tmp, bin_offset); + a.shr(tmp, imm(3)); + a.and_(bin_offset.r32(), imm(7)); + + switch (bits) { + case 8: + address = x86::Mem(bin_base, tmp, 0, 0, 1); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RETb); + } else { + a.movsx(RET, RETb); + } + break; + case 16: + address = x86::Mem(bin_base, tmp, 0, 0, 2); + if ((flags & BSF_LITTLE) != 0) { + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, address); + } else { + a.movsx(RET, address); + } + } else { + /* Big-endian segment. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET.r16(), address); + } else { + a.mov(RET.r16(), address); + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsx(RET, RET.r16()); + } else { + a.movzx(RET, RET.r16()); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -1, 4); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RETd, bin_offset.r8()); + a.shr(RETd, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.xchg(x86::al, x86::ah); + } + + if ((flags & BSF_SIGNED) == 0) { + a.movzx(RETd, RET.r16()); + } else { + a.movsx(RET, RET.r16()); + } + break; + case 32: + address = x86::Mem(bin_base, tmp, 0, 0, 4); + if ((flags & BSF_LITTLE) != 0) { + /* Little-endian segment. */ + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, address); + } else { + a.movsxd(RET, address); + } + } else { + /* Big-endian segment. */ + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RETd, address); + } else { + a.mov(RETd, address); + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) != 0) { + a.movsxd(RET, RETd); + } + } + + a.short_().jz(store); + + a.bind(handle_unaligned); + a.add(bin_base, tmp); + address = x86::Mem(bin_base, -3, 8); + if (hasCpuFeature(CpuFeatures::X86::kMOVBE)) { + a.movbe(RET, address); + } else { + a.mov(RET, address); + a.bswap(RET); + } + ASSERT(bin_offset == x86::rcx); + a.shl(RET, bin_offset.r8()); + a.shr(RET, imm(8)); + + if ((flags & BSF_LITTLE) != 0) { + a.bswap(RETd); + } + + if ((flags & BSF_SIGNED) == 0) { + a.mov(RETd, RETd); + } else { + a.movsxd(RET, RETd); + } + break; + default: + ASSERT(0); + break; + } + + a.bind(store); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, RET); +} + +void BeamModuleAssembler::emit_extract_integer(const x86::Gp bitdata, + const x86::Gp tmp, + Uint flags, + Uint bits, + const ArgRegister &Dst) { + if (bits == 0) { + /* Necessary for correctness when matching a zero-size + * signed segment. + */ + mov_arg(Dst, make_small(0)); + return; + } + + Label big = a.newLabel(); + Label done = a.newLabel(); + Uint num_partial = bits % 8; + Uint num_complete = 8 * (bits / 8); + + if (bits <= 8) { + /* Endian does not matter for values that fit in a byte. */ + flags &= ~BSF_LITTLE; + } + + if ((flags & BSF_LITTLE) == 0) { + /* Big-endian segment. */ + a.mov(RET, bitdata); + } else if ((flags & BSF_LITTLE) != 0) { + /* Reverse endianness for this little-endian segment. */ + if (num_partial == 0) { + a.mov(RET, bitdata); + a.bswap(RET); + if (bits < 64) { + a.shl(RET, imm(64 - num_complete)); + } + } else { + Uint shifted_mask = ((1 << num_partial) - 1) << (8 - num_partial); + a.mov(tmp, bitdata); + a.shr(tmp, imm(64 - num_complete)); + a.bswap(tmp); + a.shr(tmp, imm(num_partial)); + + a.mov(RET, bitdata); + a.rol(RET, imm(num_complete + 8)); + a.and_(RETd, imm(shifted_mask)); + a.ror(RET, imm(8)); + a.or_(RET, tmp); + } + } + + /* Now the extracted data is in RET. */ + if (bits >= SMALL_BITS) { + /* Handle segments whose values might not fit in a small + * integer. */ + Label small = a.newLabel(); + comment("test whether this integer is a small"); + if (bits < 64) { + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits)); + } else { + /* Signed segment. */ + a.sar(RET, imm(64 - bits)); + } + } + a.mov(tmp, RET); + a.shr(tmp, imm(SMALL_BITS - 1)); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.jnz(big); + } else { + /* Signed segment. */ + a.jz(small); + a.cmp(tmp.r32(), imm(_TAG_IMMED1_MASK << 1 | 1)); + a.jnz(big); + } + + comment("store extracted integer as a small"); + a.bind(small); + a.shl(RET, imm(_TAG_IMMED1_SIZE)); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + a.short_().jmp(done); + } else { + /* This segment always fits in a small. */ + comment("store extracted integer as a small"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.shr(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } else { + /* Signed segment. */ + a.sar(RET, imm(64 - bits - _TAG_IMMED1_SIZE)); + } + ERTS_CT_ASSERT(_TAG_IMMED1_SMALL == (1 << _TAG_IMMED1_SIZE) - 1); + a.or_(RET, imm(_TAG_IMMED1_SMALL)); + } + + a.bind(big); + if (bits >= SMALL_BITS) { + comment("store extracted integer as a bignum"); + if ((flags & BSF_SIGNED) == 0) { + /* Unsigned segment. */ + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } else { + Label negative = a.newLabel(); + Label sign_done = a.newLabel(); + + /* Signed segment. */ + a.test(RET, RET); + a.short_().jl(negative); + + a.mov(x86::qword_ptr(HTOP), make_pos_bignum_header(1)); + a.short_().jmp(sign_done); + + a.bind(negative); + a.mov(x86::qword_ptr(HTOP), make_neg_bignum_header(1)); + a.neg(RET); + + a.bind(sign_done); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), RET); + } + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + a.add(HTOP, imm(sizeof(Eterm[2]))); + } + + a.bind(done); + mov_arg(Dst, RET); +} + +/* + * Clobbers: RET + */ +void BeamModuleAssembler::emit_extract_binary(const x86::Gp bitdata, + Uint bits, + const ArgRegister &Dst) { + Uint num_bytes = bits / 8; + + a.lea(RET, x86::qword_ptr(HTOP, TAG_PRIMARY_BOXED)); + mov_arg(Dst, RET); + a.mov(x86::qword_ptr(HTOP), header_heap_bin(num_bytes)); + a.mov(x86::qword_ptr(HTOP, sizeof(Eterm)), imm(num_bytes)); + a.mov(RET, bitdata); + a.bswap(RET); + a.mov(x86::qword_ptr(HTOP, 2 * sizeof(Eterm)), RET); + a.add(HTOP, imm(sizeof(Eterm[3]))); +} + +static std::vector<BsmSegment> opt_bsm_segments( + const std::vector<BsmSegment> segments, + const ArgWord &Need, + const ArgWord &Live) { + std::vector<BsmSegment> segs; + + Uint heap_need = Need.get(); + + /* + * First calculate the total number of heap words needed for + * bignums and binaries. + */ + for (auto seg : segments) { + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + if (seg.size >= SMALL_BITS) { + heap_need += BIG_NEED_FOR_BITS(seg.size); + } + break; + case BsmSegment::action::GET_BINARY: + heap_need += heap_bin_size((seg.size + 7) / 8); + break; + case BsmSegment::action::GET_TAIL: + heap_need += EXTRACT_SUB_BIN_HEAP_NEED; + break; + default: + break; + } + } + + int read_action_pos = -1; + int seg_index = 0; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + if (heap_need != 0 && seg.live.isWord()) { + BsmSegment s = seg; + + s.action = BsmSegment::action::TEST_HEAP; + s.size = heap_need; + segs.push_back(s); + heap_need = 0; + seg_index++; + } + + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + case BsmSegment::action::GET_BINARY: { + bool is_common_size; + switch (seg.size) { + case 8: + case 16: + case 32: + is_common_size = true; + break; + default: + is_common_size = false; + break; + } + + if (seg.size > 64) { + read_action_pos = -1; + } else if (seg.action == BsmSegment::action::GET_BINARY && + seg.size % 8 != 0) { + read_action_pos = -1; + } else if ((seg.flags & BSF_LITTLE) != 0 && is_common_size) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else if (read_action_pos < 0 && + seg.action == BsmSegment::action::GET_INTEGER && + is_common_size && i + 1 == count) { + seg.action = BsmSegment::action::READ_INTEGER; + read_action_pos = -1; + } else { + if ((seg.flags & BSF_LITTLE) != 0 || read_action_pos < 0 || + seg.size + segs.at(read_action_pos).size > 64) { + BsmSegment s; + + /* Create a new READ action. */ + read_action_pos = seg_index; + s.action = BsmSegment::action::READ; + s.size = seg.size; + segs.push_back(s); + seg_index++; + } else { + /* Reuse previous READ action. */ + segs.at(read_action_pos).size += seg.size; + } + switch (seg.action) { + case BsmSegment::action::GET_INTEGER: + seg.action = BsmSegment::action::EXTRACT_INTEGER; + break; + case BsmSegment::action::GET_BINARY: + seg.action = BsmSegment::action::EXTRACT_BINARY; + break; + default: + break; + } + } + segs.push_back(seg); + break; + } + case BsmSegment::action::SKIP: + if (read_action_pos >= 0 && + seg.size + segs.at(read_action_pos).size <= 64) { + segs.at(read_action_pos).size += seg.size; + seg.action = BsmSegment::action::DROP; + } else { + read_action_pos = -1; + } + segs.push_back(seg); + break; + default: + read_action_pos = -1; + segs.push_back(seg); + break; + } + seg_index++; + } + + /* Handle a trailing test_heap instruction (for the + * i_bs_match_test_heap instruction). */ + if (heap_need) { + BsmSegment seg; + + seg.action = BsmSegment::action::TEST_HEAP; + seg.size = heap_need; + seg.live = Live; + segs.push_back(seg); + } + return segs; +} + +void BeamModuleAssembler::emit_i_bs_match(ArgLabel const &Fail, + ArgRegister const &Ctx, + Span<ArgVal> const &List) { + emit_i_bs_match_test_heap(Fail, Ctx, ArgWord(0), ArgWord(0), List); +} + +void BeamModuleAssembler::emit_i_bs_match_test_heap(ArgLabel const &Fail, + ArgRegister const &Ctx, + ArgWord const &Need, + ArgWord const &Live, + Span<ArgVal> const &List) { + const int orig_offset = offsetof(ErlBinMatchState, mb.orig); + const int base_offset = offsetof(ErlBinMatchState, mb.base); + const int position_offset = offsetof(ErlBinMatchState, mb.offset); + const int size_offset = offsetof(ErlBinMatchState, mb.size); + + std::vector<BsmSegment> segments; + + auto current = List.begin(); + auto end = List.begin() + List.size(); + + while (current < end) { + auto cmd = current++->as<ArgImmed>().get(); + BsmSegment seg; + + switch (cmd) { + case am_ensure_at_least: { + seg.action = BsmSegment::action::ENSURE_AT_LEAST; + seg.size = current[0].as<ArgWord>().get(); + seg.unit = current[1].as<ArgWord>().get(); + current += 2; + break; + } + case am_ensure_exactly: { + seg.action = BsmSegment::action::ENSURE_EXACTLY; + seg.size = current[0].as<ArgWord>().get(); + current += 1; + break; + } + case am_binary: + case am_integer: { + auto size = current[2].as<ArgWord>().get(); + auto unit = current[3].as<ArgWord>().get(); + + switch (cmd) { + case am_integer: + seg.action = BsmSegment::action::GET_INTEGER; + break; + case am_binary: + seg.action = BsmSegment::action::GET_BINARY; + break; + } + + seg.live = current[0]; + seg.size = size * unit; + seg.unit = unit; + seg.flags = current[1].as<ArgWord>().get(); + seg.dst = current[4].as<ArgRegister>(); + current += 5; + break; + } + case am_get_tail: { + seg.action = BsmSegment::action::GET_TAIL; + seg.live = current[0].as<ArgWord>(); + seg.dst = current[2].as<ArgRegister>(); + current += 3; + break; + } + case am_skip: { + seg.action = BsmSegment::action::SKIP; + seg.size = current[0].as<ArgWord>().get(); + seg.flags = 0; + current += 1; + break; + } + default: + abort(); + break; + } + segments.push_back(seg); + } + + segments = opt_bsm_segments(segments, Need, Live); + +#ifdef WIN32 + const x86::Gp bin_position = ARG1; + const x86::Gp bitdata = ARG4; +#else + const x86::Gp bin_position = ARG4; + const x86::Gp bitdata = ARG1; +#endif + ASSERT(bin_position == x86::rcx); + const x86::Gp bin_base = ARG2; + const x86::Gp ctx = ARG3; + const x86::Gp tmp = ARG5; + + bool is_ctx_valid = false; + bool is_position_valid = false; + bool is_last = false; + int count = segments.size(); + + for (int i = 0; i < count; i++) { + auto seg = segments[i]; + is_last = i == count - 1; + switch (seg.action) { + case BsmSegment::action::ENSURE_AT_LEAST: { + auto size = seg.size; + auto unit = seg.unit; + comment("ensure_at_least %ld %ld", size, seg.unit); + mov_arg(ctx, Ctx); + if (unit == 1) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.lea(RET, qword_ptr(bin_position, size)); + a.cmp(RET, emit_boxed_val(ctx, size_offset)); + a.ja(resolve_beam_label(Fail)); + } else { + a.mov(RET, emit_boxed_val(ctx, size_offset)); + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + if (size != 0) { + cmp(RET, size, tmp); + } + a.jl(resolve_beam_label(Fail)); + } + + is_ctx_valid = is_position_valid = true; + + if (unit != 1) { + if (size % unit != 0) { + sub(RET, size, ARG1); + } + + if ((unit & (unit - 1))) { + /* Clobbers ARG3 */ + a.cqo(); + mov_imm(tmp, unit); + a.div(tmp); + a.test(x86::rdx, x86::rdx); + is_ctx_valid = is_position_valid = false; + } else { + a.test(RETb, imm(unit - 1)); + } + a.jnz(resolve_beam_label(Fail)); + } + break; + } + case BsmSegment::action::ENSURE_EXACTLY: { + auto size = seg.size; + comment("ensure_exactly %ld", size); + + mov_arg(ctx, Ctx); + a.mov(RET, emit_boxed_val(ctx, size_offset)); + if (is_last) { + a.sub(RET, emit_boxed_val(ctx, position_offset)); + is_ctx_valid = is_position_valid = false; + } else { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + a.sub(RET, bin_position); + is_ctx_valid = is_position_valid = true; + } + if (size != 0) { + cmp(RET, size, tmp); + } + a.jne(resolve_beam_label(Fail)); + break; + } + case BsmSegment::action::TEST_HEAP: { + comment("test_heap %ld", seg.size); + emit_gc_test(ArgWord(0), ArgWord(seg.size), seg.live); + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::READ: { + comment("read %ld", seg.size); + if (seg.size == 0) { + comment("(nothing to do)"); + } else { + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + + emit_read_bits(seg.size, bin_base, bin_position, bitdata); + } + + is_position_valid = false; + break; + } + case BsmSegment::action::EXTRACT_BINARY: { + auto bits = seg.size; + auto Dst = seg.dst; + + comment("extract binary %ld", bits); + emit_extract_binary(bitdata, bits, Dst); + if (!is_last && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + break; + } + case BsmSegment::action::EXTRACT_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("extract integer %ld", bits); + if (is_last && flags == 0 && bits < SMALL_BITS) { + a.shr(bitdata, imm(64 - bits - _TAG_IMMED1_SIZE)); + a.or_(bitdata, imm(_TAG_IMMED1_SMALL)); + mov_arg(Dst, bitdata); + } else { + emit_extract_integer(bitdata, tmp, flags, bits, Dst); + if (!is_last && bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + } + + /* bin_position is clobbered. */ + is_position_valid = false; + break; + } + case BsmSegment::action::READ_INTEGER: { + auto bits = seg.size; + auto flags = seg.flags; + auto Dst = seg.dst; + + comment("read integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + if (!is_position_valid) { + a.mov(bin_position, emit_boxed_val(ctx, position_offset)); + is_position_valid = true; + } + + a.mov(bin_base, emit_boxed_val(ctx, base_offset)); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + emit_read_integer(bin_base, bin_position, tmp, flags, bits, Dst); + + is_position_valid = false; + break; + } + case BsmSegment::action::GET_INTEGER: { + Uint flags = seg.flags; + auto bits = seg.size; + auto Dst = seg.dst; + + comment("get integer %ld", bits); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + } + + a.lea(ARG4, emit_boxed_val(ctx, offsetof(ErlBinMatchState, mb))); + + if (bits >= SMALL_BITS) { + emit_enter_runtime<Update::eReductions | Update::eStack | + Update::eHeap>(); + } else { + emit_enter_runtime(); + } + + a.mov(ARG1, c_p); + a.mov(ARG2, bits); + a.mov(ARG3, flags); + /* ARG4 set above */ + runtime_call<4>(erts_bs_get_integer_2); + + if (bits >= SMALL_BITS) { + emit_leave_runtime<Update::eReductions | Update::eStack | + Update::eHeap>(); + } else { + emit_leave_runtime(); + } + + mov_arg(Dst, RET); + + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::GET_BINARY: { + comment("get binary %ld", seg.size); + if (is_ctx_valid) { + a.mov(RET, ctx); + } else { + mov_arg(RET, Ctx); + } + emit_enter_runtime<Update::eHeap>(); + a.lea(ARG1, x86::qword_ptr(c_p, offsetof(Process, htop))); + a.mov(ARG2, emit_boxed_val(RET, orig_offset)); + a.mov(ARG3, emit_boxed_val(RET, base_offset)); + a.mov(ARG4, emit_boxed_val(RET, position_offset)); + mov_imm(ARG5, seg.size); + a.add(emit_boxed_val(RET, position_offset), ARG5); + + runtime_call<5>(erts_extract_sub_binary); + + emit_leave_runtime<Update::eHeap>(); + mov_arg(seg.dst, RET); + + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::GET_TAIL: { + comment("get_tail"); + if (is_ctx_valid) { + a.mov(ARG1, ctx); + } else { + mov_arg(ARG1, Ctx); + } + safe_fragment_call(ga->get_bs_get_tail_shared()); + mov_arg(seg.dst, RET); + is_ctx_valid = is_position_valid = false; + break; + } + case BsmSegment::action::SKIP: { + comment("skip %ld", seg.size); + if (!is_ctx_valid) { + mov_arg(ctx, Ctx); + is_ctx_valid = true; + } + /* The compiler limits the size of any segment in a bs_match + * instruction to 24 bits. */ + ASSERT((seg.size >> 24) == 0); + a.add(emit_boxed_val(ctx, position_offset), imm(seg.size)); + is_position_valid = false; + break; + } + case BsmSegment::action::DROP: + auto bits = seg.size; + comment("drop %ld", bits); + if (bits != 0 && bits != 64) { + a.shl(bitdata, imm(bits)); + } + break; + } + } +} diff --git a/erts/emulator/beam/jit/x86/ops.tab b/erts/emulator/beam/jit/x86/ops.tab index 1da4f80e6d..37914d7363 100644 --- a/erts/emulator/beam/jit/x86/ops.tab +++ b/erts/emulator/beam/jit/x86/ops.tab @@ -849,7 +849,19 @@ i_lambda_trampoline F f W W i_breakpoint_trampoline # ================================================================ -# New bit syntax matching (R11B). +# New bit syntax matching for fixed sizes (from OTP 26). +# ================================================================ + +bs_match Fail Ctx Size Rest=* => bs_match(Fail, Ctx, Size, Rest) + +i_bs_match Fail Ctx Rest=* | test_heap Need Live => + i_bs_match_test_heap Fail Ctx Need Live Rest + +i_bs_match f S * +i_bs_match_test_heap f S I t * + +# ================================================================ +# Bit syntax matching (from R11B). # ================================================================ %warm |