From 4f0ec73674b5c042084b528642185f968f7d9981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Tue, 2 Aug 2022 09:58:05 +0200
Subject: Optimize binary matching for fixed-width segments

Consider this function:

    foo(<<A:6, B:6, C:6, D:6>>) ->
        {A, B, C, D}.

The compiler in Erlang/OTP 25 and earlier would generate the following
code for doing the binary matching:

    {test,bs_start_match3,{f,1},1,[{x,0}],{x,1}}.
    {bs_get_position,{x,1},{x,0},2}.
    {test,bs_get_integer2,
          {f,3},
          2,
          [{x,1},
           {integer,6},
           1,
           {field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
          {x,2}}.
    {test,bs_get_integer2,
          {f,3},
          3,
          [{x,1},
           {integer,6},
           1,
           {field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
          {x,3}}.
    {test,bs_get_integer2,
          {f,3},
          4,
          [{x,1},
           {integer,6},
           1,
           {field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
          {x,4}}.
    {test,bs_get_integer2,
          {f,3},
          5,
          [{x,1},
           {integer,6},
           1,
           {field_flags,[{anno,[4,{file,"t.erl"}]},unsigned,big]}],
          {x,5}}.
    {test,bs_test_tail2,{f,3},[{x,1},0]}.

That is, there would be one instruction for each segment being
matched. Having separate match instructions for each segment makes it
difficult for the JIT to do any serious optimization. Currently, when
matching a segment with a size that is not a multiple of 8, the JIT
will generate code that calls a helper function. Common sizes such as
8, 16, and 32 are specially optimized with inline code in the x86 JIT
and in the non-JIT BEAM VM.

This commit introduces a new `bs_match` instruction for matching of
integer and binary segments of fixed size. Here is the generated code
for the example:

    {test,bs_start_match3,{f,1},1,[{x,0}],{x,1}}.
    {bs_get_position,{x,1},{x,0},2}.
    {bs_match,{f,3},
              {x,1},
              {commands,[{ensure_exactly,24},
                         {integer,2,{literal,[]},6,1,{x,2}},
                         {integer,3,{literal,[]},6,1,{x,3}},
                         {integer,4,{literal,[]},6,1,{x,4}},
                         {integer,5,{literal,[]},6,1,{x,5}}]}}.

Having only one instruction for the matching allows the JIT to
generate faster code. The generated code will do the following:

* Test that the size of the binary being matched is exactly 24 bits.

* Read 24 bits from the binary into a temporary CPU register.

* For each segment, extract the integer from the temporary register
  by shifting and masking.

Because of the aforementioned optimization for certain common
segment sizes, the main part of the Base64 encoding in the `base64`
module is currently implemented in the following non-intuitive way:

    encode_binary(<<B1:8, B2:8, B3:8, Ls/bits>>, A) ->
        BB = (B1 bsl 16) bor (B2 bsl 8) bor B3,
        encode_binary(Ls,
                      <<A/bits,(b64e(BB bsr 18)):8,
                        (b64e((BB bsr 12) band 63)):8,
                        (b64e((BB bsr 6) band 63)):8,
                        (b64e(BB band 63)):8>>)

With the new optimization, it is now possible to express the Base64
encoding in a more natural way, which is also faster than before:

    encode_binary(<<B1:6, B2:6, B3:6, B4:6, Ls/bits>>, A) ->
        encode_binary(Ls,
                      <<A/bits,
                        (b64e(B1)):8,
                        (b64e(B2)):8,
                        (b64e(B3)):8,
                        (b64e(B4)):8>>)
---
 erts/emulator/beam/atom.names | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'erts/emulator/beam/atom.names')

diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index b04a5f6052..1332f3a0d9 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -256,6 +256,7 @@ atom enable_trace
 atom enabled
 atom endian
 atom env
+atom ensure_at_least ensure_exactly
 atom eof
 atom eol
 atom Eq='=:='
@@ -324,6 +325,7 @@ atom get_all_trap
 atom get_internal_state_blocked
 atom get_seq_token
 atom get_size
+atom get_tail
 atom get_tcw
 atom gather_gc_info_result
 atom gather_io_bytes
@@ -645,6 +647,7 @@ atom set_tcw_fake
 atom short
 atom shutdown
 atom sighup
+atom signed
 atom sigterm
 atom sigusr1
 atom sigusr2
@@ -659,6 +662,7 @@ atom sigtstp
 atom sigquit
 atom silent
 atom size
+atom skip
 atom spawn_executable
 atom spawn_driver
 atom spawn_init
-- 
cgit v1.2.1