1 files changed, 124 insertions, 12 deletions
diff --git a/erts/emulator/test/bs_utf_SUITE.erl b/erts/emulator/test/bs_utf_SUITE.erl
index 68099c6f39..4a16ef44ab 100644
--- a/erts/emulator/test/bs_utf_SUITE.erl
+++ b/erts/emulator/test/bs_utf_SUITE.erl
@@ -20,11 +20,12 @@
 
 -module(bs_utf_SUITE).
 
--export([all/0, suite/0,
+-export([all/0, suite/0, init_per_suite/1, end_per_suite/1,
 	 utf8_roundtrip/1,utf16_roundtrip/1,utf32_roundtrip/1,
 	 utf8_illegal_sequences/1,utf16_illegal_sequences/1,
 	 utf32_illegal_sequences/1,
-	 bad_construction/1]).
+	 bad_construction/1,
+         utf8_big_file/1]).
 
 -include_lib("common_test/include/ct.hrl").
 
@@ -34,26 +35,93 @@ suite() ->
     [{ct_hooks,[ts_install_cth]},
      {timetrap, {minutes, 6}}].
 
-all() -> 
+all() ->
     [utf8_roundtrip, utf16_roundtrip, utf32_roundtrip,
      utf8_illegal_sequences, utf16_illegal_sequences,
-     utf32_illegal_sequences, bad_construction].
+     utf32_illegal_sequences, bad_construction,
+     utf8_big_file].
+
+init_per_suite(Config) ->
+    %% Call id/1 with a non-constant term so that later id/1 calls keep hiding types.
+    id(Config), % result deliberately discarded
+    Config.
+
+end_per_suite(Config) ->
+    Config. % nothing to clean up; Config is returned unchanged
 
 utf8_roundtrip(Config) when is_list(Config) ->
     utf8_roundtrip(0, 16#D7FF),
     utf8_roundtrip(16#E000, 16#10FFFF),
     ok.
 
-utf8_roundtrip(First, Last) when First =< Last ->
-    Bin = int_to_utf8(First),
+utf8_roundtrip(First, Last) ->
+    %% Hide types: utf8_roundtrip/1 passes literal bounds, so route them through id/1.
+    do_utf8_roundtrip(id(First), id(Last)).
+
+do_utf8_roundtrip(First, Last) when First =< Last -> % one iteration per code point in First..Last
+    Bin = int_to_utf8(id(First)),
     Bin = id(<<First/utf8>>),
     Bin = id(<<(id(<<>>))/binary,First/utf8>>),
-    Unaligned = id(<<3:2,First/utf8>>),
-    <<_:2,Bin/binary>> = Unaligned,
+
+    <<0:7/unit:8,Bin/binary>> = id(<<0:7/unit:8,First/utf8>>),
+
+    %% Here a heap binary and a sub binary will be allocated. If the
+    %% write in the utf8 segment extends beyond the end of the heap binary,
+    %% it will overwrite the header for the sub binary.
+    <<-1:(64-9)/signed,Bin/binary>> = id(<<-1:(64-9),First/utf8>>),
+    <<-1:63/signed,Bin/binary>> = id(<<-1:63,First/utf8>>),
+
+    if
+        is_integer(First) -> % NOTE(review): presumably makes First a known integer for the compiler - confirm
+            Bin = id(<<First/utf8>>)
+    end,
+
+    <<1:1,Bin/binary>> = id(<<1:1,First/utf8>>), % prefixes of 1..7 bits: utf8 segment at every unaligned offset
+    <<0:1,Bin/binary>> = id(<<0:1,First/utf8>>),
+    <<3:2,Bin/binary>> = id(<<3:2,First/utf8>>),
+    <<5:3,Bin/binary>> = id(<<5:3,First/utf8>>),
+    <<13:4,Bin/binary>> = id(<<13:4,First/utf8>>),
+    <<21:5,Bin/binary>> = id(<<21:5,First/utf8>>),
+    <<51:6,Bin/binary>> = id(<<51:6,First/utf8>>),
+    <<107:7,Bin/binary>> = id(<<107:7,First/utf8>>),
+
     <<First/utf8>> = Bin,
     <<First/utf8>> = make_unaligned(Bin),
-    utf8_roundtrip(First+1, Last);
-utf8_roundtrip(_, _) -> ok.
+
+    %% Matching of utf8 segments uses different code paths depending
+    %% on the number of bytes available in the binary. Make sure
+    %% we test both code paths.
+    <<First/utf8,0:64>> = id(<<Bin/binary,0:64>>),
+    <<0:3,First/utf8,0:64>> = id(<<0:3,Bin/binary,0:64>>),
+
+    unaligned_match(First),
+
+    Bin = id(<<First/utf8>>),
+    do_utf8_roundtrip(First+1, Last);
+do_utf8_roundtrip(_, _) -> ok.
+
+unaligned_match(Char) ->
+    %% We create a REFC binary so that we can create sub binaries
+    %% and control the contents just beyond the end of the binary.
+    _ = [begin
+             Bin = id(<<0:64/unit:8,0:Offset,Char/utf8>>), % 64 zero bytes, Offset zero bits, then Char
+             <<0:64/unit:8,0:Offset,Char/utf8>> = Bin, % the complete binary must match
+             unaligned_match(Bin, Offset, 8) % then check 8 successively shorter prefixes
+         end || Offset <- lists:seq(1, 7)], % every non-byte-aligned bit offset
+    ok.
+
+unaligned_match(_Bin, _Offset, 0) -> % N shortened prefixes have been checked
+    ok;
+unaligned_match(Bin, Offset, N) ->
+    Size = bit_size(Bin),
+    <<Shorter:(Size-1)/bits,_:1>> = Bin, % drop the last bit
+    try
+        <<0:64/unit:8,0:Offset,Char/utf8>> = Shorter, % must not match: the encoded character is truncated
+        ct:fail({short_binary_accepted,Shorter,Char})
+    catch
+        error:{badmatch,_} -> % expected outcome; continue with one more bit removed
+            unaligned_match(Shorter, Offset, N - 1)
+    end.
 
 utf16_roundtrip(Config) when is_list(Config) ->
     Big = fun utf16_big_roundtrip/1,
@@ -149,6 +217,7 @@ fail_range(Char, End) when Char =< End ->
     {'EXIT',_} = (catch <<Char/utf8>>),
     Bin = int_to_utf8(Char),
     fail(Bin),
+    fail(<<Bin/binary,0:64>>),
     fail_range(Char+1, End);
 fail_range(_, _) -> ok.
 
@@ -201,24 +270,39 @@ overlong(Char, Last, NumBytes) when Char =< Last ->
 overlong(_, _, _) -> ok.
 
 overlong(Char, NumBytes) when NumBytes < 5 ->
-    case int_to_utf8(Char, NumBytes) of
+    Bin = int_to_utf8(Char, NumBytes), % Bin is bound here, so `=Bin` in the patterns below is an equality test
+    case <<(int_to_utf8(Char, NumBytes))/binary>> of
 	<<Char/utf8>>=Bin ->
 	    ct:fail({illegal_encoding_accepted,Bin,Char});
 	<<OtherChar/utf8>>=Bin ->
 	    ct:fail({illegal_encoding_accepted,Bin,Char,OtherChar});
 	_ -> ok
     end,
+    case <<(int_to_utf8(Char, NumBytes))/binary,0:64>> of % same check with 64 zero bits after the sequence
+	<<Char/utf8,0:64>>=Bin2 ->
+	    ct:fail({illegal_encoding_accepted,Bin2,Char});
+	<<OtherChar2/utf8,0:64>>=Bin2 ->
+	    ct:fail({illegal_encoding_accepted,Bin2,Char,OtherChar2});
+	_ -> ok
+    end,
     overlong(Char, NumBytes+1);
 overlong(_, _) -> ok.
 
 fail(Bin) ->
     fail_1(Bin),
-    fail_1(make_unaligned(Bin)).
+    fail_1(make_unaligned(Bin)),
+    BinExt = <<Bin/binary,0:64>>, % same sequence followed by 64 zero bits
+    fail_2(BinExt), % fail_2/1 matches a utf8 segment followed by 0:64
+    fail_2(make_unaligned(BinExt)).
 
 fail_1(<<Char/utf8>>=Bin) ->
     ct:fail({illegal_encoding_accepted,Bin,Char});
 fail_1(_) -> ok.
 
+fail_2(<<Char/utf8,0:64>>=Bin) -> % a utf8 segment followed by exactly 64 zero bits was accepted
+    ct:fail({illegal_encoding_accepted,Bin,Char});
+fail_2(_) -> ok. % no match is the expected outcome
+
 
 utf16_illegal_sequences(Config) when is_list(Config) ->
     utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
@@ -295,6 +379,9 @@ bad_construction(Config) when is_list(Config) ->
     ?FAIL(<<3.14/utf8>>),
     ?FAIL(<<3.1415/utf16>>),
     ?FAIL(<<3.1415/utf32>>),
+    {'EXIT',_} = (catch <<(id(3.14))/utf8>>),
+    {'EXIT',_} = (catch <<(id(3.1415))/utf16>>),
+    {'EXIT',_} = (catch <<(id(3.1415))/utf32>>),
 
     ?FAIL(<<(-1)/utf8>>),
     ?FAIL(<<(-1)/utf16>>),
@@ -305,9 +392,23 @@ bad_construction(Config) when is_list(Config) ->
     ?FAIL(<<16#D800/utf8>>),
     ?FAIL(<<16#D800/utf16>>),
     ?FAIL(<<16#D800/utf32>>),
+    {'EXIT',_} = (catch <<(id(16#D800))/utf8>>),
+    {'EXIT',_} = (catch <<(id(16#D800))/utf16>>),
+    {'EXIT',_} = (catch <<(id(16#D800))/utf32>>),
 
     ok.
 
+utf8_big_file(Config) -> % decodes a whole data file with utf8 matching at several bit offsets
+    DataDir = get_data_dir(Config),
+    {ok, Bin} = file:read_file(filename:join(DataDir, "NormalizationTest.txt")),
+    List = unicode:characters_to_list(Bin), % reference decoding to compare against
+    _ = [begin
+             io:format("~p\n", [Offset]),
+             <<0:Offset, Rest/binary>> = id(<<0:Offset, Bin/binary>>), % Rest holds Bin's bytes, Offset bits into the new binary
+             List = [Char || <<Char/utf8>> <= Rest] % utf8 matching must reproduce the reference list
+         end || Offset <- lists:seq(0, 8)], % bit offsets 0..8 (0 and 8 are byte-aligned)
+    ok.
+
 %% This function intentionally allows construction of
 %% UTF-8 sequence in illegal ranges.
 int_to_utf8(I) when I =< 16#7F ->
@@ -384,4 +485,15 @@ evaluate(Str, Vars) ->
 	    Result
     end.
 
+%% Retrieve the original data directory for cloned modules.
+get_data_dir(Config) ->
+    Data = proplists:get_value(data_dir, Config),
+    Opts = [{return,list}],
+    Suffixes = ["_no_opt_SUITE", % each of these, where it first occurs in Data,
+                "_r25_SUITE"], % is replaced by "_SUITE"
+    lists:foldl(fun(Suffix, Acc) ->
+                        Opts = [{return,list}], % NOTE(review): redundant match; Opts is already bound above
+                        re:replace(Acc, Suffix, "_SUITE", Opts)
+                end, Data, Suffixes).
+
 id(I) -> I.