diff options
Diffstat (limited to 'erts/emulator/test/bs_utf_SUITE.erl')
-rw-r--r-- | erts/emulator/test/bs_utf_SUITE.erl | 136 |
1 files changed, 124 insertions, 12 deletions
diff --git a/erts/emulator/test/bs_utf_SUITE.erl b/erts/emulator/test/bs_utf_SUITE.erl index 68099c6f39..4a16ef44ab 100644 --- a/erts/emulator/test/bs_utf_SUITE.erl +++ b/erts/emulator/test/bs_utf_SUITE.erl @@ -20,11 +20,12 @@ -module(bs_utf_SUITE). --export([all/0, suite/0, +-export([all/0, suite/0, init_per_suite/1, end_per_suite/1, utf8_roundtrip/1,utf16_roundtrip/1,utf32_roundtrip/1, utf8_illegal_sequences/1,utf16_illegal_sequences/1, utf32_illegal_sequences/1, - bad_construction/1]). + bad_construction/1, + utf8_big_file/1]). -include_lib("common_test/include/ct.hrl"). @@ -34,26 +35,93 @@ suite() -> [{ct_hooks,[ts_install_cth]}, {timetrap, {minutes, 6}}]. -all() -> +all() -> [utf8_roundtrip, utf16_roundtrip, utf32_roundtrip, utf8_illegal_sequences, utf16_illegal_sequences, - utf32_illegal_sequences, bad_construction]. + utf32_illegal_sequences, bad_construction, + utf8_big_file]. + +init_per_suite(Config) -> + %% Make sure that calls to id/1 will hide types. + id(Config), + Config. + +end_per_suite(Config) -> + Config. utf8_roundtrip(Config) when is_list(Config) -> utf8_roundtrip(0, 16#D7FF), utf8_roundtrip(16#E000, 16#10FFFF), ok. -utf8_roundtrip(First, Last) when First =< Last -> - Bin = int_to_utf8(First), +utf8_roundtrip(First, Last) -> + %% Hide types. + do_utf8_roundtrip(id(First), id(Last)). + +do_utf8_roundtrip(First, Last) when First =< Last -> + Bin = int_to_utf8(id(First)), Bin = id(<<First/utf8>>), Bin = id(<<(id(<<>>))/binary,First/utf8>>), - Unaligned = id(<<3:2,First/utf8>>), - <<_:2,Bin/binary>> = Unaligned, + + <<0:7/unit:8,Bin/binary>> = id(<<0:7/unit:8,First/utf8>>), + + %% Here a heap binary and a sub binary will be allocated. If the + %% write in the utf8 segment extends beyond the end of the heap binary, + %% it will overwrite the header for the sub binary. 
+ <<-1:(64-9)/signed,Bin/binary>> = id(<<-1:(64-9),First/utf8>>), + <<-1:63/signed,Bin/binary>> = id(<<-1:63,First/utf8>>), + + if + is_integer(First) -> + Bin = id(<<First/utf8>>) + end, + + <<1:1,Bin/binary>> = id(<<1:1,First/utf8>>), + <<0:1,Bin/binary>> = id(<<0:1,First/utf8>>), + <<3:2,Bin/binary>> = id(<<3:2,First/utf8>>), + <<5:3,Bin/binary>> = id(<<5:3,First/utf8>>), + <<13:4,Bin/binary>> = id(<<13:4,First/utf8>>), + <<21:5,Bin/binary>> = id(<<21:5,First/utf8>>), + <<51:6,Bin/binary>> = id(<<51:6,First/utf8>>), + <<107:7,Bin/binary>> = id(<<107:7,First/utf8>>), + <<First/utf8>> = Bin, <<First/utf8>> = make_unaligned(Bin), - utf8_roundtrip(First+1, Last); -utf8_roundtrip(_, _) -> ok. + + %% Matching of utf8 segments uses different code paths depending + %% on the number of bytes available in the binary. Make sure + %% we test both code paths. + <<First/utf8,0:64>> = id(<<Bin/binary,0:64>>), + <<0:3,First/utf8,0:64>> = id(<<0:3,Bin/binary,0:64>>), + + unaligned_match(First), + + Bin = id(<<First/utf8>>), + do_utf8_roundtrip(First+1, Last); +do_utf8_roundtrip(_, _) -> ok. + +unaligned_match(Char) -> + %% We create a REFC binary so that we can create sub binaries + %% and control the contents just beyond the end of the binary. + _ = [begin + Bin = id(<<0:64/unit:8,0:Offset,Char/utf8>>), + <<0:64/unit:8,0:Offset,Char/utf8>> = Bin, + unaligned_match(Bin, Offset, 8) + end || Offset <- lists:seq(1, 7)], + ok. + +unaligned_match(_Bin, _Offset, 0) -> + ok; +unaligned_match(Bin, Offset, N) -> + Size = bit_size(Bin), + <<Shorter:(Size-1)/bits,_:1>> = Bin, + try + <<0:64/unit:8,0:Offset,Char/utf8>> = Shorter, + ct:fail({short_binary_accepted,Shorter,Char}) + catch + error:{badmatch,_} -> + unaligned_match(Shorter, Offset, N - 1) + end. 
utf16_roundtrip(Config) when is_list(Config) -> Big = fun utf16_big_roundtrip/1, @@ -149,6 +217,7 @@ fail_range(Char, End) when Char =< End -> {'EXIT',_} = (catch <<Char/utf8>>), Bin = int_to_utf8(Char), fail(Bin), + fail(<<Bin/binary,0:64>>), fail_range(Char+1, End); fail_range(_, _) -> ok. @@ -201,24 +270,39 @@ overlong(Char, Last, NumBytes) when Char =< Last -> overlong(_, _, _) -> ok. overlong(Char, NumBytes) when NumBytes < 5 -> - case int_to_utf8(Char, NumBytes) of + Bin = int_to_utf8(Char, NumBytes), + case <<(int_to_utf8(Char, NumBytes))/binary>> of <<Char/utf8>>=Bin -> ct:fail({illegal_encoding_accepted,Bin,Char}); <<OtherChar/utf8>>=Bin -> ct:fail({illegal_encoding_accepted,Bin,Char,OtherChar}); _ -> ok end, + case <<(int_to_utf8(Char, NumBytes))/binary,0:64>> of + <<Char/utf8,0:64>>=Bin2 -> + ct:fail({illegal_encoding_accepted,Bin2,Char}); + <<OtherChar2/utf8,0:64>>=Bin2 -> + ct:fail({illegal_encoding_accepted,Bin2,Char,OtherChar2}); + _ -> ok + end, overlong(Char, NumBytes+1); overlong(_, _) -> ok. fail(Bin) -> fail_1(Bin), - fail_1(make_unaligned(Bin)). + fail_1(make_unaligned(Bin)), + BinExt = <<Bin/binary,0:64>>, + fail_2(BinExt), + fail_2(make_unaligned(BinExt)). fail_1(<<Char/utf8>>=Bin) -> ct:fail({illegal_encoding_accepted,Bin,Char}); fail_1(_) -> ok. +fail_2(<<Char/utf8,0:64>>=Bin) -> + ct:fail({illegal_encoding_accepted,Bin,Char}); +fail_2(_) -> ok. + utf16_illegal_sequences(Config) when is_list(Config) -> utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. 
@@ -295,6 +379,9 @@ bad_construction(Config) when is_list(Config) -> ?FAIL(<<3.14/utf8>>), ?FAIL(<<3.1415/utf16>>), ?FAIL(<<3.1415/utf32>>), + {'EXIT',_} = (catch <<(id(3.14))/utf8>>), + {'EXIT',_} = (catch <<(id(3.1415))/utf16>>), + {'EXIT',_} = (catch <<(id(3.1415))/utf32>>), ?FAIL(<<(-1)/utf8>>), ?FAIL(<<(-1)/utf16>>), @@ -305,9 +392,23 @@ bad_construction(Config) when is_list(Config) -> ?FAIL(<<16#D800/utf8>>), ?FAIL(<<16#D800/utf16>>), ?FAIL(<<16#D800/utf32>>), + {'EXIT',_} = (catch <<(id(16#D800))/utf8>>), + {'EXIT',_} = (catch <<(id(16#D800))/utf16>>), + {'EXIT',_} = (catch <<(id(16#D800))/utf32>>), ok. +utf8_big_file(Config) -> + DataDir = get_data_dir(Config), + {ok, Bin} = file:read_file(filename:join(DataDir, "NormalizationTest.txt")), + List = unicode:characters_to_list(Bin), + _ = [begin + io:format("~p\n", [Offset]), + <<0:Offset, Rest/binary>> = id(<<0:Offset, Bin/binary>>), + List = [Char || <<Char/utf8>> <= Rest] + end || Offset <- lists:seq(0, 8)], + ok. + %% This function intentionally allows construction of %% UTF-8 sequence in illegal ranges. int_to_utf8(I) when I =< 16#7F -> @@ -384,4 +485,15 @@ evaluate(Str, Vars) -> Result end. +%% Retrieve the original data directory for cloned modules. +get_data_dir(Config) -> + Data = proplists:get_value(data_dir, Config), + Opts = [{return,list}], + Suffixes = ["_no_opt_SUITE", + "_r25_SUITE"], + lists:foldl(fun(Suffix, Acc) -> + Opts = [{return,list}], + re:replace(Acc, Suffix, "_SUITE", Opts) + end, Data, Suffixes). + id(I) -> I. |