summaryrefslogtreecommitdiff
path: root/lib/stdlib/uc_spec/gen_unicode_mod.escript
diff options
context:
space:
mode:
Diffstat (limited to 'lib/stdlib/uc_spec/gen_unicode_mod.escript')
-rw-r--r--lib/stdlib/uc_spec/gen_unicode_mod.escript468
1 files changed, 381 insertions, 87 deletions
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index ecc30b40c6..a6ac3b0e60 100644
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -23,55 +23,69 @@
-mode(compile).
--record(cp, {name, class, dec, comp, cs}).
+-record(cp, {name, class, dec, comp, cs, cat}).
-define(MOD, "unicode_util").
-main(_) ->
+main(Args) ->
%% Parse main table
- {ok, UD} = file:open("../uc_spec/UnicodeData.txt", [read, raw, {read_ahead, 1000000}]),
+ UD = file_open("../uc_spec/UnicodeData.txt"),
Data0 = foldl(fun parse_unicode_data/2, [], UD),
Data1 = array:from_orddict(lists:reverse(Data0)),
ok = file:close(UD),
%% Special Casing table
- {ok, SC} = file:open("../uc_spec/SpecialCasing.txt", [read, raw, {read_ahead, 1000000}]),
+ SC = file_open("../uc_spec/SpecialCasing.txt"),
Data2 = foldl(fun parse_special_casing/2, Data1, SC),
ok = file:close(SC),
%% Casing Folding table
- {ok, CF} = file:open("../uc_spec/CaseFolding.txt", [read, raw, {read_ahead, 1000000}]),
+ CF = file_open("../uc_spec/CaseFolding.txt"),
Data = foldl(fun parse_case_folding/2, Data2, CF),
ok = file:close(CF),
%% Normalization
- {ok, ExclF} = file:open("../uc_spec/CompositionExclusions.txt", [read, raw, {read_ahead, 1000000}]),
+ ExclF = file_open("../uc_spec/CompositionExclusions.txt"),
ExclData = foldl(fun parse_comp_excl/2, Data, ExclF),
ok = file:close(ExclF),
%% GraphemeBreakProperty table
- {ok, Emoji} = file:open("../uc_spec/emoji-data.txt", [read, raw, {read_ahead, 1000000}]),
+ Emoji = file_open("../uc_spec/emoji-data.txt"),
Props00 = foldl(fun parse_properties/2, [], Emoji),
%% Filter Extended_Pictographic class which we are interested in.
Props0 = [EP || {extended_pictographic, _} = EP <- Props00],
ok = file:close(Emoji),
- {ok, GBPF} = file:open("../uc_spec/GraphemeBreakProperty.txt", [read, raw, {read_ahead, 1000000}]),
+ GBPF = file_open("../uc_spec/GraphemeBreakProperty.txt"),
Props1 = foldl(fun parse_properties/2, Props0, GBPF),
ok = file:close(GBPF),
- {ok, PropF} = file:open("../uc_spec/PropList.txt", [read, raw, {read_ahead, 1000000}]),
+ PropF = file_open("../uc_spec/PropList.txt"),
Props2 = foldl(fun parse_properties/2, Props1, PropF),
ok = file:close(PropF),
Props = sofs:to_external(sofs:relation_to_family(sofs:relation(Props2))),
+ WidthF = file_open("../uc_spec/EastAsianWidth.txt"),
+ WideCs = foldl(fun parse_widths/2, [], WidthF),
+ ok = file:close(WidthF),
+
%% Make module
+ UpdateTests = case Args of
+ ["update_tests"] -> true;
+ _ -> false
+ end,
+
{ok, Out} = file:open(?MOD++".erl", [write]),
- gen_file(Out, Data, ExclData, maps:from_list(Props)),
+ gen_file(Out, Data, ExclData, maps:from_list(Props), WideCs, UpdateTests),
ok = file:close(Out),
ok.
+file_open(File) ->
+ {ok, Fd} = file:open(File, [read, raw, {read_ahead, 1000000}]),
+ Fd.
+
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
parse_unicode_data(Line0, Acc) ->
Line = string:chomp(Line0),
- [CodePoint,Name,_Cat,Class,_BiDi,Decomp,
+ [CodePoint,Name,Cat,Class,_BiDi,Decomp,
_N1,_N2,_N3,_BDMirror,_Uni1,_Iso|Case] = tokens(Line, ";"),
{Dec,Comp} = case to_decomp(Decomp) of
{_, _} = Compabil -> {[], Compabil};
@@ -79,7 +93,7 @@ parse_unicode_data(Line0, Acc) ->
end,
[{hex_to_int(CodePoint),
#cp{name=list_to_binary(Name),class=to_class(Class),
- dec=Dec, comp=Comp, cs=to_case(Case)}}
+ dec=Dec, comp=Comp, cs=to_case(Case), cat=Cat}}
|Acc].
to_class(String) ->
@@ -152,7 +166,62 @@ parse_properties(Line0, Acc) ->
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-gen_file(Fd, Data, ExclData, Props) ->
+%% Pick ranges that are wide when seen from a non East Asian context,
+%% That way we can minimize the data, every other code point is considered narrow.
+%% We loose information but hopefully keep the important width for a standard
+%% terminal.
+parse_widths(Line0, Acc) ->
+ [{WidthClass, {From, _To}=Range}] = parse_properties(Line0, []),
+ case is_default_width(From, WidthClass) of
+ {true, narrow} ->
+ Acc;
+ {false, narrow} ->
+ [Range|Acc];
+ {true, RuleRange} ->
+ [RuleRange|Acc]
+%%% {false, rule_execption} -> i.e. narrow codepoint in wide range
+%%% Should not happen in current specs
+ end.
+
+is_default_width(Index, WD) ->
+ if
+ 16#3400 =< Index, Index =< 16#4DBF ->
+ if WD =:= w orelse WD =:= f ->
+ {true, {16#3400, 16#4DBF}};
+ true ->
+ {false, rule_execption}
+ end;
+ 16#4E00 =< Index, Index =< 16#9FFF ->
+ if WD =:= w orelse WD =:= f ->
+ {true, {16#4E00, 16#9FFF}};
+ true ->
+ {false, rule_execption}
+ end;
+ 16#F900 =< Index, Index =< 16#FAFF ->
+ if WD =:= w orelse WD =:= f ->
+ {true, {16#F900, 16#FAFF}};
+ true ->
+ {false, rule_execption}
+ end;
+ 16#20000 =< Index, Index =< 16#2FFFD ->
+ if WD =:= w orelse WD =:= f ->
+ {true, {16#20000, 16#2FFFD}};
+ true ->
+ {false, rule_execption}
+ end;
+ 16#30000 =< Index, Index =< 16#3FFFD ->
+ if WD =:= w orelse WD =:= f ->
+ {true, {16#30000, 16#3FFFD}};
+ true ->
+ {false, rule_execption}
+ end;
+ true ->
+ {WD =:= n orelse WD =:= na orelse WD == h orelse WD =:= a, narrow}
+ end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+gen_file(Fd, Data, ExclData, Props, WideCs, UpdateTests) ->
gen_header(Fd),
gen_static(Fd),
gen_norm(Fd),
@@ -161,7 +230,8 @@ gen_file(Fd, Data, ExclData, Props) ->
gen_gc(Fd, Props),
gen_compose_pairs(Fd, ExclData, Data),
gen_case_table(Fd, Data),
- gen_unicode_table(Fd, Data),
+ gen_unicode_table(Fd, Data, UpdateTests),
+ gen_width_table(Fd, WideCs),
ok.
gen_header(Fd) ->
@@ -173,26 +243,32 @@ gen_header(Fd) ->
io:put_chars(Fd, "-export([whitespace/0, is_whitespace/1]).\n"),
io:put_chars(Fd, "-export([uppercase/1, lowercase/1, titlecase/1, casefold/1]).\n\n"),
io:put_chars(Fd, "-export([spec_version/0, lookup/1, get_case/1]).\n"),
+ io:put_chars(Fd, "-export([is_wide/1]).\n"),
io:put_chars(Fd, "-compile({inline, [class/1]}).\n"),
io:put_chars(Fd, "-compile(nowarn_unused_vars).\n"),
io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc/1, gc_prepend/2]}).\n"),
- io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n\n"),
+ io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n"),
+ io:put_chars(Fd, "-define(IS_CP(CP), (is_integer(CP) andalso 0 =< CP andalso CP < 16#110000)).\n\n\n"),
ok.
gen_static(Fd) ->
io:put_chars(Fd, "-spec lookup(char()) -> #{'canon':=[{byte(),char()}], 'ccc':=byte(), "
- "'compat':=[] | {atom(),[{byte(),char()}]}}.\n"),
- io:put_chars(Fd, "lookup(Codepoint) ->\n"
- " {CCC,Can,Comp} = unicode_table(Codepoint),\n"
- " #{ccc=>CCC, canon=>Can, compat=>Comp}.\n\n"),
+ "'compat':=[] | {atom(),[{byte(),char()}]}, 'category':={atom(),atom()}}.\n"),
+ io:put_chars(Fd, "lookup(Codepoint) when ?IS_CP(Codepoint) ->\n"
+ " {CCC,Can,Comp,Cat} = unicode_table(Codepoint),\n"
+ " #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint,Cat)}.\n\n"),
+
io:put_chars(Fd, "-spec get_case(char()) -> #{'fold':=gc(), 'lower':=gc(), 'title':=gc(), 'upper':=gc()}.\n"),
- io:put_chars(Fd, "get_case(Codepoint) ->\n"
+ io:put_chars(Fd, "get_case(Codepoint) when ?IS_CP(Codepoint) ->\n"
" case case_table(Codepoint) of\n"
" {U,L} -> #{upper=>U,lower=>L,title=>U,fold=>L};\n"
" {U,L,T,F} -> #{upper=>U,lower=>L,title=>T,fold=>F}\n"
" end.\n\n"),
- io:put_chars(Fd, "spec_version() -> {14,0}.\n\n\n"),
- io:put_chars(Fd, "class(Codepoint) -> {CCC,_,_} = unicode_table(Codepoint),\n CCC.\n\n"),
+
+ io:put_chars(Fd, "spec_version() -> {15,0}.\n\n\n"),
+ io:put_chars(Fd, "class(Codepoint) when ?IS_CP(Codepoint) -> \n"
+ " {CCC,_,_,_} = unicode_table(Codepoint),\n CCC.\n\n"),
+
io:put_chars(Fd, "-spec uppercase(unicode:chardata()) -> "
"maybe_improper_list(gc(),unicode:chardata()).\n"),
io:put_chars(Fd, "uppercase(Str0) ->\n"),
@@ -217,6 +293,7 @@ gen_static(Fd) ->
io:put_chars(Fd, " [] -> [];\n"),
io:put_chars(Fd, " {error,Err} -> error({badarg, Err})\n"),
io:put_chars(Fd, " end.\n\n"),
+
io:put_chars(Fd, "-spec titlecase(unicode:chardata()) -> "
"maybe_improper_list(gc(),unicode:chardata()).\n"),
io:put_chars(Fd, "titlecase(Str0) ->\n"),
@@ -229,6 +306,7 @@ gen_static(Fd) ->
io:put_chars(Fd, " [] -> [];\n"),
io:put_chars(Fd, " {error,Err} -> error({badarg, Err})\n"),
io:put_chars(Fd, " end.\n\n"),
+
io:put_chars(Fd, "-spec casefold(unicode:chardata()) -> "
"maybe_improper_list(gc(),unicode:chardata()).\n"),
io:put_chars(Fd, "casefold(Str0) ->\n"),
@@ -242,6 +320,19 @@ gen_static(Fd) ->
io:put_chars(Fd, " {error,Err} -> error({badarg, Err})\n"),
io:put_chars(Fd, " end.\n\n"),
+ io:put_chars(Fd, "%% Returns true if the character is considered wide in non east asian context.\n"),
+ io:put_chars(Fd, "-spec is_wide(gc()) -> boolean().\n"),
+ io:put_chars(Fd, "is_wide(C) when ?IS_CP(C) ->\n"),
+ io:put_chars(Fd, " is_wide_cp(C);\n"),
+ io:put_chars(Fd, "is_wide([_, 16#FE0E|Cs]) -> true; %% Presentation sequence\n"),
+ io:put_chars(Fd, "is_wide([_, 16#FE0F|Cs]) -> true; %% Presentation sequence\n"),
+ io:put_chars(Fd, "is_wide([C|Cs]) when ?IS_CP(C) ->\n"),
+ io:put_chars(Fd, " is_wide_cp(C) orelse is_wide(Cs);\n"),
+ io:put_chars(Fd, "is_wide([]) ->\n false.\n\n"),
+
+ io:put_chars(Fd, "category(CP, lookup_category) ->\n"
+ " cat_translate(lookup_category(CP));\n"
+ "category(_, Def) -> cat_translate(Def).\n\n"),
ok.
gen_norm(Fd) ->
@@ -249,7 +340,7 @@ gen_norm(Fd) ->
"-spec nfd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
"nfd(Str0) ->\n"
" case gc(Str0) of\n"
- " [GC|R] when GC < 128 -> [GC|R];\n"
+ " [GC|R] when is_integer(GC), 0 =< GC, GC < 128 -> [GC|R];\n"
" [GC|Str] -> [decompose(GC)|Str];\n"
" [] -> [];\n"
" {error,_}=Error -> Error\n end.\n\n"
@@ -259,7 +350,7 @@ gen_norm(Fd) ->
"-spec nfkd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
"nfkd(Str0) ->\n"
" case gc(Str0) of\n"
- " [GC|R] when GC < 128 -> [GC|R];\n"
+ " [GC|R] when is_integer(GC), 0 =< GC, GC < 128 -> [GC|R];\n"
" [GC|Str] -> [decompose_compat(GC)|Str];\n"
" [] -> [];\n"
" {error,_}=Error -> Error\n end.\n\n"
@@ -269,7 +360,7 @@ gen_norm(Fd) ->
"-spec nfc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
"nfc(Str0) ->\n"
" case gc(Str0) of\n"
- " [GC|R] when GC < 256 -> [GC|R];\n"
+ " [GC|R] when is_integer(GC), 0 =< GC, GC < 256 -> [GC|R];\n"
" [GC|Str] -> [compose(decompose(GC))|Str];\n"
" [] -> [];\n"
" {error,_}=Error -> Error\n end.\n\n"
@@ -279,7 +370,7 @@ gen_norm(Fd) ->
"-spec nfkc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
"nfkc(Str0) ->\n"
" case gc(Str0) of\n"
- " [GC|R] when GC < 128 -> [GC|R];\n"
+ " [GC|R] when is_integer(GC), 0 =< GC, GC < 128 -> [GC|R];\n"
" [GC|Str] -> [compose_compat_0(decompose_compat(GC))|Str];\n"
" [] -> [];\n"
" {error,_}=Error -> Error\n end.\n\n"
@@ -288,13 +379,13 @@ gen_norm(Fd) ->
io:put_chars(Fd,
"decompose(CP) when is_integer(CP), CP < 16#AC00, 16#D7A3 > CP ->\n"
" case unicode_table(CP) of\n"
- " {_,[],_} -> CP;\n"
- " {_,CPs,_} -> canonical_order(CPs)\n"
+ " {_,[],_,_} -> CP;\n"
+ " {_,CPs,_,_} -> canonical_order(CPs)\n"
" end;\n"
"decompose(CP) ->\n"
" canonical_order(decompose_1(CP)).\n"
"\n"
- "decompose_1(CP) when 16#AC00 =< CP, CP =< 16#D7A3 ->\n"
+ "decompose_1(CP) when is_integer(CP), 16#AC00 =< CP, CP =< 16#D7A3 ->\n"
" Syll = CP-16#AC00,\n"
" T = 28,\n"
" N = 588,\n"
@@ -306,8 +397,8 @@ gen_norm(Fd) ->
" end;\n"
"decompose_1(CP) when is_integer(CP) ->\n"
" case unicode_table(CP) of\n"
- " {CCC, [],_} -> [{CCC,CP}];\n"
- " {_, CPs, _} -> CPs\n"
+ " {CCC, [],_,_} -> [{CCC,CP}];\n"
+ " {_,CPs,_,_} -> CPs\n"
" end;\n"
"decompose_1([CP|CPs]) ->\n"
" decompose_1(CP) ++ decompose_1(CPs);\n"
@@ -331,14 +422,14 @@ gen_norm(Fd) ->
io:put_chars(Fd,
"decompose_compat(CP) when is_integer(CP), CP < 16#AC00, 16#D7A3 > CP ->\n"
" case unicode_table(CP) of\n"
- " {_, [], []} -> CP;\n"
- " {_, _, {_,CPs}} -> canonical_order(CPs);\n"
- " {_, CPs, _} -> canonical_order(CPs)\n"
+ " {_, [], [], _} -> CP;\n"
+ " {_, _, {_,CPs}, _} -> canonical_order(CPs);\n"
+ " {_, CPs, _, _} -> canonical_order(CPs)\n"
" end;\n"
"decompose_compat(CP) ->\n"
" canonical_order(decompose_compat_1(CP)).\n"
"\n"
- "decompose_compat_1(CP) when 16#AC00 =< CP, CP =< 16#D7A3 ->\n"
+ "decompose_compat_1(CP) when is_integer(CP), 16#AC00 =< CP, CP =< 16#D7A3 ->\n"
" Syll = CP-16#AC00,\n"
" T = 28,\n"
" N = 588,\n"
@@ -350,23 +441,24 @@ gen_norm(Fd) ->
" end;\n"
"decompose_compat_1(CP) when is_integer(CP) ->\n"
" case unicode_table(CP) of\n"
- " {CCC, [], []} -> [{CCC,CP}];\n"
- " {_, _, {_,CPs}} -> CPs;\n"
- " {_, CPs, _} -> CPs\n"
+ " {CCC, [], [], _} -> [{CCC,CP}];\n"
+ " {_, _, {_,CPs}, _} -> CPs;\n"
+ " {_, CPs, _, _} -> CPs\n"
" end;\n"
"decompose_compat_1([CP|CPs]) ->\n"
" decompose_compat_1(CP) ++ decompose_compat_1(CPs);\n"
- "decompose_compat_1([]) -> [].\n"),
+ "decompose_compat_1([]) -> [].\n\n"),
io:put_chars(Fd,
"compose(CP) when is_integer(CP) -> CP;\n"
"compose([Lead,Vowel|Trail]) %% Hangul\n"
- " when 16#1100 =< Lead, Lead =< 16#1112 ->\n"
+ " when is_integer(Lead), 16#1100 =< Lead, Lead =< 16#1112, is_integer(Vowel) ->\n"
" if 16#1161 =< Vowel, Vowel =< 16#1175 ->\n"
" CP = 16#AC00 + ((Lead - 16#1100) * 588) + ((Vowel - 16#1161) * 28),\n"
" case Trail of\n"
- " [T|Acc] when 16#11A7 =< T, T =< 16#11C2 -> nolist(CP+T-16#11A7,Acc);\n"
+ " [T|Acc] when is_integer(T), 16#11A7 =< T, T =< 16#11C2 ->"
+ " nolist(CP+T-16#11A7,Acc);\n"
" Acc -> nolist(CP,Acc)\n"
" end;\n"
" true ->\n"
@@ -408,11 +500,12 @@ gen_norm(Fd) ->
" end.\n\n"
"compose_compat(CP) when is_integer(CP) -> CP;\n"
"compose_compat([Lead,Vowel|Trail]) %% Hangul\n"
- " when 16#1100 =< Lead, Lead =< 16#1112 ->\n"
+ " when is_integer(Lead), 16#1100 =< Lead, Lead =< 16#1112, is_integer(Vowel) ->\n"
" if 16#1161 =< Vowel, Vowel =< 16#1175 ->\n"
" CP = 16#AC00 + ((Lead - 16#1100) * 588) + ((Vowel - 16#1161) * 28),\n"
" case Trail of\n"
- " [T|Acc] when 16#11A7 =< T, T =< 16#11C2 -> nolist(CP+T-16#11A7,Acc);\n"
+ " [T|Acc] when is_integer(T), 16#11A7 =< T, T =< 16#11C2 ->"
+ " nolist(CP+T-16#11A7,Acc);\n"
" Acc -> nolist(CP,Acc)\n"
" end;\n"
" true ->\n"
@@ -462,7 +555,7 @@ gen_ws(Fd, Props) ->
gen_cp(Fd) ->
io:put_chars(Fd, "-spec cp(String::unicode:chardata()) ->"
" maybe_improper_list() | {error, unicode:chardata()}.\n"),
- io:put_chars(Fd, "cp([C|_]=L) when is_integer(C) -> L;\n"),
+ io:put_chars(Fd, "cp([C|_]=L) when ?IS_CP(C) -> L;\n"),
io:put_chars(Fd, "cp([List]) -> cp(List);\n"),
io:put_chars(Fd, "cp([List|R]) -> cpl(List, R);\n"),
io:put_chars(Fd, "cp([]) -> [];\n"),
@@ -470,8 +563,8 @@ gen_cp(Fd) ->
io:put_chars(Fd, "cp(<<>>) -> [];\n"),
io:put_chars(Fd, "cp(<<R/binary>>) -> {error,R}.\n"),
io:put_chars(Fd, "\n"),
- io:put_chars(Fd, "cpl([C], R) when is_integer(C) -> [C|cpl_1_cont(R)];\n"),
- io:put_chars(Fd, "cpl([C|T], R) when is_integer(C) -> [C|cpl_cont(T, R)];\n"),
+ io:put_chars(Fd, "cpl([C], R) when ?IS_CP(C) -> [C|cpl_1_cont(R)];\n"),
+ io:put_chars(Fd, "cpl([C|T], R) when ?IS_CP(C) -> [C|cpl_cont(T, R)];\n"),
io:put_chars(Fd, "cpl([List], R) -> cpl(List, R);\n"),
io:put_chars(Fd, "cpl([List|T], R) -> cpl(List, [T|R]);\n"),
io:put_chars(Fd, "cpl([], R) -> cp(R);\n"),
@@ -542,18 +635,18 @@ gen_gc(Fd, GBP) ->
" maybe_improper_list() | {error, unicode:chardata()}.\n"),
io:put_chars(Fd,
"gc([]=R) -> R;\n"
- "gc([CP]=R) when is_integer(CP) -> R;\n"
+ "gc([CP]=R) when ?IS_CP(CP) -> R;\n"
"gc([$\\r=CP|R0]) ->\n"
" case cp(R0) of % Don't break CRLF\n"
" [$\\n|R1] -> [[$\\r,$\\n]|R1];\n"
" T -> [CP|T]\n"
" end;\n"
- "gc([CP1|T1]=T) when CP1 < 256 ->\n"
+ "gc([CP1|T1]=T) when ?IS_CP(CP1), CP1 < 256 ->\n"
" case T1 of\n"
- " [CP2|_] when CP2 < 256 -> T; %% Ascii Fast path\n"
+ " [CP2|_] when is_integer(CP2), 0 =< CP2, CP2 < 256 -> T; %% Ascii Fast path\n"
" _ -> %% Keep the tail binary.\n"
" case cp_no_bin(T1) of\n"
- " [CP2|_]=T3 when CP2 < 256 -> [CP1|T3]; %% Asciii Fast path\n"
+ " [CP2|_]=T3 when is_integer(CP2), 0 =< CP2, CP2 < 256 -> [CP1|T3]; %% Asciii Fast path\n"
" binary_found -> gc_1(T);\n"
" T4 -> gc_1([CP1|T4])\n"
" end\n"
@@ -568,7 +661,7 @@ gen_gc(Fd, GBP) ->
" end;\n"
" true -> gc_1([CP1|Rest])\n"
" end;\n"
- "gc([CP|_]=T) when is_integer(CP) -> gc_1(T);\n"
+ "gc([CP|_]=T) when ?IS_CP(CP) -> gc_1(T);\n"
"gc(Str) ->\n"
" case cp(Str) of\n"
" {error,_}=Error -> Error;\n"
@@ -596,13 +689,14 @@ gen_gc(Fd, GBP) ->
io:put_chars(Fd, "\n%% Optimize Latin-1\n"),
[GenExtP(CP) || CP <- merge_ranges(ExtendedPictographicLow)],
- io:format(Fd,
- "gc_1([CP|R]=R0) when CP < 256 ->\n"
- " case R of\n"
- " [CP2|_] when CP2 < 256 -> R0;\n"
- " _ -> gc_extend(cp(R), R, CP)\n"
- " end;\n",
- []),
+ io:put_chars(Fd,
+ "gc_1([CP|R]=R0) when is_integer(CP), 0 =< CP, CP < 256 ->\n"
+ " case R of\n"
+ " [CP2|_] when is_integer(CP2), 0 =< CP2, CP2 < 256 -> R0;\n"
+ " _ -> gc_extend(cp(R), R, CP)\n"
+ " end;\n"
+ "gc_1([CP|_]) when not ?IS_CP(CP) ->\n"
+ " error({badarg,CP});\n"),
io:put_chars(Fd, "\n%% Continue control\n"),
[GenControl(CP) || CP <- Crs],
%% One clause per CP
@@ -623,7 +717,7 @@ gen_gc(Fd, GBP) ->
GenHangulT = fun(Range) -> io:format(Fd, "gc_1~s gc_h_T(R1,[CP]);\n", [gen_clause(Range)]) end,
[GenHangulT(CP) || CP <- merge_ranges(maps:get(t,GBP))],
io:put_chars(Fd, "%% Handle Hangul LV and LVT special, since they are large\n"),
- io:put_chars(Fd, "gc_1([CP|_]=R0) when 44000 < CP, CP < 56000 -> gc_h_lv_lvt(R0, R0, []);\n"),
+ io:put_chars(Fd, "gc_1([CP|_]=R0) when is_integer(CP), 44000 < CP, CP < 56000 -> gc_h_lv_lvt(R0, R0, []);\n"),
io:put_chars(Fd, "\n%% Handle Regional\n"),
GenRegional = fun(Range) -> io:format(Fd, "gc_1~s gc_regional(R1,CP);\n", [gen_clause(Range)]) end,
@@ -739,7 +833,9 @@ gen_gc(Fd, GBP) ->
[{RLess,RLarge}] = merge_ranges(maps:get(regional_indicator,GBP)),
io:put_chars(Fd,"gc_regional(R0, CP0) ->\n"
" case cp(R0) of\n"),
- io:format(Fd, " [CP|R1] when ~w =< CP,CP =< ~w-> gc_extend2(cp(R1),R1,[CP,CP0]);~n",[RLess, RLarge]),
+ io:format(Fd, " [CP|R1] when is_integer(CP), ~w =< CP, CP =< ~w ->\n"
+ " gc_extend2(cp(R1),R1,[CP,CP0]);~n",
+ [RLess, RLarge]),
io:put_chars(Fd," R1 -> gc_extend(R1, R0, CP0)\n"
" end.\n\n"),
@@ -780,6 +876,7 @@ gen_gc(Fd, GBP) ->
" _ -> gc_extend2(R1, R0, Acc)\n"
" end\n end.\n\n"),
io:put_chars(Fd, "%% Handle Hangul LV\n"),
+ io:put_chars(Fd, "gc_h_lv_lvt([CP|_], _R0, _Acc) when not ?IS_CP(CP) -> error(badarg);\n"),
GenHangulLV = fun(Range) -> io:format(Fd, "gc_h_lv_lvt~s gc_h_V(R1,[CP|Acc]);\n",
[gen_clause2(Range)]) end,
[GenHangulLV(CP) || CP <- merge_ranges(maps:get(lv,GBP))],
@@ -806,50 +903,241 @@ gen_compose_pairs(Fd, ExclData, Data) ->
[io:format(Fd, "compose_pair(~w,~w) -> ~w;~n", [A,B,CP]) || {[A,B],CP} <- lists:sort(DeCmp2)],
io:put_chars(Fd, "compose_pair(_,_) -> false.\n\n"),
- io:put_chars(Fd, "nolist(CP, []) -> CP;\nnolist(CP,L) -> [CP|L].\n\n"),
+ io:put_chars(Fd, "nolist(CP, []) when ?IS_CP(CP) -> CP;\n"
+ "nolist(CP, L) when ?IS_CP(CP) -> [CP|L].\n\n"),
ok.
gen_case_table(Fd, Data) ->
- Case = array:foldr(fun(CP, #cp{cs={U0,L0,T0,F0}}, Acc) ->
- U = def_cp(U0,CP),
- L = def_cp(L0,CP),
- T = def_cp(T0,CP),
- F = def_cp(F0,CP),
- case T =:= U andalso F =:= L of
- true ->
- [{CP,{U,L}}|Acc];
- false ->
- [{CP,{U,L,T,F}}|Acc]
- end;
- (_CP, _, Acc) -> Acc
- end, [], Data),
+ HC = fun(CP, #cp{cs=Cs}, Acc) ->
+ case case_data(CP, Cs) of
+ default -> Acc;
+ CaseData -> [{CP,CaseData}|Acc]
+ end
+ end,
+ Case = array:sparse_foldr(HC, [], Data),
[io:format(Fd, "case_table(~w) -> ~w;\n", [CP, Map])|| {CP,Map} <- Case],
io:format(Fd, "case_table(CP) -> {CP, CP}.\n\n",[]),
ok.
+case_data(CP, {U0,L0,T0,F0}) ->
+ U = def_cp(U0,CP),
+ L = def_cp(L0,CP),
+ T = def_cp(T0,CP),
+ F = def_cp(F0,CP),
+ case T =:= U andalso F =:= L of
+ true -> {U,L};
+ false -> {U,L,T,F}
+ end;
+case_data(_, _) ->
+ default.
+
def_cp([], CP) -> CP;
def_cp(CP, _) -> CP.
-gen_unicode_table(Fd, Data) ->
- FixCanon = fun(_, #cp{class=CCC, dec=Dec, comp=Comp}) ->
+gen_unicode_table(Fd, Data, UpdateTests) ->
+ FixCanon = fun(_, #cp{class=CCC, dec=Dec, comp=Comp, cat=Cat}) ->
Canon = decompose(Dec,Data),
- #{ccc=>CCC, canonical=>Canon, compat=>Comp}
+ #{ccc=>CCC, canonical=>Canon, compat=>Comp, cat=>Cat}
end,
AofMaps0 = array:sparse_map(FixCanon, Data),
- FixCompat = fun(_, #{ccc:=CCC, canonical:=Canon, compat:=Comp}) ->
+ FixCompat = fun(_, #{ccc:=CCC, canonical:=Canon, compat:=Comp, cat:=Cat}) ->
Compat = decompose_compat(Canon, Comp, AofMaps0),
- {CCC, Canon, Compat}
+ {CCC, Canon, Compat, category(Cat)}
end,
AofMaps1 = array:sparse_map(FixCompat, AofMaps0),
Dict0 = array:sparse_to_orddict(AofMaps1),
- Def = {0, [], []},
- Dict = lists:filter(fun({_, Map}) -> Map =/= Def end, Dict0),
+ Def = {0, [], [], lookup_category},
+ {NonDef, CatTable} = lists:partition(fun({_, {0,[],[],_Cat}}) -> false;
+ (_) -> true
+ end, Dict0),
+
+ %% Export testfile
+ case UpdateTests of
+ true ->
+ Dict1 = lists:map(fun({Id,{CCC, Canon, Compat, Cat}}) ->
+ {_, ECat} = lists:keyfind(Cat, 1, category_translate()),
+ {Id, {CCC, Canon, Compat, ECat}}
+ end, Dict0),
+ TestFile = "../test/unicode_util_SUITE_data/unicode_table.bin",
+ io:format("Updating: ~s~n", [TestFile]),
+ file:write_file(TestFile, term_to_binary(Dict1, [compressed]));
+ false ->
+ ignore
+ end,
- [io:format(Fd, "unicode_table(~w) -> ~w;~n", [CP, Map]) || {CP,Map} <- Dict],
+ [io:format(Fd, "unicode_table(~w) -> ~w;~n", [CP, Map]) || {CP,Map} <- NonDef],
io:format(Fd, "unicode_table(_) -> ~w.~n~n",[Def]),
+
+ [io:format(Fd, "cat_translate(~w) -> ~w;~n", [Cat, EC]) || {Cat,EC} <- category_translate()],
+ io:format(Fd, "cat_translate(Cat) -> error({internal_error, Cat}).~n~n",[]),
+ gen_category(Fd, CatTable, Data),
+ ok.
+
+category([C,Sub]) ->
+ list_to_atom([C-$A+$a, Sub]).
+
+category_translate() ->
+ [{lu, {letter, uppercase}}, % Letter, Uppercase
+ {ll, {letter, lowercase}}, % Letter, Lowercase
+ {lt, {letter, titlecase}}, % Letter, Titlecase
+ {mn, {mark, non_spacing}}, % Mark, Non-Spacing
+ {mc, {mark, spacing_combining}}, % Mark, Spacing Combining
+ {me, {mark, enclosing}}, % Mark, Enclosing
+ {nd, {number, decimal}}, % Number, Decimal Digit
+ {nl, {number, letter}}, % Number, Letter
+ {no, {number, other}}, % Number, Other
+ {zs, {separator, space}}, % Separator, Space
+ {zl, {separator, line}}, % Separator, Line
+ {zp, {separator, paragraph}}, % Separator, Paragraph
+ {cc, {other, control}}, % Other, Control
+ {cf, {other, format}}, % Other, Format
+ {cs, {other, surrogate}}, % Other, Surrogate
+ {co, {other, private}}, % Other, Private Use
+ {cn, {other, not_assigned}}, % Other, Not Assigned (no characters in the file have this property)
+ {lm, {letter, modifier}}, % Letter, Modifier
+ {lo, {letter, other}}, % Letter, Other
+ {pc, {punctuation, connector}}, % Punctuation, Connector
+ {pd, {punctuation, dash}}, % Punctuation, Dash
+ {ps, {punctuation, open}}, % Punctuation, Open
+ {pe, {punctuation, close}}, % Punctuation, Close
+ {pi, {punctuation, initial}}, % Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
+ {pf, {punctuation, final}}, % Punctuation, Final quote (may behave like Ps or Pe depending on usage)
+ {po, {punctuation, other}}, % Punctuation, Other
+ {sm, {symbol, math}}, % Symbol, Math
+ {sc, {symbol, currency}}, % Symbol, Currency
+ {sk, {symbol, modifier}}, % Symbol, Modifier
+ {so, {symbol, other}}]. % Symbol, Other
+
+gen_category(Fd, [{CP, {_, _, _, Cat}}|Rest], All) ->
+ gen_category(Fd, Rest, Cat, CP, CP, All, []).
+
+gen_category(Fd, [{CP, {_, _, _, NextCat}}|Rest], Cat, Start, End, All, Acc)
+ when End+1 =:= CP ->
+ IsLetterCat = letter_cat(NextCat, Cat),
+ if NextCat =:= Cat ->
+ gen_category(Fd, Rest, Cat, Start, CP, All, Acc);
+ IsLetterCat ->
+ gen_category(Fd, Rest, letter, Start, CP, All, Acc);
+ Start =:= End ->
+ io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]),
+ gen_category(Fd, Rest, NextCat, CP, CP, All, Acc);
+ true ->
+ case Cat of
+ letter ->
+ io:format(Fd, "lookup_category(CP) when ~w =< CP, CP =< ~w-> subcat_letter(CP);~n",
+ [Start, End]),
+ gen_category(Fd, Rest, NextCat, CP, CP, All,
+ lists:reverse(lists:seq(Start, End)) ++ Acc);
+ _ ->
+ io:format(Fd, "lookup_category(CP) when ~w =< CP, CP =< ~w-> ~w;~n", [Start, End, Cat]),
+ gen_category(Fd, Rest, NextCat, CP, CP, All, Acc)
+ end
+ end;
+gen_category(Fd, [{CP, {_, _, _, NewCat}}|Rest]=Cont, Cat, Start, End, All, Acc) ->
+ case array:get(End+1, All) of
+ undefined ->
+ if Start =:= End ->
+ io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]),
+ gen_category(Fd, Rest, NewCat, CP, CP, All, Acc);
+ true ->
+ case Cat of
+ letter ->
+ io:format(Fd, "lookup_category(CP) when ~w =< CP, CP =< ~w-> subcat_letter(CP);~n",
+ [Start, End]),
+ gen_category(Fd, Rest, NewCat, CP, CP, All,
+ lists:reverse(lists:seq(Start, End)) ++ Acc);
+ _ ->
+ io:format(Fd, "lookup_category(CP) when ~w =< CP, CP =< ~w -> ~w;~n",
+ [Start, End, Cat]),
+ gen_category(Fd, Rest, NewCat, CP, CP, All, Acc)
+ end
+ end;
+ _ -> %% We can make ranges larger by setting already assigned category
+ gen_category(Fd, Cont, Cat, Start, End+1, All, Acc)
+ end;
+gen_category(Fd, [], Cat, Start, End, All, Acc) ->
+ case Start =:= End of
+ true ->
+ io:format(Fd, "lookup_category(~w) -> ~w;~n", [Start, Cat]);
+ false ->
+ io:format(Fd, "lookup_category(CP) when ~w =< CP, CP =< ~w -> ~w;~n", [Start, End, Cat])
+ end,
+ io:put_chars(Fd, "lookup_category(Cp) -> cn.\n\n"),
+ gen_letter(Fd, lists:reverse(Acc), All),
ok.
+letter_cat(lm, _) ->
+ false;
+letter_cat(_, lm) ->
+ false;
+letter_cat(L1, L2) ->
+ is_letter(L1) andalso (L2 =:= letter orelse is_letter(L2)).
+
+is_letter(LC) ->
+ lists:member(LC, [lu,ll,lt,lo,lm]).
+
+gen_letter(Fd, Letters, All) ->
+ gen_letter(Fd, Letters, All, []).
+gen_letter(Fd, [CP|Rest], All, Acc) ->
+ case array:get(CP, All) of
+ undefined ->
+ gen_letter(Fd, Rest, All, Acc);
+ #cp{cat=Cat0, cs=Cs} ->
+ case {category(Cat0), case_table(CP,case_data(CP, Cs))} of
+ {Sub,Sub} ->
+ gen_letter(Fd, Rest, All, Acc);
+ {lm,_} ->
+ gen_letter(Fd, Rest, All, Acc);
+ {Cat, _Dbg} ->
+ case is_letter(Cat) of
+ true ->
+ gen_letter(Fd, Rest, All, [{CP, Cat}|Acc]);
+ false ->
+ gen_letter(Fd, Rest, All, Acc)
+ end
+ end
+ end;
+gen_letter(Fd, [], _, Acc) ->
+ [{Start, Cat}|SCletters] = lists:reverse(Acc),
+ subcat_letter(Fd, SCletters, Start, Start, Cat),
+ io:put_chars(Fd,
+ "subcat_letter(CP) ->\n"
+ " case case_table(CP) of\n"
+ " {CP, CP} -> lo; %{letter,other};\n"
+ " {CP, _} -> lu; %{letter,uppercase};\n"
+ " {_, CP} -> ll; %{letter,lowercase};\n"
+ " {_, _, CP, _} -> lt; %{letter,titlecase};\n"
+ " {CP, _, _, _} -> lu; %{letter,uppercase};\n"
+ " {_,CP,_,_} -> ll %{letter,lowercase}\n"
+ " end.\n\n").
+
+subcat_letter(Fd, [{CP, Cat}|R], Start, End, Cat) when End+1 =:= CP ->
+ subcat_letter(Fd, R, Start, CP, Cat);
+subcat_letter(Fd, Rest, Start, Start, Cat) ->
+ io:format(Fd, "subcat_letter(~w) -> ~w;\n",[Start,Cat]),
+ case Rest of
+ [] -> ok;
+ [{CP, NewCat}|R] -> subcat_letter(Fd, R, CP, CP, NewCat)
+ end;
+subcat_letter(Fd, Rest, Start, End, Cat) ->
+ io:format(Fd, "subcat_letter(CP) when ~w =< CP, CP =< ~w -> ~w;\n",[Start,End,Cat]),
+ case Rest of
+ [] -> ok;
+ [{CP, NewCat}|R] -> subcat_letter(Fd, R, CP, CP, NewCat)
+ end.
+
+case_table(CP, CaseData) ->
+ case CaseData of
+ {CP, CP} -> lo;
+ {CP, _} -> lu;
+ {_, CP} -> ll;
+ {_, _, CP, _} -> lt;
+ {CP, _, _, _} -> lu;
+ {_,CP,_,_} -> ll;
+ default -> lo
+ end.
+
decompose([], _Data) -> [];
decompose([CP|CPs], Data) when is_integer(CP) ->
case array:get(CP,Data) of
@@ -883,35 +1171,41 @@ decompose_compat([{_,CP}|CPs], Data) ->
decompose_compat([CP|CPs], Data).
+gen_width_table(Fd, WideChars) ->
+ MergedWCs = merge_ranges(WideChars),
+ Write = fun(Range) -> io:format(Fd, "is_wide_cp~s true;~n", [gen_single_clause(Range)]) end,
+ [Write(Range) || Range <- MergedWCs],
+ io:format(Fd, "is_wide_cp(_) -> false.~n", []).
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
gen_clause({R0, undefined}) ->
io_lib:format("([~w=CP|R1]=R0) ->", [R0]);
gen_clause({R0, R1}) ->
- io_lib:format("([CP|R1]=R0) when ~w =< CP, CP =< ~w ->", [R0,R1]).
+ io_lib:format("([CP|R1]=R0) when is_integer(CP), ~w =< CP, CP =< ~w ->", [R0,R1]).
gen_clause2({R0, undefined}) ->
io_lib:format("([~w=CP|R1], R0, Acc) ->", [R0]);
gen_clause2({R0, R1}) ->
- io_lib:format("([CP|R1], R0, Acc) when ~w =< CP, CP =< ~w ->", [R0,R1]).
+ io_lib:format("([CP|R1], R0, Acc) when is_integer(CP), ~w =< CP, CP =< ~w ->", [R0,R1]).
gen_case_clause({R0, undefined}) ->
io_lib:format("[~w=CP|R1] ->", [R0]);
gen_case_clause({R0, R1}) ->
- io_lib:format("[CP|R1] when ~w =< CP, CP =< ~w ->", [R0,R1]).
+ io_lib:format("[CP|R1] when is_integer(CP), ~w =< CP, CP =< ~w ->", [R0,R1]).
gen_single_clause({R0, undefined}) ->
io_lib:format("(~w) ->", [R0]);
gen_single_clause({R0, R1}) ->
- io_lib:format("(CP) when ~w =< CP, CP =< ~w ->", [R0,R1]).
+ io_lib:format("(CP) when is_integer(CP), ~w =< CP, CP =< ~w ->", [R0,R1]).
merge_ranges(List) ->
merge_ranges(List, true).
merge_ranges(List, Opt) ->
- Res0 = merge_ranges_1(lists:sort(List), []),
+ Res0 = merge_ranges_1(lists:usort(List), []),
case Opt of
split ->
split_ranges(Res0,[]); % One clause per CP