diff options
author | José Valim <jose.valim@plataformatec.com.br> | 2017-05-26 18:14:01 +0200 |
---|---|---|
committer | José Valim <jose.valim@plataformatec.com.br> | 2017-05-26 18:14:01 +0200 |
commit | 646821a78f9121b6f554e63f65e3eaa5a517c5fe (patch) | |
tree | 321ea2e4f0195ddf143f50e01ac9c7d038236bf4 | |
parent | 6eb05b9dd0ce32c122410c28e30809b7196d47a0 (diff) | |
download | elixir-646821a78f9121b6f554e63f65e3eaa5a517c5fe.tar.gz |
Add unicode tokenizer for atoms
-rw-r--r-- | Makefile | 1 | ||||
-rw-r--r-- | lib/elixir/unicode/tokenizer.ex | 171 | ||||
-rw-r--r-- | lib/elixir/unicode/unicode.ex | 8 |
3 files changed, 177 insertions, 3 deletions
@@ -91,6 +91,7 @@ $(UNICODE): lib/elixir/unicode/*
 	@ echo "==> unicode (compile)";
 	$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
 	$(Q) $(ELIXIRC) lib/elixir/unicode/properties.ex -o lib/elixir/ebin;
+	$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;
 
 $(eval $(call APP_TEMPLATE,ex_unit,ExUnit))
 $(eval $(call APP_TEMPLATE,logger,Logger))
diff --git a/lib/elixir/unicode/tokenizer.ex b/lib/elixir/unicode/tokenizer.ex
new file mode 100644
index 000000000..adf688b6f
--- /dev/null
+++ b/lib/elixir/unicode/tokenizer.ex
@@ -0,0 +1,219 @@
+defmodule String.Tokenizer do
+  @moduledoc false
+  @on_load :check_otp_release
+
+  # Tokenizes atoms using Unicode-aware identifier rules. The codepoint
+  # classes are read from UnicodeData.txt and PropList.txt at compile time
+  # and turned into the private predicates defined below.
+
+  data_path = Path.join(__DIR__, "UnicodeData.txt")
+
+  # Sorts the codepoints into three lists by general category: upper and
+  # titlecase letters (Lu Lt), other identifier starts (Ll Lm Lo Nl) and
+  # identifier continuations (Mn Mc Nd Pc).
+  #
+  # UnicodeData.txt describes large blocks with a pair of lines whose names
+  # end in ", First>" and ", Last>". Every codepoint in between shares the
+  # category, so the pair is expanded into the full range. The fourth
+  # accumulator element holds the pending "First" codepoint, if any.
+  {letter_uptitlecase, start, continue, _pending_first} =
+    Enum.reduce File.stream!(data_path), {[], [], [], nil}, fn
+      line, {letter_uptitlecase, start, continue, pending_first} ->
+        [codepoint, line] = :binary.split(line, ";")
+        [name, line] = :binary.split(line, ";")
+        [category, _] = :binary.split(line, ";")
+        codepoint = String.to_integer(codepoint, 16)
+
+        {codepoints, pending_first} =
+          cond do
+            :binary.match(name, ", First>") != :nomatch ->
+              {[], codepoint}
+            is_integer(pending_first) and :binary.match(name, ", Last>") != :nomatch ->
+              # Descending, because rangify below merges runs that count down.
+              {Enum.to_list(codepoint..pending_first), nil}
+            true ->
+              {[codepoint], nil}
+          end
+
+        cond do
+          category in ~w(Lu Lt) ->
+            {codepoints ++ letter_uptitlecase, start, continue, pending_first}
+          category in ~w(Ll Lm Lo Nl) ->
+            {letter_uptitlecase, codepoints ++ start, continue, pending_first}
+          category in ~w(Mn Mc Nd Pc) ->
+            {letter_uptitlecase, start, codepoints ++ continue, pending_first}
+          true ->
+            {letter_uptitlecase, start, continue, pending_first}
+        end
+    end
+
+  prop_path = Path.join(__DIR__, "PropList.txt")
+
+  # Adds the Other_ID_Start and Other_ID_Continue codepoints from
+  # PropList.txt to the lists above and collects the Pattern_White_Space
+  # and Pattern_Syntax codepoints into patterns.
+  {start, continue, patterns} =
+    Enum.reduce File.stream!(prop_path), {start, continue, []}, fn line, acc ->
+      [codepoints | category] = :binary.split(line, ";")
+
+      # Index of the accumulator element this line feeds, or -1 to skip it.
+      pos =
+        case category do
+          [" Other_ID_Start" <> _] -> 0
+          [" Other_ID_Continue" <> _] -> 1
+          [" Pattern_White_Space" <> _] -> 2
+          [" Pattern_Syntax" <> _] -> 2
+          _ -> -1
+        end
+
+      if pos >= 0 do
+        # Only four hex digits are read per codepoint.
+        entries =
+          case :binary.split(codepoints, "..") do
+            [<<codepoint::4-binary, _::binary>>] ->
+              [String.to_integer(codepoint, 16)]
+            [first, <<last::4-binary, _::binary>>] ->
+              Enum.to_list(String.to_integer(last, 16)..String.to_integer(first, 16))
+          end
+        put_elem(acc, pos, entries ++ elem(acc, pos))
+      else
+        acc
+      end
+    end
+
+  id_upper = letter_uptitlecase -- patterns
+  id_start = start -- patterns
+  id_continue = continue -- patterns
+
+  {ascii_upper, unicode_upper} = Enum.split_with(id_upper, & &1 <= 127)
+  {ascii_start, unicode_start} = Enum.split_with(id_start, & &1 <= 127)
+
+  # Collapses a list of codepoints into {low, high} tuples, merging every
+  # run in which each number is exactly one less than the one before it.
+  rangify = fn [head | tail] ->
+    {first, last, acc} =
+      Enum.reduce(tail, {head, head, []}, fn
+        number, {first, last, acc} when number == first - 1 ->
+          {number, last, acc}
+        number, {first, last, acc} ->
+          {number, number, [{first, last} | acc]}
+      end)
+    [{first, last} | acc]
+  end
+
+  # Each group below generates one private predicate with a clause per
+  # codepoint or range, followed by a catch-all clause that returns false.
+  for {first, last} <- rangify.(ascii_upper) do
+    if first == last do
+      defp ascii_upper?(unquote(first)), do: true
+    else
+      defp ascii_upper?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp ascii_upper?(_), do: false
+
+  for {first, last} <- rangify.(unicode_upper) do
+    if first == last do
+      defp unicode_upper?(unquote(first)), do: true
+    else
+      defp unicode_upper?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp unicode_upper?(_), do: false
+
+  for {first, last} <- [{?_, ?_} | rangify.(ascii_start)] do
+    if first == last do
+      defp ascii_start?(unquote(first)), do: true
+    else
+      defp ascii_start?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp ascii_start?(_), do: false
+
+  for {first, last} <- rangify.(unicode_start) do
+    if first == last do
+      defp unicode_start?(unquote(first)), do: true
+    else
+      defp unicode_start?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp unicode_start?(_), do: false
+
+  for {first, last} <- rangify.(id_continue) do
+    if first == last do
+      defp continue?(unquote(first)), do: true
+    else
+      defp continue?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp continue?(_), do: false
+
+  # ascii_pattern?/1 is a performance shortcut: continue_atom/2 checks it
+  # before the Unicode predicates, so an ASCII whitespace or syntax character
+  # right after an atom or variable ends the scan without the larger lookups.
+  for {first, last} <- rangify.(patterns), last <= 127 do
+    if first == last do
+      defp ascii_pattern?(unquote(first)), do: true
+    else
+      defp ascii_pattern?(entry) when entry in unquote(first)..unquote(last), do: true
+    end
+  end
+
+  defp ascii_pattern?(_), do: false
+
+  # Reads an atom from the start of a list of codepoints.
+  #
+  # Returns {:ok, atom_codepoints, rest} on success, {:error, reason} when
+  # NFC normalization would change the atom, and {[], list} when the list is
+  # empty or does not begin with an identifier start.
+  def tokenize_atom([head | tail] = list) do
+    case ascii_start?(head) or ascii_upper?(head) or unicode_start?(head) or unicode_upper?(head) do
+      true -> validate_token(continue_atom(tail, [head]))
+      false -> {[], list}
+    end
+  end
+  def tokenize_atom([]) do
+    {[], []}
+  end
+
+  # Accumulates, in reverse order, the characters that belong to the atom.
+  # "@" is accepted anywhere after the first character.
+  defp continue_atom([?@ | tail], acc) do
+    continue_atom(tail, [?@ | acc])
+  end
+  defp continue_atom([head | tail] = list, acc) do
+    if ascii_start?(head) or ascii_upper?(head) or
+       (not ascii_pattern?(head) and (unicode_start?(head) or unicode_upper?(head) or continue?(head))) do
+      continue_atom(tail, [head | acc])
+    else
+      {acc, list}
+    end
+  end
+  defp continue_atom([], acc) do
+    {acc, []}
+  end
+
+  # Restores the original order and accepts the token only if NFC
+  # normalization leaves it unchanged.
+  defp validate_token({acc, list}) do
+    acc = :lists.reverse(acc)
+    case :unicode.characters_to_nfc_list(acc) do
+      ^acc -> {:ok, acc, list}
+      # NOTE(review): "oops" looks like a placeholder reason; confirm with callers.
+      _ -> {:error, "oops"}
+    end
+  end
+
+  # on_load callback: returns :ok only when running on OTP 20 or later.
+  defp check_otp_release do
+    case List.to_integer(:erlang.system_info(:otp_release)) >= 20 do
+      true -> :ok
+      false -> :error
+    end
+  end
+end
diff --git a/lib/elixir/unicode/unicode.ex b/lib/elixir/unicode/unicode.ex
index d3516c809..3b854458f 100644
--- a/lib/elixir/unicode/unicode.ex
+++ b/lib/elixir/unicode/unicode.ex
@@ -2,10 +2,12 @@
 #
 # 1. Update CompositionExclusions.txt by copying original as is
 # 2. Update GraphemeBreakProperty.txt by copying original as is
-# 3. Update SpecialCasing.txt by removing comments and conditional mappings from original
-# 4. Update WhiteSpace.txt by copying the proper excerpt from PropList.txt
-# 5. Update GraphemeBreakTest.txt and run graphemes_test.exs
+# 3. Update PropList.txt by copying original as is
+# 4. Update GraphemeBreakTest.txt by copying original as is
+# 5. Update SpecialCasing.txt by removing comments and conditional mappings from original
 # 6. Update String.Unicode.version/0 and on String module docs
+# 7. make unicode
+# 8. elixir lib/elixir/unicode/graphemes_test.exs
 #
 defmodule String.Unicode do
   @moduledoc false