summary | refs | log | tree | commit | diff
path: root/lib/elixir/unicode/unicode.ex
diff options
context:
space:
mode:
author: José Valim <jose.valim@plataformatec.com.br> 2017-05-25 21:03:10 +0200
committer: José Valim <jose.valim@plataformatec.com.br> 2017-05-26 16:50:57 +0200
commit: 6eb05b9dd0ce32c122410c28e30809b7196d47a0 (patch)
tree: 76a33c9c5c094be5b7f10059120d7d3a4de25328 /lib/elixir/unicode/unicode.ex
parent: 22145e286ce62305b405f4ddaef4196912c088be (diff)
download: elixir-6eb05b9dd0ce32c122410c28e30809b7196d47a0.tar.gz
Move properties to a separate unicode file
Diffstat (limited to 'lib/elixir/unicode/unicode.ex')
-rw-r--r-- lib/elixir/unicode/unicode.ex 345
1 files changed, 0 insertions, 345 deletions
diff --git a/lib/elixir/unicode/unicode.ex b/lib/elixir/unicode/unicode.ex
index 6d6142bdf..d3516c809 100644
--- a/lib/elixir/unicode/unicode.ex
+++ b/lib/elixir/unicode/unicode.ex
@@ -316,348 +316,3 @@ defmodule String.Unicode do
[]
end
end
-
-to_binary = fn
- "" ->
- nil
- codepoints ->
- codepoints
- |> :binary.split(" ", [:global])
- |> Enum.map(&<<String.to_integer(&1, 16)::utf8>>)
- |> IO.iodata_to_binary
-end
-
-data_path = Path.join(__DIR__, "UnicodeData.txt")
-
-{codes, non_breakable, decompositions, combining_classes} =
- Enum.reduce File.stream!(data_path), {[], [], %{}, %{}}, fn line, {cacc, wacc, dacc, kacc} ->
- [codepoint, _name, _category,
- class, _bidi, decomposition,
- _numeric_1, _numeric_2, _numeric_3,
- _bidi_mirror, _unicode_1, _iso,
- upper, lower, title] = :binary.split(line, ";", [:global])
-
- title = :binary.part(title, 0, byte_size(title) - 1)
-
- cacc =
- if upper != "" or lower != "" or title != "" do
- [{to_binary.(codepoint), to_binary.(upper), to_binary.(lower), to_binary.(title)} | cacc]
- else
- cacc
- end
-
- wacc =
- case decomposition do
- "<noBreak>" <> _ -> [to_binary.(codepoint) | wacc]
- _ -> wacc
- end
-
- dacc =
- case decomposition do
- <<h, _::binary>> when h != ?< -> # Decomposition
- decomposition =
- decomposition
- |> :binary.split(" ", [:global])
- |> Enum.map(&String.to_integer(&1, 16))
- Map.put(dacc, String.to_integer(codepoint, 16), decomposition)
- _ ->
- dacc
- end
-
- kacc =
- case Integer.parse(class) do
- {0, ""} -> kacc
- {n, ""} -> Map.put(kacc, String.to_integer(codepoint, 16), n)
- end
-
- {cacc, wacc, dacc, kacc}
- end
-
-defmodule String.Casing do
- @moduledoc false
-
- special_path = Path.join(__DIR__, "SpecialCasing.txt")
-
- codes = Enum.reduce File.stream!(special_path), codes, fn line, acc ->
- [codepoint, lower, title, upper, _] = :binary.split(line, "; ", [:global])
- key = to_binary.(codepoint)
- :lists.keystore(key, 1, acc, {key,
- to_binary.(upper),
- to_binary.(lower),
- to_binary.(title)})
- end
-
- # Downcase
-
- def downcase(string), do: downcase(string, "")
-
- for {codepoint, _upper, lower, _title} <- codes, lower && lower != codepoint do
- defp downcase(unquote(codepoint) <> rest, acc) do
- downcase(rest, acc <> unquote(lower))
- end
- end
-
- defp downcase(<<char, rest::binary>>, acc) do
- downcase(rest, <<acc::binary, char>>)
- end
-
- defp downcase("", acc), do: acc
-
- # Upcase
-
- def upcase(string), do: upcase(string, "")
-
- for {codepoint, upper, _lower, _title} <- codes, upper && upper != codepoint do
- defp upcase(unquote(codepoint) <> rest, acc) do
- upcase(rest, acc <> unquote(upper))
- end
- end
-
- defp upcase(<<char, rest::binary>>, acc) do
- upcase(rest, <<acc::binary, char>>)
- end
-
- defp upcase("", acc), do: acc
-
- # Titlecase once
-
- def titlecase_once(""), do: {"", ""}
-
- for {codepoint, _upper, _lower, title} <- codes, title && title != codepoint do
- def titlecase_once(unquote(codepoint) <> rest) do
- {unquote(title), rest}
- end
- end
-
- def titlecase_once(<<char::utf8, rest::binary>>) do
- {<<char::utf8>>, rest}
- end
-
- def titlecase_once(<<char, rest::binary>>) do
- {<<char>>, rest}
- end
-end
-
-defmodule String.Break do
- @moduledoc false
- @whitespace_max_size 3
-
- prop_path = Path.join(__DIR__, "WhiteSpace.txt")
-
- whitespace = Enum.reduce File.stream!(prop_path), [], fn line, acc ->
- case line |> :binary.split(";") |> hd do
- <<first::4-bytes, "..", last::4-bytes, _::binary>> ->
- first = String.to_integer(first, 16)
- last = String.to_integer(last, 16)
- Enum.map(first..last, fn int -> <<int::utf8>> end) ++ acc
- <<single::4-bytes, _::binary>> ->
- [<<String.to_integer(single, 16)::utf8>> | acc]
- end
- end
-
- # trim_leading
-
- for codepoint <- whitespace do
- def trim_leading(unquote(codepoint) <> rest), do: trim_leading(rest)
- end
- def trim_leading(""), do: ""
- def trim_leading(string) when is_binary(string), do: string
-
- # trim_trailing
-
- for codepoint <- whitespace do
- # We need to increment @whitespace_max_size as well
- # as the small table (_s) if we add a new entry here.
- case byte_size(codepoint) do
- 3 ->
- defp do_trim_trailing_l(unquote(codepoint)), do: -3
- 2 ->
- defp do_trim_trailing_l(<<_, unquote(codepoint)>>), do: -2
-
- defp do_trim_trailing_s(unquote(codepoint)), do: <<>>
- 1 ->
- defp do_trim_trailing_l(<<unquote(codepoint), unquote(codepoint), unquote(codepoint)>>), do: -3
- defp do_trim_trailing_l(<<_, unquote(codepoint), unquote(codepoint)>>), do: -2
- defp do_trim_trailing_l(<<_, _, unquote(codepoint)>>), do: -1
-
- defp do_trim_trailing_s(<<x, unquote(codepoint)>>), do: do_trim_trailing_s(<<x>>)
- defp do_trim_trailing_s(unquote(codepoint)), do: <<>>
- end
- end
-
- defp do_trim_trailing_l(_), do: 0
- defp do_trim_trailing_s(o), do: o
-
- def trim_trailing(string) when is_binary(string) do
- trim_trailing(string, byte_size(string))
- end
-
- defp trim_trailing(string, size) when size < @whitespace_max_size do
- do_trim_trailing_s(string)
- end
-
- defp trim_trailing(string, size) do
- trail = binary_part(string, size, -@whitespace_max_size)
- case do_trim_trailing_l(trail) do
- 0 -> string
- x -> trim_trailing(binary_part(string, 0, size + x), size + x)
- end
- end
-
- # Split
-
- def split(string) do
- for piece <- :binary.split(string, unquote(whitespace -- non_breakable), [:global]),
- piece != "",
- do: piece
- end
-
- # Decompose
-
- def decompose(entries, map) do
- for entry <- entries do
- case map do
- %{^entry => match} -> decompose(match, map)
- %{} -> <<entry::utf8>>
- end
- end
- end
-end
-
-defmodule String.Normalizer do
- @moduledoc false
-
- exclusions_path = Path.join(__DIR__, "CompositionExclusions.txt")
-
- compositions = Enum.reduce File.stream!(exclusions_path), decompositions, fn
- <<h, _::binary>> = line, acc when h in ?0..?9 or h in ?A..?F ->
- [codepoint, _] = :binary.split(line, " ")
- Map.delete(acc, String.to_integer(codepoint, 16))
- _, acc ->
- acc
- end
-
- # Normalize
-
- def normalize(string, :nfd) when is_binary(string) do
- normalize_nfd(string, "")
- end
-
- def normalize(string, :nfc) when is_binary(string) do
- normalize_nfc(string, "")
- end
-
- defp normalize_nfd("", acc), do: acc
-
- defp normalize_nfd(<<cp::utf8, rest::binary>>, acc) when cp in 0xAC00..0xD7A3 do
- {syllable_index, t_count, n_count} = {cp - 0xAC00, 28, 588}
- lead = 0x1100 + div(syllable_index, n_count)
- vowel = 0x1161 + div(rem(syllable_index, n_count), t_count)
- trail = 0x11A7 + rem(syllable_index, t_count)
- binary =
- if trail == 0x11A7 do
- <<lead::utf8, vowel::utf8>>
- else
- <<lead::utf8, vowel::utf8, trail::utf8>>
- end
- normalize_nfd(rest, acc <> binary)
- end
-
- defp normalize_nfd(binary, acc) do
- {n, rest} = String.Unicode.next_grapheme_size(binary)
- part = :binary.part(binary, 0, n)
- case n do
- 1 -> normalize_nfd(rest, acc <> part)
- _ -> normalize_nfd(rest, acc <> canonical_order(part, []))
- end
- end
-
- defp normalize_nfc("", acc), do: acc
-
- defp normalize_nfc(<<cp::utf8, rest::binary>>, acc) when cp in 0xAC00..0xD7A3 do
- normalize_nfc(rest, acc <> <<cp::utf8>>)
- end
-
- defp normalize_nfc(binary, acc) do
- {n, rest} = String.Unicode.next_grapheme_size(binary)
- part = :binary.part(binary, 0, n)
- case n do
- 1 -> normalize_nfc(rest, acc <> part)
- _ -> normalize_nfc(rest, acc <> compose(normalize_nfd(part, "")))
- end
- end
-
- for {cp, decomposition} <- decompositions do
- decomposition =
- decomposition
- |> String.Break.decompose(decompositions)
- |> IO.iodata_to_binary()
-
- defp canonical_order(unquote(<<cp::utf8>>) <> rest, acc) do
- canonical_order(unquote(decomposition) <> rest, acc)
- end
- end
- defp canonical_order(<<h::utf8, t::binary>>, acc) do
- case combining_class(h) do
- 0 -> canonical_order(acc) <> canonical_order(t, [{h, 0}])
- n -> canonical_order(t, [{h, n} | acc])
- end
- end
- defp canonical_order(<<>>, acc) do
- canonical_order(acc)
- end
-
- defp canonical_order([{x, _}]) do
- <<x::utf8>>
- end
- defp canonical_order(acc) do
- :lists.keysort(2, Enum.reverse(acc))
- |> Enum.map(&<<elem(&1, 0)::utf8>>)
- |> IO.iodata_to_binary
- end
-
- for {codepoint, class} <- combining_classes do
- defp combining_class(unquote(codepoint)), do: unquote(class)
- end
-
- defp combining_class(_), do: 0
-
- defp compose(<<lead::utf8, vowel::utf8, rest::binary>>) when lead in 0x1100..0x1112 and vowel in 0x1161..0x1175 do
- codepoint = 0xAC00 + ((lead - 0x1100) * 588) + ((vowel - 0x1161) * 28)
- case rest do
- <<trail::utf8, accents::binary>> when trail in 0x11A7..0x11C2 ->
- <<codepoint + trail - 0x11A7::utf8, accents::binary>>
- _ ->
- <<codepoint::utf8, rest::binary>>
- end
- end
-
- defp compose(binary) do
- compose_one(binary) || (
- <<cp::utf8, rest::binary>> = binary
- compose_many(rest, <<cp::utf8>>, "", combining_class(cp) - 1)
- )
- end
-
- defp compose_many("", base, accents, _), do: base <> accents
-
- defp compose_many(<<cp::utf8, rest::binary>>, base, accents, last_class) do
- part_class = combining_class(cp)
- combined = <<base::binary, cp::utf8>>
- if composed = (last_class < part_class && compose_one(combined)) do
- compose_many(rest, composed, accents, last_class)
- else
- compose_many(rest, base, <<accents::binary, cp::utf8>>, part_class)
- end
- end
-
- # Compositions:
- # 1. We must exclude compositions with a single codepoint
- # 2. We must exclude compositions that do not start with 0 combining class
- for {cp, [fst, snd]} <- compositions,
- Map.get(combining_classes, fst, 0) == 0 do
- defp compose_one(unquote(<<fst::utf8, snd::utf8>>)), do: unquote(<<cp::utf8>>)
- end
-
- defp compose_one(_), do: nil
-end