summaryrefslogtreecommitdiff
path: root/lib/elixir/unicode/graphemes_test.exs
blob: a7d39089115403f2a559ecbcc744329e8daa2d2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
defmodule GraphemesTest do
  @moduledoc false

  # Script-style checker: compares `String.graphemes/1` against the expected
  # segmentations listed in the `GraphemeBreakTest.txt` file that sits in the
  # same directory as this script.

  # Entry point. Announces the run, processes every test line and prints the
  # total number of mismatches found.
  def run do
    IO.puts("Running GraphemeBreakTest.txt")
    failures = run_grapheme_break()
    IO.puts("Got #{failures} failures")
  end

  # Streams the data file line by line and returns the number of failing lines.
  defp run_grapheme_break do
    __DIR__
    |> Path.join("GraphemeBreakTest.txt")
    |> File.stream!()
    # Only lines that begin with the break marker are test cases.
    |> Stream.filter(&String.starts_with?(&1, "÷"))
    # Lines mentioning D800 are skipped (presumably because a surrogate
    # code point cannot be encoded as UTF-8 - confirm against the data file).
    |> Stream.reject(&String.contains?(&1, "D800"))
    |> Enum.reduce(0, &check_line/2)
  end

  # Checks a single test line. Returns the failure count unchanged when the
  # computed graphemes agree with the expectation, otherwise reports the
  # mismatch and returns the incremented count.
  defp check_line(line, failures) do
    # Everything after the first "#" is a trailing comment.
    [spec | _comment] = String.split(line, "#", parts: 2)
    {input, expected} = parse_grapheme_break(spec)

    case String.graphemes(input) do
      ^expected ->
        failures

      actual ->
        number = failures + 1
        report_failure(number, line, input, expected, actual)
        number
    end
  end

  # Prints a human-readable report for one failing test line.
  defp report_failure(number, line, input, expected, actual) do
    IO.puts("""
    ============== Failure ##{number} ==============

        String.graphemes(#{inspect(input)})

    must be:

        #{inspect(expected)}

    got:

        #{inspect(actual)}

    On line:

        #{line}
    """)
  end

  # Converts the test specification (hex code points separated by "÷" for a
  # break and "×" for no break) into `{full_string, expected_graphemes}`.
  defp parse_grapheme_break(string) do
    trimmed = String.trim(string)
    without_leading = String.trim_leading(trimmed, "÷ ")
    body = String.trim_trailing(without_leading, " ÷")

    # Each " ÷ "-separated segment is exactly one expected grapheme.
    graphemes =
      body
      |> String.split(" ÷ ")
      |> Enum.map(&breaks_to_grapheme/1)

    {Enum.join(graphemes), graphemes}
  end

  # Builds one grapheme from its " × "-separated hexadecimal code points.
  defp breaks_to_grapheme(string) do
    string
    |> String.split(" × ")
    |> Enum.map(fn hex -> <<String.to_integer(hex, 16)::utf8>> end)
    |> IO.iodata_to_binary()
  end
end

# Script entry point: executing this .exs file runs the grapheme break checks
# immediately and prints the results to standard output.
GraphemesTest.run()